Compare commits

...

8 Commits

Author  SHA1        Message                                                    Date
xxh     a52cdff013  add no gui, no realtime mode, and train bash script        2026-03-21 08:53:31 -04:00
xxh     ec8b648a3b  stand stable 0.1 amendment and add some info               2026-03-21 06:46:55 -04:00
xxh     f99fae68f6  change model and policy                                    2026-03-14 01:08:22 -04:00
xxh     294fe0bd79  restore                                                    2026-03-13 21:44:59 -04:00
xxh     cf80becd17  restore                                                    2026-03-13 21:38:44 -04:00
xxh     6ab356a947  improve train speed and add speed constrain                2026-03-13 08:51:49 -04:00
        3a42120857  Revert "improve training speed and add speed constrain"   2026-03-13 08:43:28 -04:00
                    (This reverts commit 648cf32e9c.)
        648cf32e9c  improve training speed and add speed constrain             2026-03-13 08:40:50 -04:00
9 changed files with 704 additions and 412 deletions

.gitignore

@@ -10,3 +10,13 @@ poetry.toml
 **/log/
 *.spec
 dist/
+*steps.zip
+*.pkl
+best_model.zip
+*.csv
+*.npz
+*.xml
+*.json
+*.yaml
+*.iml
+*.TXT

command.md (new file)

@@ -0,0 +1,14 @@
+Train (default)
+bash train.sh
+Test (real-time, with rendering)
+GYM_CPU_MODE=test GYM_CPU_TEST_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip GYM_CPU_TEST_FOLDER=scripts/gyms/logs/Walk_R0_005/ GYM_CPU_TEST_NO_RENDER=0 GYM_CPU_TEST_NO_REALTIME=0 bash train.sh
+Test (headless, non-real-time)
+GYM_CPU_MODE=test GYM_CPU_TEST_NO_RENDER=1 GYM_CPU_TEST_NO_REALTIME=1 bash train.sh
+Retrain (resume training)
+GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh
+Retrain with modified training hyperparameters
+GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh
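As a reference for the commands above, a minimal sketch of how the GYM_CPU_* variables are consumed on the Python side, abridged from the Walk.py diff below (train.sh itself is not shown in this compare and is assumed to simply forward the environment to Python):

import os

# Mode switch read in Walk.py's __main__ block (see the Walk.py diff below).
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()

# Training hyperparameters picked up inside Train.train() (see below).
learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512"))
n_epochs = int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5"))
print(run_mode, learning_rate, minibatch_size, n_epochs)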


@@ -1,5 +1,6 @@
 import logging
 import socket
+import time
 from select import select
 from communication.world_parser import WorldParser
@@ -10,15 +11,27 @@ class Server:
     def __init__(self, host: str, port: int, world_parser: WorldParser):
         self.world_parser: WorldParser = world_parser
         self.__host: str = host
-        self.__port: str = port
-        self.__socket: socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        self.__socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
+        self.__port: int = port
+        self.__socket: socket.socket = self._create_socket()
         self.__send_buff = []
         self.__rcv_buffer_size = 1024
         self.__rcv_buffer = bytearray(self.__rcv_buffer_size)

+    def _create_socket(self) -> socket.socket:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
+        return sock
+
     def connect(self) -> None:
         logger.info("Connecting to server at %s:%d...", self.__host, self.__port)
+        # Always reconnect with a fresh socket object.
+        try:
+            self.__socket.close()
+        except OSError:
+            pass
+        self.__socket = self._create_socket()
         while True:
             try:
                 self.__socket.connect((self.__host, self.__port))
@@ -27,12 +40,19 @@ class Server:
                 logger.error(
                     "Connection refused. Make sure the server is running and listening on {self.__host}:{self.__port}."
                 )
+                time.sleep(0.05)
         logger.info(f"Server connection established to {self.__host}:{self.__port}.")

     def shutdown(self) -> None:
-        self.__socket.close()
-        self.__socket.shutdown(socket.SHUT_RDWR)
+        try:
+            self.__socket.shutdown(socket.SHUT_RDWR)
+        except OSError:
+            pass
+        try:
+            self.__socket.close()
+        except OSError:
+            pass

     def send_immediate(self, msg: str) -> None:
         """
@@ -72,37 +92,37 @@ class Server:
         self.commit(msg)
         self.send()

-    def receive(self) -> None:
-        """
-        Receive the next message from the TCP/IP socket and updates world
-        """
-        # Receive message length information
-        if (
-            self.__socket.recv_into(
-                self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL
-            )
-            != 4
-        ):
-            raise ConnectionResetError
-        msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False)
-        # Ensure receive buffer is large enough to hold the message
-        if msg_size > self.__rcv_buffer_size:
-            self.__rcv_buffer_size = msg_size
-            self.__rcv_buffer = bytearray(self.__rcv_buffer_size)
-        # Receive message with the specified length
-        if (
-            self.__socket.recv_into(
-                self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL
-            )
-            != msg_size
-        ):
-            raise ConnectionResetError
-        self.world_parser.parse(message=self.__rcv_buffer[:msg_size].decode())
+    def receive(self):
+        while True:
+            if (
+                self.__socket.recv_into(
+                    self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL
+                ) != 4
+            ):
+                raise ConnectionResetError
+            msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False)
+            if msg_size > self.__rcv_buffer_size:
+                self.__rcv_buffer_size = msg_size
+                self.__rcv_buffer = bytearray(self.__rcv_buffer_size)
+            if (
+                self.__socket.recv_into(
+                    self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL
+                ) != msg_size
+            ):
+                raise ConnectionResetError
+            self.world_parser.parse(
+                message=self.__rcv_buffer[:msg_size].decode()
+            )
+            # Exit once the socket has no more buffered data
+            if len(select([self.__socket], [], [], 0.0)[0]) == 0:
+                break

     def commit_beam(self, pos2d: list, rotation: float) -> None:
         assert len(pos2d) == 2
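The rewritten receive() above drains a simple length-prefixed wire format: a 4-byte big-endian unsigned length header followed by the payload. A standalone sketch of that framing (send_prefixed/recv_prefixed are illustrative helpers, not part of the codebase):

import socket
import struct

def send_prefixed(sock: socket.socket, payload: bytes) -> None:
    # Prepend the 4-byte big-endian length header, then the payload.
    sock.sendall(struct.pack(">I", len(payload)) + payload)

def recv_prefixed(sock: socket.socket) -> bytes:
    # Mirror of receive(): read exactly 4 header bytes, then the body.
    header = sock.recv(4, socket.MSG_WAITALL)
    if len(header) != 4:
        raise ConnectionResetError
    size = int.from_bytes(header, byteorder="big", signed=False)
    body = sock.recv(size, socket.MSG_WAITALL)
    if len(body) != size:
        raise ConnectionResetError
    return body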


@@ -1,9 +1,10 @@
 import subprocess
 import os
+import time

 class Server():
-    def __init__(self, first_server_p, first_monitor_p, n_servers) -> None:
+    def __init__(self, first_server_p, first_monitor_p, n_servers, no_render=True, no_realtime=True) -> None:
         try:
             import psutil
             self.check_running_servers(psutil, first_server_p, first_monitor_p, n_servers)
@@ -17,30 +18,50 @@ class Server():
         # makes it easier to kill test servers without affecting train servers
         cmd = "rcssservermj"

+        render_arg = "--no-render" if no_render else ""
+        realtime_arg = "--no-realtime" if no_realtime else ""
+
         for i in range(n_servers):
-            self.rcss_processes.append(
-                subprocess.Popen((f"{cmd} --aport {first_server_p+i} --mport {first_monitor_p+i}").split(),
-                                 stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, start_new_session=True)
-            )
+            port = first_server_p + i
+            mport = first_monitor_p + i
+            server_cmd = f"{cmd} -c {port} -m {mport} {render_arg} {realtime_arg}".strip()
+            proc = subprocess.Popen(
+                server_cmd.split(),
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.STDOUT,
+                start_new_session=True
+            )
+            # Avoid startup storm when launching many servers at once.
+            time.sleep(0.03)
+            rc = proc.poll()
+            if rc is not None:
+                raise RuntimeError(
+                    f"rcssservermj exited early (code={rc}) on server port {port}, monitor port {mport}"
+                )
+            self.rcss_processes.append(proc)

     def check_running_servers(self, psutil, first_server_p, first_monitor_p, n_servers):
         ''' Check if any server is running on chosen ports '''
         found = False
         p_list = [p for p in psutil.process_iter() if p.cmdline() and "rcssservermj" in " ".join(p.cmdline())]
         range1 = (first_server_p, first_server_p + n_servers)
-        range2 = (first_monitor_p,first_monitor_p + n_servers)
+        range2 = (first_monitor_p, first_monitor_p + n_servers)
         bad_processes = []

         for p in p_list:
             # currently ignoring remaining default port when only one of the ports is specified (uncommon scenario)
             ports = [int(arg) for arg in p.cmdline()[1:] if arg.isdigit()]
             if len(ports) == 0:
-                ports = [60000,60100] # default server ports (changing this is unlikely)
+                ports = [60000, 60100]  # default server ports (changing this is unlikely)
-            conflicts = [str(port) for port in ports if (
-                (range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]) )]
+            conflicts = [str(port) for port in ports if (
+                (range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]))]
-            if len(conflicts)>0:
+            if len(conflicts) > 0:
                 if not found:
                     print("\nThere are already servers running on the same port(s)!")
                     found = True
@@ -56,7 +77,6 @@ class Server():
                 p.kill()
         return

     def kill(self):
         for p in self.rcss_processes:
             p.kill()
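The check_running_servers() scan above relies on psutil's process iteration. A minimal standalone sketch of the same idea (find_rcss_ports is a hypothetical helper; assumes psutil is installed):

import psutil

def find_rcss_ports(name: str = "rcssservermj") -> list:
    """Return the numeric cmdline arguments of every matching process."""
    ports = []
    for p in psutil.process_iter():
        try:
            cmd = p.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue  # process vanished or is not inspectable
        if cmd and name in " ".join(cmd):
            ports.extend(int(a) for a in cmd[1:] if a.isdigit())
    return ports

print(find_rcss_ports())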


@@ -31,9 +31,9 @@ class Train_Base():
         args = script.args
         self.script = script
         self.ip = args.i
         self.server_p = args.p  # (initial) server port
         self.monitor_p = args.m + 100  # monitor port when testing
         self.monitor_p_1000 = args.m + 1100  # initial monitor port when training
         self.robot_type = args.r
         self.team = args.t
         self.uniform = args.u
@@ -41,21 +41,19 @@ class Train_Base():
         self.cf_delay = 0
         # self.cf_target_period = World.STEPTIME  # target simulation speed while testing (default: real-time)

     @staticmethod
     def prompt_user_for_model(self):
-        gyms_logs_path = "./mujococodebase/scripts/gyms/logs/"
+        gyms_logs_path = "./scripts/gyms/logs/"
         folders = [f for f in listdir(gyms_logs_path) if isdir(join(gyms_logs_path, f))]
         folders.sort(key=lambda f: os.path.getmtime(join(gyms_logs_path, f)), reverse=True)  # sort by modification date

         while True:
             try:
-                folder_name = UI.print_list(folders,prompt="Choose folder (ctrl+c to return): ")[1]
+                folder_name = UI.print_list(folders, prompt="Choose folder (ctrl+c to return): ")[1]
             except KeyboardInterrupt:
                 print()
                 return None  # ctrl+c

             folder_dir = os.path.join(gyms_logs_path, folder_name)
             models = [m[:-4] for m in listdir(folder_dir) if isfile(join(folder_dir, m)) and m.endswith(".zip")]
@@ -64,16 +62,17 @@ class Train_Base():
                 print("The chosen folder does not contain any .zip file!")
                 continue

-            models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m+".zip")), reverse=True) # sort by modification date
+            models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m + ".zip")),
+                        reverse=True)  # sort by modification date

             try:
-                model_name = UI.print_list(models,prompt="Choose model (ctrl+c to return): ")[1]
+                model_name = UI.print_list(models, prompt="Choose model (ctrl+c to return): ")[1]
                 break
             except KeyboardInterrupt:
                 print()

-        return {"folder_dir":folder_dir, "folder_name":folder_name, "model_file":os.path.join(folder_dir, model_name+".zip")}
+        return {"folder_dir": folder_dir, "folder_name": folder_name,
+                "model_file": os.path.join(folder_dir, model_name + ".zip")}

     # def control_fps(self, read_input = False):
     #     ''' Add delay to control simulation speed '''
@@ -108,8 +107,8 @@ class Train_Base():
     #     else:
     #         self.cf_delay = 0

-    def test_model(self, model:BaseAlgorithm, env, log_path:str=None, model_path:str=None, max_episodes=0, enable_FPS_control=True, verbose=1):
+    def test_model(self, model: BaseAlgorithm, env, log_path: str = None, model_path: str = None, max_episodes=0,
+                   enable_FPS_control=True, verbose=1):
         '''
         Test model and log results
@@ -152,7 +151,7 @@ class Train_Base():
                 f.write("reward,ep. length,rew. cumulative avg., ep. len. cumulative avg.\n")
             print("Train statistics are saved to:", log_path)

         if enable_FPS_control:  # control simulation speed (using non blocking user input)
             print("\nThe simulation speed can be changed by sending a non-negative integer\n"
                   "(e.g. '50' sets speed to 50%, '0' pauses the simulation, '' sets speed to MAX)\n")
@@ -172,8 +171,8 @@ class Train_Base():
             ep_reward += reward
             ep_length += 1

-            if enable_FPS_control:  # control simulation speed (using non blocking user input)
-                self.control_fps(select.select([sys.stdin], [], [], 0)[0])
+            # if enable_FPS_control:  # control simulation speed (using non blocking user input)
+            #     self.control_fps(select.select([sys.stdin], [], [], 0)[0])

             if done:
                 obs, _ = env.reset()
@@ -182,12 +181,14 @@ class Train_Base():
                 reward_max = max(ep_reward, reward_max)
                 reward_min = min(ep_reward, reward_min)
                 ep_no += 1
-                avg_ep_lengths = ep_lengths_sum/ep_no
-                avg_rewards = rewards_sum/ep_no
+                avg_ep_lengths = ep_lengths_sum / ep_no
+                avg_rewards = rewards_sum / ep_no

                 if verbose > 0:
-                    print( f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n",
-                        end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})", flush=True)
+                    print(
+                        f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n",
+                        end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})",
+                        flush=True)

                 if log_path is not None:
                     with open(log_path, 'a') as f:
@@ -200,7 +201,8 @@ class Train_Base():
                 ep_reward = 0
                 ep_length = 0

-    def learn_model(self, model:BaseAlgorithm, total_steps:int, path:str, eval_env=None, eval_freq=None, eval_eps=5, save_freq=None, backup_env_file=None, export_name=None):
+    def learn_model(self, model: BaseAlgorithm, total_steps: int, path: str, eval_env=None, eval_freq=None, eval_eps=5,
+                    save_freq=None, backup_env_file=None, export_name=None):
         '''
         Learn Model for a specific number of time steps
@@ -251,7 +253,7 @@ class Train_Base():
         # If path already exists, add suffix to avoid overwriting
         if os.path.isdir(path):
             for i in count():
-                p = path.rstrip("/")+f'_{i:03}/'
+                p = path.rstrip("/") + f'_{i:03}/'
                 if not os.path.isdir(p):
                     path = p
                     break
@@ -265,22 +267,28 @@ class Train_Base():
         evaluate = bool(eval_env is not None and eval_freq is not None)

         # Create evaluation callback
-        eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq, log_path=path,
-            best_model_save_path=path, deterministic=True, render=False)
+        eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq,
+                                                               log_path=path,
+                                                               best_model_save_path=path, deterministic=True,
+                                                               render=False)

         # Create custom callback to display evaluations
-        custom_callback = None if not evaluate else Cyclic_Callback(eval_freq, lambda:self.display_evaluations(path,True))
+        custom_callback = None if not evaluate else Cyclic_Callback(eval_freq,
+                                                                    lambda: self.display_evaluations(path, True))

         # Create checkpoint callback
-        checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path, name_prefix="model", verbose=1)
+        checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path,
+                                                                                name_prefix="model", verbose=1)

         # Create custom callback to export checkpoint models
-        export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path, export_name)
+        export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path,
+                                                                                                export_name)

-        callbacks = CallbackList([c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None])
+        callbacks = CallbackList(
+            [c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None])

-        model.learn( total_timesteps=total_steps, callback=callbacks )
-        model.save( os.path.join(path, "last_model") )
+        model.learn(total_timesteps=total_steps, callback=callbacks)
+        model.save(os.path.join(path, "last_model"))

         # Display evaluations if they exist
         if evaluate:
@@ -288,7 +296,7 @@ class Train_Base():
         # Display timestamps + Model path
         end_date = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
-        duration = timedelta(seconds=int(time.time()-start))
+        duration = timedelta(seconds=int(time.time() - start))
         print(f"Train start: {start_date}")
         print(f"Train end: {end_date}")
         print(f"Train duration: {duration}")
@@ -298,8 +306,8 @@ class Train_Base():
         if backup_env_file is not None:
             with open(backup_file, 'a') as f:
                 f.write(f"\n# Train start: {start_date}\n")
-                f.write( f"# Train end: {end_date}\n")
-                f.write( f"# Train duration: {duration}")
+                f.write(f"# Train end: {end_date}\n")
+                f.write(f"# Train duration: {duration}")

         return path
@@ -318,50 +326,57 @@ class Train_Base():
         with np.load(eval_npz) as data:
             time_steps = data["timesteps"]
-            results_raw = np.mean(data["results"],axis=1)
-            ep_lengths_raw = np.mean(data["ep_lengths"],axis=1)
+            results_raw = np.mean(data["results"], axis=1)
+            ep_lengths_raw = np.mean(data["ep_lengths"], axis=1)
         sample_no = len(results_raw)

-        xvals = np.linspace(0, sample_no-1, 80)
+        xvals = np.linspace(0, sample_no - 1, 80)
         results = np.interp(xvals, range(sample_no), results_raw)
         ep_lengths = np.interp(xvals, range(sample_no), ep_lengths_raw)

         results_limits = np.min(results), np.max(results)
         ep_lengths_limits = np.min(ep_lengths), np.max(ep_lengths)

-        results_discrete = np.digitize(results, np.linspace(results_limits[0]-1e-5, results_limits[1]+1e-5, console_height+1))-1
-        ep_lengths_discrete = np.digitize(ep_lengths, np.linspace(0, ep_lengths_limits[1]+1e-5, console_height+1))-1
+        results_discrete = np.digitize(results, np.linspace(results_limits[0] - 1e-5, results_limits[1] + 1e-5,
+                                                            console_height + 1)) - 1
+        ep_lengths_discrete = np.digitize(ep_lengths,
+                                          np.linspace(0, ep_lengths_limits[1] + 1e-5, console_height + 1)) - 1

         matrix = np.zeros((console_height, console_width, 2), int)
-        matrix[results_discrete[0] ][0][0] = 1 # draw 1st column
-        matrix[ep_lengths_discrete[0]][0][1] = 1 # draw 1st column
+        matrix[results_discrete[0]][0][0] = 1  # draw 1st column
+        matrix[ep_lengths_discrete[0]][0][1] = 1  # draw 1st column
         rng = [[results_discrete[0], results_discrete[0]], [ep_lengths_discrete[0], ep_lengths_discrete[0]]]

         # Create continuous line for both plots
         for k in range(2):
-            for i in range(1,console_width):
+            for i in range(1, console_width):
                 x = [results_discrete, ep_lengths_discrete][k][i]
                 if x > rng[k][1]:
-                    rng[k] = [rng[k][1]+1, x]
+                    rng[k] = [rng[k][1] + 1, x]
                 elif x < rng[k][0]:
-                    rng[k] = [x, rng[k][0]-1]
+                    rng[k] = [x, rng[k][0] - 1]
                 else:
-                    rng[k] = [x,x]
+                    rng[k] = [x, x]
-                for j in range(rng[k][0],rng[k][1]+1):
+                for j in range(rng[k][0], rng[k][1] + 1):
                     matrix[j][i][k] = 1

-        print(f'{"-"*console_width}')
+        print(f'{"-" * console_width}')
         for l in reversed(range(console_height)):
             for c in range(console_width):
-                if np.all(matrix[l][c] == 0): print(end=" ")
-                elif np.all(matrix[l][c] == 1): print(end=symb_xo)
-                elif matrix[l][c][0] == 1: print(end=symb_x)
-                else: print(end=symb_o)
+                if np.all(matrix[l][c] == 0):
+                    print(end=" ")
+                elif np.all(matrix[l][c] == 1):
+                    print(end=symb_xo)
+                elif matrix[l][c][0] == 1:
+                    print(end=symb_x)
+                else:
+                    print(end=symb_o)
             print()
-        print(f'{"-"*console_width}')
+        print(f'{"-" * console_width}')
         print(f"({symb_x})-reward min:{results_limits[0]:11.2f} max:{results_limits[1]:11.2f}")
-        print(f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1]/1000:15.0f}k steps")
-        print(f'{"-"*console_width}')
+        print(
+            f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1] / 1000:15.0f}k steps")
+        print(f'{"-" * console_width}')

         # save CSV
         if save_csv:
@@ -370,8 +385,7 @@ class Train_Base():
             writer = csv.writer(f)
             if sample_no == 1:
                 writer.writerow(["time_steps", "reward ep.", "length"])
-            writer.writerow([time_steps[-1],results_raw[-1],ep_lengths_raw[-1]])
+            writer.writerow([time_steps[-1], results_raw[-1], ep_lengths_raw[-1]])

     # def generate_slot_behavior(self, path, slots, auto_head:bool, XML_name):
     #     '''
@@ -454,21 +468,21 @@ class Train_Base():
                 break

         model = PPO.load(input_file)
         weights = model.policy.state_dict()  # dictionary containing network layers
-        w = lambda name : weights[name].detach().cpu().numpy() # extract weights from policy
+        w = lambda name: weights[name].detach().cpu().numpy()  # extract weights from policy

         var_list = []
-        for i in count(0,2): # add hidden layers (step=2 because that's how SB3 works)
+        for i in count(0, 2):  # add hidden layers (step=2 because that's how SB3 works)
             if f"mlp_extractor.policy_net.{i}.bias" not in weights:
                 break
-            var_list.append([w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"])
+            var_list.append(
+                [w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"])
-        var_list.append( [w("action_net.bias"), w("action_net.weight"), "none"] ) # add final layer
+        var_list.append([w("action_net.bias"), w("action_net.weight"), "none"])  # add final layer

-        with open(output_file,"wb") as f:
-            pickle.dump(var_list, f, protocol=4) # protocol 4 is backward compatible with Python 3.4
+        with open(output_file, "wb") as f:
+            pickle.dump(var_list, f, protocol=4)  # protocol 4 is backward compatible with Python 3.4

 def print_list(data, numbering=True, prompt=None, divider=" | ", alignment="<", min_per_col=6):
     '''
@@ -503,58 +517,60 @@ class Train_Base():
     items = []
     items_len = []

-    #--------------------------------------------- Add numbers, margins and divider
+    # --------------------------------------------- Add numbers, margins and divider
     for i in range(data_size):
         number = f"{i}-" if numbering else ""
-        items.append( f"{divider}{number}{data[i]}" )
-        items_len.append( len(items[-1]) )
+        items.append(f"{divider}{number}{data[i]}")
+        items_len.append(len(items[-1]))

-    max_cols = np.clip((WIDTH+len(divider)) // min(items_len),1,math.ceil(data_size/max(min_per_col,1))) # width + len(divider) because it is not needed in last col
+    max_cols = np.clip((WIDTH + len(divider)) // min(items_len), 1, math.ceil(
+        data_size / max(min_per_col, 1)))  # width + len(divider) because it is not needed in last col

-    #--------------------------------------------- Check maximum number of columns, considering content width (min:1)
+    # --------------------------------------------- Check maximum number of columns, considering content width (min:1)
-    for i in range(max_cols,0,-1):
+    for i in range(max_cols, 0, -1):
         cols_width = []
         cols_items = []
         table_width = 0
-        a,b = divmod(data_size,i)
+        a, b = divmod(data_size, i)
         for col in range(i):
-            start = a*col + min(b,col)
-            end = start+a+(1 if col<b else 0)
-            cols_items.append( items[start:end] )
+            start = a * col + min(b, col)
+            end = start + a + (1 if col < b else 0)
+            cols_items.append(items[start:end])
             col_width = max(items_len[start:end])
-            cols_width.append( col_width )
+            cols_width.append(col_width)
             table_width += col_width
-        if table_width <= WIDTH+len(divider):
+        if table_width <= WIDTH + len(divider):
             break

     table_width -= len(divider)

-    #--------------------------------------------- Print columns
+    # --------------------------------------------- Print columns
-    print("="*table_width)
+    print("=" * table_width)
     for row in range(math.ceil(data_size / i)):
         for col in range(i):
-            content = cols_items[col][row] if len(cols_items[col]) > row else divider # print divider when there are no items
+            content = cols_items[col][row] if len(
+                cols_items[col]) > row else divider  # print divider when there are no items
             if col == 0:
                 l = len(divider)
-                print(end=f"{content[l:]:{alignment}{cols_width[col]-l}}") # remove divider from 1st col
+                print(end=f"{content[l:]:{alignment}{cols_width[col] - l}}")  # remove divider from 1st col
             else:
-                print(end=f"{content :{alignment}{cols_width[col] }}")
+                print(end=f"{content :{alignment}{cols_width[col]}}")
         print()
-    print("="*table_width)
+    print("=" * table_width)

-    #--------------------------------------------- Prompt
+    # --------------------------------------------- Prompt
     if prompt is None:
         return None

     if numbering is None:
         return None
     else:
-        idx = UI.read_int( prompt, 0, data_size )
+        idx = UI.read_int(prompt, 0, data_size)
         return idx, data[idx]

 class Cyclic_Callback(BaseCallback):
     ''' Stable baselines custom callback '''

     def __init__(self, freq, function):
         super(Cyclic_Callback, self).__init__(1)
         self.freq = freq
@@ -563,10 +579,12 @@ class Cyclic_Callback(BaseCallback):
     def _on_step(self) -> bool:
         if self.n_calls % self.freq == 0:
             self.function()
         return True  # If the callback returns False, training is aborted early

 class Export_Callback(BaseCallback):
     ''' Stable baselines custom callback '''

     def __init__(self, freq, load_path, export_name):
         super(Export_Callback, self).__init__(1)
         self.freq = freq
@@ -577,8 +595,7 @@ class Export_Callback(BaseCallback):
         if self.n_calls % self.freq == 0:
             path = os.path.join(self.load_path, f"model_{self.num_timesteps}_steps.zip")
             Train_Base.export_model(path, f"./scripts/gyms/export/{self.export_name}")
         return True  # If the callback returns False, training is aborted early
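learn_model() above composes several stable-baselines3 callbacks. A minimal self-contained sketch of that wiring on a toy Gymnasium task (paths and frequencies are illustrative only):

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

# Separate env instance for periodic evaluation, as in learn_model().
eval_env = gym.make("CartPole-v1")
eval_cb = EvalCallback(eval_env, n_eval_episodes=5, eval_freq=1000,
                       log_path="/tmp/demo", best_model_save_path="/tmp/demo",
                       deterministic=True, render=False)
ckpt_cb = CheckpointCallback(save_freq=1000, save_path="/tmp/demo",
                             name_prefix="model", verbose=1)

model = PPO("MlpPolicy", "CartPole-v1", device="cpu")
model.learn(total_timesteps=5000, callback=CallbackList([eval_cb, ckpt_cb]))
model.save("/tmp/demo/last_model")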

scripts/gyms/Walk.py (mode changed: Normal file → Executable file)

@@ -1,13 +1,14 @@
 import os
 import numpy as np
 import math
+import time
 from time import sleep
 from random import random
 from random import uniform
 from stable_baselines3 import PPO
-from stable_baselines3.common.vec_env import SubprocVecEnv
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
 import gymnasium as gym
 from gymnasium import spaces
@@ -28,34 +29,44 @@ Learn how to run forward using step primitive
 - class Train: implements algorithms to train a new model or test an existing model
 '''

 class WalkEnv(gym.Env):
     def __init__(self, ip, server_p) -> None:
         # Args: Server IP, Agent Port, Monitor Port, Uniform No., Robot Type, Team Name, Enable Log, Enable Draw
         self.Player = player = Base_Agent(
             team_name="Gym",
             number=1,
             host=ip,
             port=server_p
         )
         self.robot_type = self.Player.robot
         self.step_counter = 0  # to limit episode size
         self.force_play_on = True
         self.target_position = np.array([0.0, 0.0])  # target position in the x-y plane
         self.initial_position = np.array([0.0, 0.0])  # initial position in the x-y plane
         self.target_direction = 0.0  # target direction in the x-y plane (relative to the robot's orientation)
         self.isfallen = False
         self.waypoint_index = 0
         self.route_completed = False
         self.debug_every_n_steps = 5
+        self.enable_debug_joint_status = False
         self.calibrate_nominal_from_neutral = True
         self.auto_calibrate_train_sim_flip = True
         self.nominal_calibrated_once = False
         self.flip_calibrated_once = False
+        self._target_hz = 0.0
+        self._target_dt = 0.0
+        self._last_sync_time = None
+        target_hz_env = 0
+        if target_hz_env:
+            try:
+                self._target_hz = float(target_hz_env)
+            except ValueError:
+                self._target_hz = 0.0
+        if self._target_hz > 0.0:
+            self._target_dt = 1.0 / self._target_hz

         # State space
         # Raw observation size: 78
@@ -71,13 +82,12 @@ class WalkEnv(gym.Env):
         action_dim = len(self.Player.robot.ROBOT_MOTORS)
         self.no_of_actions = action_dim
         self.action_space = spaces.Box(
-            low=-1.0,
-            high=1.0,
+            low=-10.0,
+            high=10.0,
             shape=(action_dim,),
             dtype=np.float32
         )

         # Neutral pose
         self.joint_nominal_position = np.array(
             [
@@ -106,26 +116,25 @@ class WalkEnv(gym.Env):
                 0.0,
             ]
         )
-        self.reference_joint_nominal_position = self.joint_nominal_position.copy()
+        self.joint_nominal_position = np.zeros(self.no_of_actions)

         self.train_sim_flip = np.array(
             [
                 1.0,   # 0: Head_yaw (he1)
                 -1.0,  # 1: Head_pitch (he2)
                 1.0,   # 2: Left_Shoulder_Pitch (lae1)
                 -1.0,  # 3: Left_Shoulder_Roll (lae2)
-                1.0,   # 4: Left_Elbow_Pitch (lae3)
+                -1.0,  # 4: Left_Elbow_Pitch (lae3)
                 1.0,   # 5: Left_Elbow_Yaw (lae4)
                 -1.0,  # 6: Right_Shoulder_Pitch (rae1)
-                1.0,   # 7: Right_Shoulder_Roll (rae2)
+                -1.0,  # 7: Right_Shoulder_Roll (rae2)
                 1.0,   # 8: Right_Elbow_Pitch (rae3)
                 1.0,   # 9: Right_Elbow_Yaw (rae4)
                 1.0,   # 10: Waist (te1)
                 1.0,   # 11: Left_Hip_Pitch (lle1)
                 -1.0,  # 12: Left_Hip_Roll (lle2)
                 -1.0,  # 13: Left_Hip_Yaw (lle3)
                 1.0,   # 14: Left_Knee_Pitch (lle4)
                 1.0,   # 15: Left_Ankle_Pitch (lle5)
                 -1.0,  # 16: Left_Ankle_Roll (lle6)
                 -1.0,  # 17: Right_Hip_Pitch (rle1)
                 -1.0,  # 18: Right_Hip_Roll (rle2)
@@ -136,15 +145,51 @@ class WalkEnv(gym.Env):
             ]
         )

-        self.scaling_factor = 0.5
+        self.scaling_factor = 0.3
+        # self.scaling_factor = 1
+
+        # Small reset perturbations for robustness training.
+        self.enable_reset_perturb = True
+        self.reset_beam_yaw_range_deg = 180  # randomize target direction fully to encourage learning a real walk instead of a fixed gait
+        self.reset_joint_noise_rad = 0.015
+        self.reset_perturb_steps = 3
+        self.reset_recover_steps = 8

         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Track previous position

         self.Player.server.connect()
-        sleep(2.0)  # Longer wait for connection to establish completely
+        # sleep(2.0)  # Longer wait for connection to establish completely
         self.Player.server.send_immediate(
             f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})"
         )
+        self.start_time = time.time()
+
+    def _reconnect_server(self):
+        try:
+            self.Player.server.shutdown()
+        except Exception:
+            pass
+        self.Player.server.connect()
+        self.Player.server.send_immediate(
+            f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})"
+        )
+
+    def _safe_receive_world_update(self, retries=1):
+        last_exc = None
+        for attempt in range(retries + 1):
+            try:
+                self.Player.server.receive()
+                self.Player.world.update()
+                return
+            except (ConnectionResetError, OSError) as exc:
+                last_exc = exc
+                if attempt >= retries:
+                    raise
+                self._reconnect_server()
+        if last_exc is not None:
+            raise last_exc

     def debug_log(self, message):
         print(message)
@@ -155,46 +200,6 @@ class WalkEnv(gym.Env):
         except OSError:
             pass

-    def calibrate_train_sim_flip_from_neutral(self, neutral_joint_positions):
-        updated_flip = self.train_sim_flip.copy()
-        changed = []
-        for idx, (reference_value, observed_value) in enumerate(
-            zip(self.reference_joint_nominal_position, neutral_joint_positions)
-        ):
-            if idx >= 10:
-                continue
-            if abs(reference_value) < 0.15 or abs(observed_value) < 0.15:
-                continue
-            inferred_flip = 1.0 if np.sign(reference_value) == np.sign(observed_value) else -1.0
-            if updated_flip[idx] != inferred_flip:
-                changed.append((idx, updated_flip[idx], inferred_flip))
-                updated_flip[idx] = inferred_flip
-        self.train_sim_flip = updated_flip
-        if changed:
-            self.debug_log(
-                "[FlipDebug] "
-                f"changes={[(idx, old, new) for idx, old, new in changed]}"
-            )
-
-    def is_reliable_neutral_pose(self, neutral_joint_positions):
-        leg_positions = neutral_joint_positions[11:]
-        leg_norm = float(np.linalg.norm(leg_positions))
-        leg_max = float(np.max(np.abs(leg_positions)))
-        height = float(self.Player.world.global_position[2])
-        reliable = (
-            leg_norm > 0.8
-            and leg_max > 0.35
-            and 0.12 < height < 0.8
-        )
-        return reliable, leg_norm, leg_max, height
-
     def observe(self, init=False):
         """Get the current observation"""
@@ -240,7 +245,6 @@ class WalkEnv(gym.Env):
         orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv()
         projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
-
         # Combine observations
         observation = np.concatenate([
             qpos_qvel_previous_action,
@@ -249,17 +253,25 @@ class WalkEnv(gym.Env):
             projected_gravity,
         ])

         observation = np.clip(observation, -10.0, 10.0)
         return observation.astype(np.float32)

     def sync(self):
         ''' Run a single simulation step '''
-        self.Player.server.receive()
-        self.Player.world.update()
+        self._safe_receive_world_update(retries=1)
         self.Player.robot.commit_motor_targets_pd()
         self.Player.server.send()
+        if self._target_dt > 0.0:
+            now = time.time()
+            if self._last_sync_time is None:
+                self._last_sync_time = now
+                return
+            elapsed = now - self._last_sync_time
+            remaining = self._target_dt - elapsed
+            if remaining > 0.0:
+                time.sleep(remaining)
+                now = time.time()
+            self._last_sync_time = now

     def debug_joint_status(self):
         robot = self.Player.robot
@@ -284,6 +296,7 @@ class WalkEnv(gym.Env):
             f"err_norm={float(np.linalg.norm(joint_error)):.4f} "
             f"fallen={self.Player.world.global_position[2] < 0.3}"
         )
+        print(f"waist target={target_joint_positions[10]:.3f}, actual={actual_joint_positions[10]:.3f}")

     def reset(self, seed=None, options=None):
         '''
@@ -295,83 +308,82 @@ class WalkEnv(gym.Env):
         if seed is not None:
             np.random.seed(seed)

-        length1 = np.random.uniform(10, 20)  # randomize target distance
-        length2 = np.random.uniform(10, 20)  # randomize target distance
-        length3 = np.random.uniform(10, 20)  # randomize target distance
+        length1 = 2  # randomize target distance
+        length2 = np.random.uniform(0.6, 1)  # randomize target distance
+        length3 = np.random.uniform(0.6, 1)  # randomize target distance
         angle2 = np.random.uniform(-30, 30)  # randomize initial orientation
         angle3 = np.random.uniform(-30, 30)  # randomize target direction

         self.step_counter = 0
         self.waypoint_index = 0
         self.route_completed = False
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Initialize for first step
         self.walk_cycle_step = 0

         # Randomize the beam target position and heading for training diversity
         beam_x = (random() - 0.5) * 10
         beam_y = (random() - 0.5) * 10
+        beam_yaw = uniform(-self.reset_beam_yaw_range_deg, self.reset_beam_yaw_range_deg)

         for _ in range(5):
-            self.Player.server.receive()
-            self.Player.world.update()
+            self._safe_receive_world_update(retries=2)
             self.Player.robot.commit_motor_targets_pd()
-            self.Player.server.commit_beam(pos2d=(beam_x, beam_y), rotation=0)
+            self.Player.server.commit_beam(pos2d=(beam_x, beam_y), rotation=beam_yaw)
             self.Player.server.send()

         # Run the Neutral skill to completion so the robot has time to stand stably at the beam position
         finished_count = 0
-        for _ in range(20):
+        for _ in range(50):
             finished = self.Player.skills_manager.execute("Neutral")
             self.sync()
             if finished:
                 finished_count += 1
-                if finished_count >= 2:  # assume 2 consecutive completions count as success
+                if finished_count >= 20:  # assume 20 consecutive completions count as success
                     break

-        # neutral_joint_positions = np.deg2rad(
-        #     [self.Player.robot.motor_positions[motor] for motor in self.Player.robot.ROBOT_MOTORS]
-        # )
-        # reliable_neutral, neutral_leg_norm, neutral_leg_max, neutral_height = self.is_reliable_neutral_pose(neutral_joint_positions)
-        # if self.auto_calibrate_train_sim_flip and reliable_neutral and not self.flip_calibrated_once:
-        #     self.calibrate_train_sim_flip_from_neutral(neutral_joint_positions)
-        #     self.flip_calibrated_once = True
-        # if self.calibrate_nominal_from_neutral and reliable_neutral and not self.nominal_calibrated_once:
-        #     self.joint_nominal_position = neutral_joint_positions * self.train_sim_flip
-        #     self.nominal_calibrated_once = True
-        # self.debug_log(
-        #     "[ResetDebug] "
-        #     f"neutral_pos={np.round(self.Player.world.global_position, 3).tolist()} "
-        #     f"shoulders={np.round(neutral_joint_positions[2:10], 3).tolist()} "
-        #     f"legs={np.round(neutral_joint_positions[11:], 3).tolist()} "
-        #     f"flip={self.train_sim_flip.tolist()} "
-        #     f"nominal_legs={np.round(self.joint_nominal_position[11:], 3).tolist()} "
-        #     f"calibrated_once={(self.flip_calibrated_once, self.nominal_calibrated_once)} "
-        #     f"reliable_neutral={reliable_neutral} "
-        #     f"leg_norm={neutral_leg_norm:.3f} leg_max={neutral_leg_max:.3f} height={neutral_height:.3f}"
-        # )
-        # reset_action_noise = np.random.uniform(-0.015, 0.015, size=(len(self.Player.robot.ROBOT_MOTORS),))
-        # self.target_joint_positions = (self.joint_nominal_position + reset_action_noise) * self.train_sim_flip
-        # for idx, target in enumerate(self.target_joint_positions):
-        #     r.set_motor_target_position(
-        #         r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6
-        #     )
+        if self.enable_reset_perturb and self.reset_joint_noise_rad > 0.0:
+            perturb_action = np.zeros(self.no_of_actions, dtype=np.float32)
+            # Perturb waist + lower body only (10:), keep head/arms stable.
+            perturb_action[10:] = np.random.uniform(
+                -self.reset_joint_noise_rad,
+                self.reset_joint_noise_rad,
+                size=(self.no_of_actions - 10,)
+            )
+            for _ in range(self.reset_perturb_steps):
+                target_joint_positions = (self.joint_nominal_position + perturb_action) * self.train_sim_flip
+                for idx, target in enumerate(target_joint_positions):
+                    r.set_motor_target_position(
+                        r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=25, kd=0.6
+                    )
+                self.sync()
+            for i in range(self.reset_recover_steps):
+                # Linearly fade perturbation to help policy start from near-neutral.
+                alpha = 1.0 - float(i + 1) / float(self.reset_recover_steps)
+                target_joint_positions = (self.joint_nominal_position + alpha * perturb_action) * self.train_sim_flip
+                for idx, target in enumerate(target_joint_positions):
+                    r.set_motor_target_position(
+                        r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=25, kd=0.6
+                    )
+                self.sync()

         # memory variables
+        self.sync()
         self.initial_position = np.array(self.Player.world.global_position[:2])
         self.previous_pos = self.initial_position.copy()  # Critical: set to actual position
-        self.act = np.zeros(self.no_of_actions ,np.float32)
+        self.act = np.zeros(self.no_of_actions, np.float32)

-        point1 = self.initial_position + np.array([length1, 0])
+        # Build target in the robot's current forward direction instead of fixed global +x.
+        heading_deg = float(r.global_orientation_euler[2])
+        forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False)
+        point1 = self.initial_position + forward_offset
         point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False)
         point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False)
-        self.point_list = [point1, point2, point3]
+        self.point_list = [point1]
         self.target_position = self.point_list[self.waypoint_index]
+        self.initial_height = self.Player.world.global_position[2]

         return self.observe(True), {}
@@ -379,97 +391,132 @@ class WalkEnv(gym.Env):
         return

     def compute_reward(self, previous_pos, current_pos, action):
-        velocity = current_pos - previous_pos
-        velocity_magnitude = np.linalg.norm(velocity)
-        direction_to_target = self.target_position - current_pos
-        prev_direction_to_target = self.target_position - previous_pos
-        distance_to_target = np.linalg.norm(direction_to_target)
-        prev_distance_to_target = np.linalg.norm(prev_direction_to_target)
-
-        progress_reward = np.clip((prev_distance_to_target - distance_to_target) * 30.0, -2.0, 4.0)
-
-        velocity_in_m_per_sec = velocity_magnitude / 0.05
-        speed_reward = np.clip(velocity_in_m_per_sec * 1.5, 0.0, 1.5)
-
-        if velocity_magnitude > 1e-4 and distance_to_target > 1e-4:
-            directional_alignment = np.dot(velocity, direction_to_target) / (velocity_magnitude * distance_to_target)
-            directional_alignment = np.clip(directional_alignment, -1.0, 1.0)
-            direction_reward = max(0.0, directional_alignment)
-        else:
-            direction_reward = 0.0
-
-        alive_bonus = 0.05
-
-        height = self.Player.world.global_position[2]
-        if 0.45 <= height <= 1.2:
-            height_reward = 1.5
-        else:
-            height_reward = -6.0
-
-        motionless_penalty = -1.5 if velocity_magnitude < 0.003 else 0.0
-
-        waypoint_bonus = 0.0
-        if distance_to_target < 0.5:
-            waypoint_bonus = 25.0
-            if self.waypoint_index < len(self.point_list) - 1:
-                self.waypoint_index += 1
-                self.target_position = self.point_list[self.waypoint_index]
-            else:
-                waypoint_bonus = 100.0
-                self.route_completed = True
-
-        action_magnitude = np.linalg.norm(action[11:])
-        action_penalty = -0.08 * action_magnitude
-
-        tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0
-
-        return (
-            progress_reward
-            + speed_reward
-            + direction_reward
-            + alive_bonus
-            + height_reward
-            + motionless_penalty
-            + waypoint_bonus
-            + action_penalty
-            + tilt_penalty
-        )
+        height = float(self.Player.world.global_position[2])
+
+        orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
+        projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
+        tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
+        ang_vel = np.deg2rad(self.Player.robot.gyroscope)
+        ang_vel_mag = float(np.linalg.norm(ang_vel))
+
+        is_fallen = height < 0.3
+        if is_fallen:
+            # remain = max(0, 800 - self.step_counter)
+            # return -8.0 - 0.01 * remain
+            return -1.0
+
+        # # Target direction
+        # to_target = self.target_position - current_pos
+        # dist_to_target = float(np.linalg.norm(to_target))
+        # if dist_to_target < 0.5:
+        #     return 15.0
+        # forward_dir = to_target / dist_to_target if dist_to_target > 0.1 else np.array([1.0, 0.0])
+        # delta_pos = current_pos - previous_pos
+        # forward_step = float(np.dot(delta_pos, forward_dir))
+        # lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
+
+        # Reward terms
+        # progress_reward = 2 * forward_step
+        # lateral_penalty = -0.1 * lateral_step
+        alive_bonus = 2.0
+
+        # action_penalty = -0.01 * float(np.linalg.norm(action))
+        smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward))
+
+        posture_penalty = -0.3 * (tilt_mag)
+        ang_vel_penalty = -0.02 * ang_vel_mag
+
+        target_height = self.initial_height
+        height_error = height - target_height
+        height_penalty = -0.5 * abs(height_error)  # penalize height deviation; coefficient is tunable
+
+        # # Near the top of compute_reward, add a height-rate computation
+        # if not hasattr(self, 'last_height'):
+        #     self.last_height = height
+        #     self.last_height_time = self.step_counter  # optional, for time intervals
+        # height_rate = height - self.last_height  # positive = rising, negative = falling
+        # self.last_height = height
+        # Penalize drops in height (negative rate of change)
+        # height_down_penalty = -5.0 * max(0, -height_rate)  # coefficient tunable; positive -height_rate means falling
+
+        # # In compute_reward
+        # if self.step_counter > 50:
+        #     avg_prev_action = np.mean(self.prev_action_history, axis=0)
+        #     novelty = float(np.linalg.norm(action - avg_prev_action))
+        #     exploration_bonus = 0.05 * novelty
+        # else:
+        #     exploration_bonus = 0
+        # self.prev_action_history[self.history_idx] = action
+        # self.history_idx = (self.history_idx + 1) % 50
+
+        total = (
+            # progress_reward +
+            alive_bonus +
+            # lateral_penalty +
+            # action_penalty +
+            smoothness_penalty +
+            posture_penalty
+            + ang_vel_penalty
+            + height_penalty
+            # + exploration_bonus
+            # + height_down_penalty
+        )
+        if time.time() - self.start_time >= 1200:
+            self.start_time = time.time()
+            print(
+                # f"progress_reward:{progress_reward:.4f}",
+                # f"lateral_penalty:{lateral_penalty:.4f}",
+                # f"action_penalty:{action_penalty:.4f}",
+                f"height_penalty:{height_penalty:.4f}",
+                f"smoothness_penalty:{smoothness_penalty:.4f},",
+                f"posture_penalty:{posture_penalty:.4f}",
+                # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
+                # f"height_down_penalty:{height_down_penalty:.4f}",
+                # f"exploration_bonus:{exploration_bonus:.4f}"
+            )
+        return total

     def step(self, action):
         r = self.Player.robot

         self.previous_action = action
         self.target_joint_positions = (
-            self.joint_nominal_position
-            + self.scaling_factor * action
+            # self.joint_nominal_position +
+            self.scaling_factor * action
         )
         self.target_joint_positions *= self.train_sim_flip

         for idx, target in enumerate(self.target_joint_positions):
             r.set_motor_target_position(
-                r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6
+                r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=40, kd=1.0
             )
+        self.previous_action = action

         self.sync()  # run simulation step
         self.step_counter += 1

-        # if self.step_counter % self.debug_every_n_steps == 0:
-        #     self.debug_joint_status()
+        if self.enable_debug_joint_status and self.step_counter % self.debug_every_n_steps == 0:
+            self.debug_joint_status()

         current_pos = np.array(self.Player.world.global_position[:2], dtype=np.float32)

         # Compute reward based on movement from previous step
         reward = self.compute_reward(self.previous_pos, current_pos, action)

         # Update previous position
         self.previous_pos = current_pos.copy()
+        self.last_action_for_reward = action.copy()

         # Fall detection and penalty
         is_fallen = self.Player.world.global_position[2] < 0.3
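For readers skimming the diff: the new compute_reward() boils down to a few shaped terms. A standalone sketch with the state passed in explicitly, coefficients copied from the new code above:

import numpy as np

def shaped_reward(height, initial_height, tilt_mag, ang_vel_mag, action, last_action):
    if height < 0.3:  # fall -> fixed penalty; the episode then terminates
        return -1.0
    alive_bonus = 2.0
    smoothness_penalty = -0.01 * float(np.linalg.norm(action - last_action))
    posture_penalty = -0.3 * tilt_mag        # xy-magnitude of projected gravity
    ang_vel_penalty = -0.02 * ang_vel_mag    # gyro norm in rad/s
    height_penalty = -0.5 * abs(height - initial_height)
    return (alive_bonus + smoothness_penalty + posture_penalty
            + ang_vel_penalty + height_penalty)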
@@ -481,57 +528,66 @@ class Train_Base():
         return self.observe(), reward, terminated, truncated, {}

 class Train(Train_Base):
     def __init__(self, script) -> None:
         super().__init__(script)

     def train(self, args):
         # --------------------------------------- Learning parameters
-        n_envs = 8  # Reduced from 8 to decrease CPU/network pressure during init
-        n_steps_per_env = 512  # RolloutBuffer is of size (n_steps_per_env * n_envs)
-        minibatch_size = 64  # should be a factor of (n_steps_per_env * n_envs)
+        n_envs = int(os.environ.get("GYM_CPU_N_ENVS", "20"))
+        if n_envs < 1:
+            raise ValueError("GYM_CPU_N_ENVS must be >= 1")
+        server_warmup_sec = float(os.environ.get("GYM_CPU_SERVER_WARMUP_SEC", "3.0"))
+        n_steps_per_env = int(os.environ.get("GYM_CPU_TRAIN_STEPS_PER_ENV", "256"))  # RolloutBuffer is of size (n_steps_per_env * n_envs)
+        minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512"))  # should be a factor of (n_steps_per_env * n_envs)
         total_steps = 30000000
-        learning_rate = 3e-4
+        learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
         folder_name = f'Walk_R{self.robot_type}'
         model_path = f'./scripts/gyms/logs/{folder_name}/'

         print(f"Model path: {model_path}")
         print(f"Using {n_envs} parallel environments")

         # --------------------------------------- Run algorithm
-        def init_env(i_env):
+        def init_env(i_env, monitor=False):
             def thunk():
-                return WalkEnv( self.ip , self.server_p + i_env)
+                env = WalkEnv(self.ip, self.server_p + i_env)
+                if monitor:
+                    env = Monitor(env)
+                return env
             return thunk

-        servers = Train_Server( self.server_p, self.monitor_p_1000, n_envs+1 )  # include 1 extra server for testing
+        server_log_dir = os.path.join(model_path, "server_logs")
+        os.makedirs(server_log_dir, exist_ok=True)
+        servers = Train_Server(self.server_p, self.monitor_p_1000, n_envs + 1, no_render=True, no_realtime=True)  # include 1 extra server for testing

         # Wait for servers to start
-        print(f"Starting {n_envs+1} rcssservermj servers...")
+        print(f"Starting {n_envs + 1} rcssservermj servers...")
+        if server_warmup_sec > 0:
+            print(f"Waiting {server_warmup_sec:.1f}s for server warmup...")
+            sleep(server_warmup_sec)
         print("Servers started, creating environments...")

-        env = SubprocVecEnv( [init_env(i) for i in range(n_envs)] )
-        eval_env = SubprocVecEnv( [init_env(n_envs)] )
+        env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)])
+        # Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
+        eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])

         try:
             # Custom policy network architecture
             policy_kwargs = dict(
                 net_arch=dict(
-                    pi=[256, 256, 128],  # Policy network: 3 layers
-                    vf=[256, 256, 128]   # Value network: 3 layers
+                    pi=[512, 256, 128],  # Policy network: 3 layers
+                    vf=[512, 256, 128]   # Value network: 3 layers
                 ),
-                activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU,
+                activation_fn=__import__('torch.nn', fromlist=['ELU']).ELU,
             )

             if "model_file" in args:  # retrain
-                model = PPO.load( args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env, batch_size=minibatch_size, learning_rate=learning_rate )
+                model = PPO.load(args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env,
+                                 batch_size=minibatch_size, learning_rate=learning_rate)
             else:  # train new model
                 model = PPO(
                     "MlpPolicy",
                     env=env,
@@ -541,15 +597,20 @@ class Train(Train_Base):
learning_rate=learning_rate,
device="cpu",
policy_kwargs=policy_kwargs,
ent_coef=float(os.environ.get("GYM_CPU_TRAIN_ENT_COEF", "0.05")),  # Entropy coefficient for exploration
clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")),  # PPO clipping parameter
gae_lambda=0.95,  # GAE lambda
gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")),  # Discount factor
target_kl=0.03,  # early-stop policy updates once the approximate KL grows too large
n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")),
# tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
)
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20,
backup_env_file=__file__)
except KeyboardInterrupt:
sleep(1)  # wait for child processes
print("\nctrl+c pressed, aborting...\n")
servers.kill()
return
@@ -558,17 +619,28 @@ class Train(Train_Base):
eval_env.close()
servers.kill()
def test(self, args):
# Uses different server and monitor ports
server_log_dir = os.path.join(args["folder_dir"], "server_logs")
os.makedirs(server_log_dir, exist_ok=True)
test_no_render = os.environ.get("GYM_CPU_TEST_NO_RENDER", "0") == "1"
test_no_realtime = os.environ.get("GYM_CPU_TEST_NO_REALTIME", "0") == "1"
server = Train_Server(
self.server_p - 1,
self.monitor_p,
1,
no_render=test_no_render,
no_realtime=test_no_realtime,
)
env = WalkEnv(self.ip, self.server_p - 1)
model = PPO.load(args["model_file"], env=env)
try:
self.export_model(args["model_file"], args["model_file"] + ".pkl", False)  # Export to pkl to create custom behavior
self.test_model(model, env, log_path=args["folder_dir"], model_path=args["folder_dir"])
except KeyboardInterrupt:
print()
@@ -583,13 +655,25 @@ if __name__ == "__main__":
script_args = SimpleNamespace(
args=SimpleNamespace(
i='127.0.0.1',  # Server IP
p=3100,  # Server port
m=3200,  # Monitor port
r=0,  # Robot type
t='Gym',  # Team name
u=1  # Uniform number
)
)
trainer = Train(script_args)
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
if run_mode == "test":
test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip")
test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/")
trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
else:
retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
if retrain_model:
trainer.train({"model_file": retrain_model})
else:
trainer.train({})
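# Note: GYM_CPU_MODE=test takes precedence; GYM_CPU_TRAIN_MODEL is only consulted in train mode.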

Binary file not shown.

126
train.sh Executable file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env bash
set -euo pipefail
# ------------------------------
# Resource limit configuration (cgroup v2 + systemd-run)
# ------------------------------
# Notes:
# 1) This script places the training process in a transient systemd scope and applies CPU/memory caps.
# 2) Only this training run is limited; no permanent system configuration is changed.
# 3) Every variable below can be overridden via the environment, i.e. set it just before the command.
#
# CPU core budget (default 20).
# E.g. if your machine is budgeted at 20 cores, keep the default.
CORES="${CORES:-20}"
# CPU utilization percentage (default 95).
# Multiplied by CORES to produce CPUQuota.
# Example: CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900% (roughly 19 cores' worth).
UTIL_PERCENT="${UTIL_PERCENT:-95}"
CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
# Memory cap (default 28G).
# Lower it to 16G, 24G, etc. so training cannot exhaust system memory.
MEMORY_MAX="${MEMORY_MAX:-28G}"
# ------------------------------
# Training runtime parameters (read by scripts/gyms/Walk.py)
# ------------------------------
# Run mode: train or test
GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
# Number of parallel environments: higher usually means more throughput,
# but also a higher chance of unstable server connections.
# Start around 8-12 and raise to 16/20 once stable.
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"
# Server warmup time (seconds):
# after launching the batch of rcssserver processes, wait before creating the
# SubprocVecEnv; this reduces the chance of ConnectionReset/EOFError.
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC:-10}"
# Training-only parameters
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV:-256}"
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE:-512}"
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR:-1e-4}"
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF:-0.03}"
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE:-0.13}"
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA:-0.95}"
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS:-5}"
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL:-}"
# Test-only parameters
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL:-scripts/gyms/logs/Walk_R0_004/best_model.zip}"
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER:-scripts/gyms/logs/Walk_R0_004/}"
# Tests default to realtime with rendering: both flags default to 0;
# set a flag to 1 to disable the corresponding capability.
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER:-0}"
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME:-0}"
# Python interpreter selection policy:
# 1) prefer an explicitly supplied PYTHON_BIN;
# 2) then the active conda env (CONDA_PREFIX/bin/python);
# 3) then fall back to the default mujoco env path;
# 4) finally try the system python / python3.
DEFAULT_PYTHON="/home/solren/Downloads/Anaconda/envs/mujoco/bin/python"
CONDA_PYTHON="${CONDA_PREFIX:-}/bin/python"
# Safety guard: do not run this script with sudo.
# sudo can desynchronize the conda environment from the user session,
# breaking the python path and hiding the systemd --user session.
if [[ "${EUID}" -eq 0 ]]; then
echo "Do not run this script with sudo; run as your normal user in conda env 'mujoco'."
exit 1
fi
# Resolve the Python executable to use.
if [[ -n "${PYTHON_BIN:-}" ]]; then
PYTHON_EXEC="${PYTHON_BIN}"
elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PYTHON}" ]]; then
PYTHON_EXEC="${CONDA_PYTHON}"
elif [[ -x "${DEFAULT_PYTHON}" ]]; then
PYTHON_EXEC="${DEFAULT_PYTHON}"
elif command -v python >/dev/null 2>&1; then
PYTHON_EXEC="$(command -v python)"
elif command -v python3 >/dev/null 2>&1; then
PYTHON_EXEC="$(command -v python3)"
else
echo "No Python executable found. Set PYTHON_BIN=/abs/path/to/python and retry."
exit 1
fi
# Absolute path of this script's directory, for locating modules/relative paths.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Print the effective configuration to aid debugging and reproducing experiments.
echo "Starting training with limits: CPU=${CPU_QUOTA}, Memory=${MEMORY_MAX}"
echo "Mode: ${GYM_CPU_MODE}"
echo "Runtime knobs: GYM_CPU_N_ENVS=${GYM_CPU_N_ENVS}, GYM_CPU_SERVER_WARMUP_SEC=${GYM_CPU_SERVER_WARMUP_SEC}"
echo "Using Python: ${PYTHON_EXEC}"
if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
fi
# Launch the resource-limited training process via systemd-run --user --scope:
# - CPUQuota: total CPU quota
# - MemoryMax: memory cap
# - env ...  : explicitly pass the training parameters to the Python process
# - python -m scripts.gyms.Walk: start the training entry point as a module
systemd-run --user --scope \
-p CPUQuota="${CPU_QUOTA}" \
-p MemoryMax="${MEMORY_MAX}" \
env \
GYM_CPU_MODE="${GYM_CPU_MODE}" \
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC}" \
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV}" \
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE}" \
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR}" \
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF}" \
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE}" \
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA}" \
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS}" \
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL}" \
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL}" \
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER}" \
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER}" \
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME}" \
"${PYTHON_EXEC}" "-m" "scripts.gyms.Walk"

View File

@@ -47,6 +47,7 @@ class World:
self.their_team_players: list[OtherRobot] = [OtherRobot(is_teammate=False) for _ in
range(self.MAX_PLAYERS_PER_TEAM)]
self.field: Field = self.__initialize_field(field_name=field_name)
self.WORLD_STEPTIME: float = 0.005 # Time step of the world in seconds
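# e.g. 0.005 s per step corresponds to a 200 Hz simulation rate (1 / 0.005).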
def update(self) -> None:
"""