Compare commits

...

8 Commits

Author SHA1 Message Date
xxh
a52cdff013 add no gui, no realtime mode, and train bash script 2026-03-21 08:53:31 -04:00
xxh
ec8b648a3b stand stable 0.1 amendment and add some info 2026-03-21 06:46:55 -04:00
xxh
f99fae68f6 change model and policy 2026-03-14 01:08:22 -04:00
xxh
294fe0bd79 restore 2026-03-13 21:44:59 -04:00
xxh
cf80becd17 restore 2026-03-13 21:38:44 -04:00
xxh
6ab356a947 improve train speed and add speed constrain 2026-03-13 08:51:49 -04:00
3a42120857 Revert "improve training speed and add speed constrain"
This reverts commit 648cf32e9c.
2026-03-13 08:43:28 -04:00
648cf32e9c improve training speed and add speed constrain 2026-03-13 08:40:50 -04:00
9 changed files with 704 additions and 412 deletions

10
.gitignore vendored
View File

@@ -10,3 +10,13 @@ poetry.toml
**/log/ **/log/
*.spec *.spec
dist/ dist/
*steps.zip
*.pkl
best_model.zip
*.csv
*.npz
*.xml
*.json
*.yaml
*.iml
*.TXT

14
command.md Normal file
View File

@@ -0,0 +1,14 @@
训练(默认)
bash train.sh
测试(实时+显示画面)
GYM_CPU_MODE=test GYM_CPU_TEST_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip GYM_CPU_TEST_FOLDER=scripts/gyms/logs/Walk_R0_005/ GYM_CPU_TEST_NO_RENDER=0 GYM_CPU_TEST_NO_REALTIME=0 bash train.sh
测试(无画面、非实时)
GYM_CPU_MODE=test GYM_CPU_TEST_NO_RENDER=1 GYM_CPU_TEST_NO_REALTIME=1 bash train.sh
retrain继续训练
GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh
retrain+改训练超参
GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh

View File

@@ -1,5 +1,6 @@
import logging import logging
import socket import socket
import time
from select import select from select import select
from communication.world_parser import WorldParser from communication.world_parser import WorldParser
@@ -10,15 +11,27 @@ class Server:
def __init__(self, host: str, port: int, world_parser: WorldParser): def __init__(self, host: str, port: int, world_parser: WorldParser):
self.world_parser: WorldParser = world_parser self.world_parser: WorldParser = world_parser
self.__host: str = host self.__host: str = host
self.__port: str = port self.__port: int = port
self.__socket: socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.__socket: socket.socket = self._create_socket()
self.__socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
self.__send_buff = [] self.__send_buff = []
self.__rcv_buffer_size = 1024 self.__rcv_buffer_size = 1024
self.__rcv_buffer = bytearray(self.__rcv_buffer_size) self.__rcv_buffer = bytearray(self.__rcv_buffer_size)
def _create_socket(self) -> socket.socket:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
return sock
def connect(self) -> None: def connect(self) -> None:
logger.info("Connecting to server at %s:%d...", self.__host, self.__port) logger.info("Connecting to server at %s:%d...", self.__host, self.__port)
# Always reconnect with a fresh socket object.
try:
self.__socket.close()
except OSError:
pass
self.__socket = self._create_socket()
while True: while True:
try: try:
self.__socket.connect((self.__host, self.__port)) self.__socket.connect((self.__host, self.__port))
@@ -27,12 +40,19 @@ class Server:
logger.error( logger.error(
"Connection refused. Make sure the server is running and listening on {self.__host}:{self.__port}." "Connection refused. Make sure the server is running and listening on {self.__host}:{self.__port}."
) )
time.sleep(0.05)
logger.info(f"Server connection established to {self.__host}:{self.__port}.") logger.info(f"Server connection established to {self.__host}:{self.__port}.")
def shutdown(self) -> None: def shutdown(self) -> None:
self.__socket.close() try:
self.__socket.shutdown(socket.SHUT_RDWR) self.__socket.shutdown(socket.SHUT_RDWR)
except OSError:
pass
try:
self.__socket.close()
except OSError:
pass
def send_immediate(self, msg: str) -> None: def send_immediate(self, msg: str) -> None:
""" """
@@ -72,37 +92,37 @@ class Server:
self.commit(msg) self.commit(msg)
self.send() self.send()
def receive(self) -> None: def receive(self):
"""
Receive the next message from the TCP/IP socket and updates world
"""
# Receive message length information while True:
if (
self.__socket.recv_into( if (
self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL self.__socket.recv_into(
self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL
) != 4
):
raise ConnectionResetError
msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False)
if msg_size > self.__rcv_buffer_size:
self.__rcv_buffer_size = msg_size
self.__rcv_buffer = bytearray(self.__rcv_buffer_size)
if (
self.__socket.recv_into(
self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL
) != msg_size
):
raise ConnectionResetError
self.world_parser.parse(
message=self.__rcv_buffer[:msg_size].decode()
) )
!= 4
):
raise ConnectionResetError
msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False) # 如果socket没有更多数据就退出
if len(select([self.__socket], [], [], 0.0)[0]) == 0:
# Ensure receive buffer is large enough to hold the message break
if msg_size > self.__rcv_buffer_size:
self.__rcv_buffer_size = msg_size
self.__rcv_buffer = bytearray(self.__rcv_buffer_size)
# Receive message with the specified length
if (
self.__socket.recv_into(
self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL
)
!= msg_size
):
raise ConnectionResetError
self.world_parser.parse(message=self.__rcv_buffer[:msg_size].decode())
def commit_beam(self, pos2d: list, rotation: float) -> None: def commit_beam(self, pos2d: list, rotation: float) -> None:
assert len(pos2d) == 2 assert len(pos2d) == 2

View File

@@ -1,15 +1,16 @@
import subprocess import subprocess
import os import os
import time
class Server(): class Server():
def __init__(self, first_server_p, first_monitor_p, n_servers) -> None: def __init__(self, first_server_p, first_monitor_p, n_servers, no_render=True, no_realtime=True) -> None:
try: try:
import psutil import psutil
self.check_running_servers(psutil, first_server_p, first_monitor_p, n_servers) self.check_running_servers(psutil, first_server_p, first_monitor_p, n_servers)
except ModuleNotFoundError: except ModuleNotFoundError:
print("Info: Cannot check if the server is already running, because the psutil module was not found") print("Info: Cannot check if the server is already running, because the psutil module was not found")
self.first_server_p = first_server_p self.first_server_p = first_server_p
self.n_servers = n_servers self.n_servers = n_servers
self.rcss_processes = [] self.rcss_processes = []
@@ -17,30 +18,50 @@ class Server():
# makes it easier to kill test servers without affecting train servers # makes it easier to kill test servers without affecting train servers
cmd = "rcssservermj" cmd = "rcssservermj"
render_arg = "--no-render" if no_render else ""
realtime_arg = "--no-realtime" if no_realtime else ""
for i in range(n_servers): for i in range(n_servers):
self.rcss_processes.append( port = first_server_p + i
subprocess.Popen((f"{cmd} --aport {first_server_p+i} --mport {first_monitor_p+i}").split(), mport = first_monitor_p + i
stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, start_new_session=True)
server_cmd = f"{cmd} -c {port} -m {mport} {render_arg} {realtime_arg}".strip()
proc = subprocess.Popen(
server_cmd.split(),
stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT,
start_new_session=True
) )
# Avoid startup storm when launching many servers at once.
time.sleep(0.03)
rc = proc.poll()
if rc is not None:
raise RuntimeError(
f"rcssservermj exited early (code={rc}) on server port {port}, monitor port {mport}"
)
self.rcss_processes.append(proc)
def check_running_servers(self, psutil, first_server_p, first_monitor_p, n_servers): def check_running_servers(self, psutil, first_server_p, first_monitor_p, n_servers):
''' Check if any server is running on chosen ports ''' ''' Check if any server is running on chosen ports '''
found = False found = False
p_list = [p for p in psutil.process_iter() if p.cmdline() and "rcssservermj" in " ".join(p.cmdline())] p_list = [p for p in psutil.process_iter() if p.cmdline() and "rcssservermj" in " ".join(p.cmdline())]
range1 = (first_server_p, first_server_p + n_servers) range1 = (first_server_p, first_server_p + n_servers)
range2 = (first_monitor_p,first_monitor_p + n_servers) range2 = (first_monitor_p, first_monitor_p + n_servers)
bad_processes = [] bad_processes = []
for p in p_list: for p in p_list:
# currently ignoring remaining default port when only one of the ports is specified (uncommon scenario) # currently ignoring remaining default port when only one of the ports is specified (uncommon scenario)
ports = [int(arg) for arg in p.cmdline()[1:] if arg.isdigit()] ports = [int(arg) for arg in p.cmdline()[1:] if arg.isdigit()]
if len(ports) == 0: if len(ports) == 0:
ports = [60000,60100] # default server ports (changing this is unlikely) ports = [60000, 60100] # default server ports (changing this is unlikely)
conflicts = [str(port) for port in ports if ( conflicts = [str(port) for port in ports if (
(range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]) )] (range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]))]
if len(conflicts)>0: if len(conflicts) > 0:
if not found: if not found:
print("\nThere are already servers running on the same port(s)!") print("\nThere are already servers running on the same port(s)!")
found = True found = True
@@ -55,9 +76,8 @@ class Server():
for p in bad_processes: for p in bad_processes:
p.kill() p.kill()
return return
def kill(self): def kill(self):
for p in self.rcss_processes: for p in self.rcss_processes:
p.kill() p.kill()
print(f"Killed {self.n_servers} rcssservermj processes starting at {self.first_server_p}") print(f"Killed {self.n_servers} rcssservermj processes starting at {self.first_server_p}")

View File

@@ -31,31 +31,29 @@ class Train_Base():
args = script.args args = script.args
self.script = script self.script = script
self.ip = args.i self.ip = args.i
self.server_p = args.p # (initial) server port self.server_p = args.p # (initial) server port
self.monitor_p = args.m + 100 # monitor port when testing self.monitor_p = args.m + 100 # monitor port when testing
self.monitor_p_1000 = args.m + 1100 # initial monitor port when training self.monitor_p_1000 = args.m + 1100 # initial monitor port when training
self.robot_type = args.r self.robot_type = args.r
self.team = args.t self.team = args.t
self.uniform = args.u self.uniform = args.u
self.cf_last_time = 0 self.cf_last_time = 0
self.cf_delay = 0 self.cf_delay = 0
# self.cf_target_period = World.STEPTIME # target simulation speed while testing (default: real-time) # self.cf_target_period = World.STEPTIME # target simulation speed while testing (default: real-time)
@staticmethod @staticmethod
def prompt_user_for_model(self): def prompt_user_for_model(self):
gyms_logs_path = "./mujococodebase/scripts/gyms/logs/" gyms_logs_path = "./scripts/gyms/logs/"
folders = [f for f in listdir(gyms_logs_path) if isdir(join(gyms_logs_path, f))] folders = [f for f in listdir(gyms_logs_path) if isdir(join(gyms_logs_path, f))]
folders.sort(key=lambda f: os.path.getmtime(join(gyms_logs_path, f)), reverse=True) # sort by modification date folders.sort(key=lambda f: os.path.getmtime(join(gyms_logs_path, f)), reverse=True) # sort by modification date
while True: while True:
try: try:
folder_name = UI.print_list(folders,prompt="Choose folder (ctrl+c to return): ")[1] folder_name = UI.print_list(folders, prompt="Choose folder (ctrl+c to return): ")[1]
except KeyboardInterrupt: except KeyboardInterrupt:
print() print()
return None # ctrl+c return None # ctrl+c
folder_dir = os.path.join(gyms_logs_path, folder_name) folder_dir = os.path.join(gyms_logs_path, folder_name)
models = [m[:-4] for m in listdir(folder_dir) if isfile(join(folder_dir, m)) and m.endswith(".zip")] models = [m[:-4] for m in listdir(folder_dir) if isfile(join(folder_dir, m)) and m.endswith(".zip")]
@@ -64,16 +62,17 @@ class Train_Base():
print("The chosen folder does not contain any .zip file!") print("The chosen folder does not contain any .zip file!")
continue continue
models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m+".zip")), reverse=True) # sort by modification date models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m + ".zip")),
reverse=True) # sort by modification date
try: try:
model_name = UI.print_list(models,prompt="Choose model (ctrl+c to return): ")[1] model_name = UI.print_list(models, prompt="Choose model (ctrl+c to return): ")[1]
break break
except KeyboardInterrupt: except KeyboardInterrupt:
print() print()
return {"folder_dir":folder_dir, "folder_name":folder_name, "model_file":os.path.join(folder_dir, model_name+".zip")} return {"folder_dir": folder_dir, "folder_name": folder_name,
"model_file": os.path.join(folder_dir, model_name + ".zip")}
# def control_fps(self, read_input = False): # def control_fps(self, read_input = False):
# ''' Add delay to control simulation speed ''' # ''' Add delay to control simulation speed '''
@@ -87,7 +86,7 @@ class Train_Base():
# if speed == '0': # if speed == '0':
# inp = input("Paused. Set new speed or '' to use previous speed:") # inp = input("Paused. Set new speed or '' to use previous speed:")
# if inp != '': # if inp != '':
# speed = inp # speed = inp
# try: # try:
# speed = int(speed) # speed = int(speed)
@@ -95,7 +94,7 @@ class Train_Base():
# self.cf_target_period = World.STEPTIME * 100 / speed # self.cf_target_period = World.STEPTIME * 100 / speed
# print(f"Changed simulation speed to {speed}%") # print(f"Changed simulation speed to {speed}%")
# except: # except:
# print("""Train_Base.py: # print("""Train_Base.py:
# Error: To control the simulation speed, enter a non-negative integer. # Error: To control the simulation speed, enter a non-negative integer.
# To disable this control module, use test_model(..., enable_FPS_control=False) in your gyms environment.""") # To disable this control module, use test_model(..., enable_FPS_control=False) in your gyms environment.""")
@@ -108,15 +107,15 @@ class Train_Base():
# else: # else:
# self.cf_delay = 0 # self.cf_delay = 0
def test_model(self, model: BaseAlgorithm, env, log_path: str = None, model_path: str = None, max_episodes=0,
def test_model(self, model:BaseAlgorithm, env, log_path:str=None, model_path:str=None, max_episodes=0, enable_FPS_control=True, verbose=1): enable_FPS_control=True, verbose=1):
''' '''
Test model and log results Test model and log results
Parameters Parameters
---------- ----------
model : BaseAlgorithm model : BaseAlgorithm
Trained model Trained model
env : Env env : Env
Gym-like environment Gym-like environment
log_path : str log_path : str
@@ -147,12 +146,12 @@ class Train_Base():
break break
else: else:
log_path += "/test.csv" log_path += "/test.csv"
with open(log_path, 'w') as f: with open(log_path, 'w') as f:
f.write("reward,ep. length,rew. cumulative avg., ep. len. cumulative avg.\n") f.write("reward,ep. length,rew. cumulative avg., ep. len. cumulative avg.\n")
print("Train statistics are saved to:", log_path) print("Train statistics are saved to:", log_path)
if enable_FPS_control: # control simulation speed (using non blocking user input) if enable_FPS_control: # control simulation speed (using non blocking user input)
print("\nThe simulation speed can be changed by sending a non-negative integer\n" print("\nThe simulation speed can be changed by sending a non-negative integer\n"
"(e.g. '50' sets speed to 50%, '0' pauses the simulation, '' sets speed to MAX)\n") "(e.g. '50' sets speed to 50%, '0' pauses the simulation, '' sets speed to MAX)\n")
@@ -172,8 +171,8 @@ class Train_Base():
ep_reward += reward ep_reward += reward
ep_length += 1 ep_length += 1
if enable_FPS_control: # control simulation speed (using non blocking user input) # if enable_FPS_control: # control simulation speed (using non blocking user input)
self.control_fps(select.select([sys.stdin], [], [], 0)[0]) # self.control_fps(select.select([sys.stdin], [], [], 0)[0])
if done: if done:
obs, _ = env.reset() obs, _ = env.reset()
@@ -182,25 +181,28 @@ class Train_Base():
reward_max = max(ep_reward, reward_max) reward_max = max(ep_reward, reward_max)
reward_min = min(ep_reward, reward_min) reward_min = min(ep_reward, reward_min)
ep_no += 1 ep_no += 1
avg_ep_lengths = ep_lengths_sum/ep_no avg_ep_lengths = ep_lengths_sum / ep_no
avg_rewards = rewards_sum/ep_no avg_rewards = rewards_sum / ep_no
if verbose > 0: if verbose > 0:
print( f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n", print(
end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})", flush=True) f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n",
end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})",
flush=True)
if log_path is not None: if log_path is not None:
with open(log_path, 'a') as f: with open(log_path, 'a') as f:
writer = csv.writer(f) writer = csv.writer(f)
writer.writerow([ep_reward, ep_length, avg_rewards, avg_ep_lengths]) writer.writerow([ep_reward, ep_length, avg_rewards, avg_ep_lengths])
if ep_no == max_episodes: if ep_no == max_episodes:
return return
ep_reward = 0 ep_reward = 0
ep_length = 0 ep_length = 0
def learn_model(self, model:BaseAlgorithm, total_steps:int, path:str, eval_env=None, eval_freq=None, eval_eps=5, save_freq=None, backup_env_file=None, export_name=None): def learn_model(self, model: BaseAlgorithm, total_steps: int, path: str, eval_env=None, eval_freq=None, eval_eps=5,
save_freq=None, backup_env_file=None, export_name=None):
''' '''
Learn Model for a specific number of time steps Learn Model for a specific number of time steps
@@ -251,7 +253,7 @@ class Train_Base():
# If path already exists, add suffix to avoid overwriting # If path already exists, add suffix to avoid overwriting
if os.path.isdir(path): if os.path.isdir(path):
for i in count(): for i in count():
p = path.rstrip("/")+f'_{i:03}/' p = path.rstrip("/") + f'_{i:03}/'
if not os.path.isdir(p): if not os.path.isdir(p):
path = p path = p
break break
@@ -265,22 +267,28 @@ class Train_Base():
evaluate = bool(eval_env is not None and eval_freq is not None) evaluate = bool(eval_env is not None and eval_freq is not None)
# Create evaluation callback # Create evaluation callback
eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq, log_path=path, eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq,
best_model_save_path=path, deterministic=True, render=False) log_path=path,
best_model_save_path=path, deterministic=True,
render=False)
# Create custom callback to display evaluations # Create custom callback to display evaluations
custom_callback = None if not evaluate else Cyclic_Callback(eval_freq, lambda:self.display_evaluations(path,True)) custom_callback = None if not evaluate else Cyclic_Callback(eval_freq,
lambda: self.display_evaluations(path, True))
# Create checkpoint callback # Create checkpoint callback
checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path, name_prefix="model", verbose=1) checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path,
name_prefix="model", verbose=1)
# Create custom callback to export checkpoint models # Create custom callback to export checkpoint models
export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path, export_name) export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path,
export_name)
callbacks = CallbackList([c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None]) callbacks = CallbackList(
[c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None])
model.learn( total_timesteps=total_steps, callback=callbacks ) model.learn(total_timesteps=total_steps, callback=callbacks)
model.save( os.path.join(path, "last_model") ) model.save(os.path.join(path, "last_model"))
# Display evaluations if they exist # Display evaluations if they exist
if evaluate: if evaluate:
@@ -288,18 +296,18 @@ class Train_Base():
# Display timestamps + Model path # Display timestamps + Model path
end_date = datetime.now().strftime('%d/%m/%Y %H:%M:%S') end_date = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
duration = timedelta(seconds=int(time.time()-start)) duration = timedelta(seconds=int(time.time() - start))
print(f"Train start: {start_date}") print(f"Train start: {start_date}")
print(f"Train end: {end_date}") print(f"Train end: {end_date}")
print(f"Train duration: {duration}") print(f"Train duration: {duration}")
print(f"Model path: {path}") print(f"Model path: {path}")
# Append timestamps to backup environment file # Append timestamps to backup environment file
if backup_env_file is not None: if backup_env_file is not None:
with open(backup_file, 'a') as f: with open(backup_file, 'a') as f:
f.write(f"\n# Train start: {start_date}\n") f.write(f"\n# Train start: {start_date}\n")
f.write( f"# Train end: {end_date}\n") f.write(f"# Train end: {end_date}\n")
f.write( f"# Train duration: {duration}") f.write(f"# Train duration: {duration}")
return path return path
@@ -318,50 +326,57 @@ class Train_Base():
with np.load(eval_npz) as data: with np.load(eval_npz) as data:
time_steps = data["timesteps"] time_steps = data["timesteps"]
results_raw = np.mean(data["results"],axis=1) results_raw = np.mean(data["results"], axis=1)
ep_lengths_raw = np.mean(data["ep_lengths"],axis=1) ep_lengths_raw = np.mean(data["ep_lengths"], axis=1)
sample_no = len(results_raw) sample_no = len(results_raw)
xvals = np.linspace(0, sample_no-1, 80) xvals = np.linspace(0, sample_no - 1, 80)
results = np.interp(xvals, range(sample_no), results_raw) results = np.interp(xvals, range(sample_no), results_raw)
ep_lengths = np.interp(xvals, range(sample_no), ep_lengths_raw) ep_lengths = np.interp(xvals, range(sample_no), ep_lengths_raw)
results_limits = np.min(results), np.max(results) results_limits = np.min(results), np.max(results)
ep_lengths_limits = np.min(ep_lengths), np.max(ep_lengths) ep_lengths_limits = np.min(ep_lengths), np.max(ep_lengths)
results_discrete = np.digitize(results, np.linspace(results_limits[0]-1e-5, results_limits[1]+1e-5, console_height+1))-1 results_discrete = np.digitize(results, np.linspace(results_limits[0] - 1e-5, results_limits[1] + 1e-5,
ep_lengths_discrete = np.digitize(ep_lengths, np.linspace(0, ep_lengths_limits[1]+1e-5, console_height+1))-1 console_height + 1)) - 1
ep_lengths_discrete = np.digitize(ep_lengths,
np.linspace(0, ep_lengths_limits[1] + 1e-5, console_height + 1)) - 1
matrix = np.zeros((console_height, console_width, 2), int) matrix = np.zeros((console_height, console_width, 2), int)
matrix[results_discrete[0] ][0][0] = 1 # draw 1st column matrix[results_discrete[0]][0][0] = 1 # draw 1st column
matrix[ep_lengths_discrete[0]][0][1] = 1 # draw 1st column matrix[ep_lengths_discrete[0]][0][1] = 1 # draw 1st column
rng = [[results_discrete[0], results_discrete[0]], [ep_lengths_discrete[0], ep_lengths_discrete[0]]] rng = [[results_discrete[0], results_discrete[0]], [ep_lengths_discrete[0], ep_lengths_discrete[0]]]
# Create continuous line for both plots # Create continuous line for both plots
for k in range(2): for k in range(2):
for i in range(1,console_width): for i in range(1, console_width):
x = [results_discrete, ep_lengths_discrete][k][i] x = [results_discrete, ep_lengths_discrete][k][i]
if x > rng[k][1]: if x > rng[k][1]:
rng[k] = [rng[k][1]+1, x] rng[k] = [rng[k][1] + 1, x]
elif x < rng[k][0]: elif x < rng[k][0]:
rng[k] = [x, rng[k][0]-1] rng[k] = [x, rng[k][0] - 1]
else: else:
rng[k] = [x,x] rng[k] = [x, x]
for j in range(rng[k][0],rng[k][1]+1): for j in range(rng[k][0], rng[k][1] + 1):
matrix[j][i][k] = 1 matrix[j][i][k] = 1
print(f'{"-"*console_width}') print(f'{"-" * console_width}')
for l in reversed(range(console_height)): for l in reversed(range(console_height)):
for c in range(console_width): for c in range(console_width):
if np.all(matrix[l][c] == 0): print(end=" ") if np.all(matrix[l][c] == 0):
elif np.all(matrix[l][c] == 1): print(end=symb_xo) print(end=" ")
elif matrix[l][c][0] == 1: print(end=symb_x) elif np.all(matrix[l][c] == 1):
else: print(end=symb_o) print(end=symb_xo)
elif matrix[l][c][0] == 1:
print(end=symb_x)
else:
print(end=symb_o)
print() print()
print(f'{"-"*console_width}') print(f'{"-" * console_width}')
print(f"({symb_x})-reward min:{results_limits[0]:11.2f} max:{results_limits[1]:11.2f}") print(f"({symb_x})-reward min:{results_limits[0]:11.2f} max:{results_limits[1]:11.2f}")
print(f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1]/1000:15.0f}k steps") print(
print(f'{"-"*console_width}') f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1] / 1000:15.0f}k steps")
print(f'{"-" * console_width}')
# save CSV # save CSV
if save_csv: if save_csv:
@@ -370,8 +385,7 @@ class Train_Base():
writer = csv.writer(f) writer = csv.writer(f)
if sample_no == 1: if sample_no == 1:
writer.writerow(["time_steps", "reward ep.", "length"]) writer.writerow(["time_steps", "reward ep.", "length"])
writer.writerow([time_steps[-1],results_raw[-1],ep_lengths_raw[-1]]) writer.writerow([time_steps[-1], results_raw[-1], ep_lengths_raw[-1]])
# def generate_slot_behavior(self, path, slots, auto_head:bool, XML_name): # def generate_slot_behavior(self, path, slots, auto_head:bool, XML_name):
# ''' # '''
@@ -394,7 +408,7 @@ class Train_Base():
# xml_pretty = minidom.parseString(xml_rough).toprettyxml(indent=" ") # xml_pretty = minidom.parseString(xml_rough).toprettyxml(indent=" ")
# with open(file, "w") as x: # with open(file, "w") as x:
# x.write(xml_pretty) # x.write(xml_pretty)
# print(file, "was created!") # print(file, "was created!")
# @staticmethod # @staticmethod
@@ -406,7 +420,7 @@ class Train_Base():
# ---------- # ----------
# initial_value : float # initial_value : float
# Initial learning rate # Initial learning rate
# Returns # Returns
# ------- # -------
# schedule : Callable[[float], float] # schedule : Callable[[float], float]
@@ -420,7 +434,7 @@ class Train_Base():
# ---------- # ----------
# progress_remaining : float # progress_remaining : float
# Progress will decrease from 1 (beginning) to 0 # Progress will decrease from 1 (beginning) to 0
# Returns # Returns
# ------- # -------
# learning_rate : float # learning_rate : float
@@ -452,28 +466,28 @@ class Train_Base():
if not os.path.isfile(f): if not os.path.isfile(f):
output_file = f output_file = f
break break
model = PPO.load(input_file)
weights = model.policy.state_dict() # dictionary containing network layers
w = lambda name : weights[name].detach().cpu().numpy() # extract weights from policy model = PPO.load(input_file)
weights = model.policy.state_dict() # dictionary containing network layers
w = lambda name: weights[name].detach().cpu().numpy() # extract weights from policy
var_list = [] var_list = []
for i in count(0,2): # add hidden layers (step=2 because that's how SB3 works) for i in count(0, 2): # add hidden layers (step=2 because that's how SB3 works)
if f"mlp_extractor.policy_net.{i}.bias" not in weights: if f"mlp_extractor.policy_net.{i}.bias" not in weights:
break break
var_list.append([w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"]) var_list.append(
[w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"])
var_list.append( [w("action_net.bias"), w("action_net.weight"), "none"] ) # add final layer var_list.append([w("action_net.bias"), w("action_net.weight"), "none"]) # add final layer
with open(output_file,"wb") as f: with open(output_file, "wb") as f:
pickle.dump(var_list, f, protocol=4) # protocol 4 is backward compatible with Python 3.4 pickle.dump(var_list, f, protocol=4) # protocol 4 is backward compatible with Python 3.4
def print_list(data, numbering=True, prompt=None, divider=" | ", alignment="<", min_per_col=6): def print_list(data, numbering=True, prompt=None, divider=" | ", alignment="<", min_per_col=6):
''' '''
Print list - prints list, using as many columns as possible Print list - prints list, using as many columns as possible
Parameters Parameters
---------- ----------
data : `list` data : `list`
@@ -488,7 +502,7 @@ class Train_Base():
f-string style alignment ( '<', '>', '^' ) f-string style alignment ( '<', '>', '^' )
min_per_col : int min_per_col : int
avoid splitting columns with fewer items avoid splitting columns with fewer items
Returns Returns
------- -------
item : `int`, item item : `int`, item
@@ -496,65 +510,67 @@ class Train_Base():
or `None` (if `numbering` or `prompt` are `None`) or `None` (if `numbering` or `prompt` are `None`)
''' '''
WIDTH = shutil.get_terminal_size()[0] WIDTH = shutil.get_terminal_size()[0]
data_size = len(data) data_size = len(data)
items = [] items = []
items_len = [] items_len = []
#--------------------------------------------- Add numbers, margins and divider # --------------------------------------------- Add numbers, margins and divider
for i in range(data_size): for i in range(data_size):
number = f"{i}-" if numbering else "" number = f"{i}-" if numbering else ""
items.append( f"{divider}{number}{data[i]}" ) items.append(f"{divider}{number}{data[i]}")
items_len.append( len(items[-1]) ) items_len.append(len(items[-1]))
max_cols = np.clip((WIDTH+len(divider)) // min(items_len),1,math.ceil(data_size/max(min_per_col,1))) # width + len(divider) because it is not needed in last col max_cols = np.clip((WIDTH + len(divider)) // min(items_len), 1, math.ceil(
data_size / max(min_per_col, 1))) # width + len(divider) because it is not needed in last col
#--------------------------------------------- Check maximum number of columns, considering content width (min:1) # --------------------------------------------- Check maximum number of columns, considering content width (min:1)
for i in range(max_cols,0,-1): for i in range(max_cols, 0, -1):
cols_width = [] cols_width = []
cols_items = [] cols_items = []
table_width = 0 table_width = 0
a,b = divmod(data_size,i) a, b = divmod(data_size, i)
for col in range(i): for col in range(i):
start = a*col + min(b,col) start = a * col + min(b, col)
end = start+a+(1 if col<b else 0) end = start + a + (1 if col < b else 0)
cols_items.append( items[start:end] ) cols_items.append(items[start:end])
col_width = max(items_len[start:end]) col_width = max(items_len[start:end])
cols_width.append( col_width ) cols_width.append(col_width)
table_width += col_width table_width += col_width
if table_width <= WIDTH+len(divider): if table_width <= WIDTH + len(divider):
break break
table_width -= len(divider) table_width -= len(divider)
#--------------------------------------------- Print columns # --------------------------------------------- Print columns
print("="*table_width) print("=" * table_width)
for row in range(math.ceil(data_size / i)): for row in range(math.ceil(data_size / i)):
for col in range(i): for col in range(i):
content = cols_items[col][row] if len(cols_items[col]) > row else divider # print divider when there are no items content = cols_items[col][row] if len(
cols_items[col]) > row else divider # print divider when there are no items
if col == 0: if col == 0:
l = len(divider) l = len(divider)
print(end=f"{content[l:]:{alignment}{cols_width[col]-l}}") # remove divider from 1st col print(end=f"{content[l:]:{alignment}{cols_width[col] - l}}") # remove divider from 1st col
else: else:
print(end=f"{content :{alignment}{cols_width[col] }}") print(end=f"{content :{alignment}{cols_width[col]}}")
print() print()
print("="*table_width) print("=" * table_width)
#--------------------------------------------- Prompt # --------------------------------------------- Prompt
if prompt is None: if prompt is None:
return None return None
if numbering is None: if numbering is None:
return None return None
else: else:
idx = UI.read_int( prompt, 0, data_size ) idx = UI.read_int(prompt, 0, data_size)
return idx, data[idx] return idx, data[idx]
class Cyclic_Callback(BaseCallback): class Cyclic_Callback(BaseCallback):
''' Stable baselines custom callback ''' ''' Stable baselines custom callback '''
def __init__(self, freq, function): def __init__(self, freq, function):
super(Cyclic_Callback, self).__init__(1) super(Cyclic_Callback, self).__init__(1)
self.freq = freq self.freq = freq
@@ -563,10 +579,12 @@ class Cyclic_Callback(BaseCallback):
def _on_step(self) -> bool: def _on_step(self) -> bool:
if self.n_calls % self.freq == 0: if self.n_calls % self.freq == 0:
self.function() self.function()
return True # If the callback returns False, training is aborted early return True # If the callback returns False, training is aborted early
class Export_Callback(BaseCallback): class Export_Callback(BaseCallback):
''' Stable baselines custom callback ''' ''' Stable baselines custom callback '''
def __init__(self, freq, load_path, export_name): def __init__(self, freq, load_path, export_name):
super(Export_Callback, self).__init__(1) super(Export_Callback, self).__init__(1)
self.freq = freq self.freq = freq
@@ -577,8 +595,7 @@ class Export_Callback(BaseCallback):
if self.n_calls % self.freq == 0: if self.n_calls % self.freq == 0:
path = os.path.join(self.load_path, f"model_{self.num_timesteps}_steps.zip") path = os.path.join(self.load_path, f"model_{self.num_timesteps}_steps.zip")
Train_Base.export_model(path, f"./scripts/gyms/export/{self.export_name}") Train_Base.export_model(path, f"./scripts/gyms/export/{self.export_name}")
return True # If the callback returns False, training is aborted early return True # If the callback returns False, training is aborted early

592
scripts/gyms/Walk.py Normal file → Executable file
View File

@@ -1,13 +1,14 @@
import os import os
import numpy as np import numpy as np
import math import math
from time import sleep import time
from time import sleep
from random import random from random import random
from random import uniform from random import uniform
from stable_baselines3 import PPO from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
import gymnasium as gym import gymnasium as gym
from gymnasium import spaces from gymnasium import spaces
@@ -28,56 +29,65 @@ Learn how to run forward using step primitive
- class Train: implements algorithms to train a new model or test an existing model - class Train: implements algorithms to train a new model or test an existing model
''' '''
class WalkEnv(gym.Env): class WalkEnv(gym.Env):
def __init__(self, ip, server_p) -> None: def __init__(self, ip, server_p) -> None:
# Args: Server IP, Agent Port, Monitor Port, Uniform No., Robot Type, Team Name, Enable Log, Enable Draw # Args: Server IP, Agent Port, Monitor Port, Uniform No., Robot Type, Team Name, Enable Log, Enable Draw
self.Player = player = Base_Agent( self.Player = player = Base_Agent(
team_name="Gym", team_name="Gym",
number=1, number=1,
host=ip, host=ip,
port=server_p port=server_p
) )
self.robot_type = self.Player.robot self.robot_type = self.Player.robot
self.step_counter = 0 # to limit episode size self.step_counter = 0 # to limit episode size
self.force_play_on = True self.force_play_on = True
self.target_position = np.array([0.0, 0.0]) # target position in the x-y plane self.target_position = np.array([0.0, 0.0]) # target position in the x-y plane
self.initial_position = np.array([0.0, 0.0]) # initial position in the x-y plane self.initial_position = np.array([0.0, 0.0]) # initial position in the x-y plane
self.target_direction = 0.0 # target direction in the x-y plane (relative to the robot's orientation) self.target_direction = 0.0 # target direction in the x-y plane (relative to the robot's orientation)
self.isfallen = False self.isfallen = False
self.waypoint_index = 0 self.waypoint_index = 0
self.route_completed = False self.route_completed = False
self.debug_every_n_steps = 5 self.debug_every_n_steps = 5
self.enable_debug_joint_status = False
self.calibrate_nominal_from_neutral = True self.calibrate_nominal_from_neutral = True
self.auto_calibrate_train_sim_flip = True self.auto_calibrate_train_sim_flip = True
self.nominal_calibrated_once = False self.nominal_calibrated_once = False
self.flip_calibrated_once = False self.flip_calibrated_once = False
self._target_hz = 0.0
self._target_dt = 0.0
self._last_sync_time = None
target_hz_env = 0
if target_hz_env:
try:
self._target_hz = float(target_hz_env)
except ValueError:
self._target_hz = 0.0
if self._target_hz > 0.0:
self._target_dt = 1.0 / self._target_hz
# State space # State space
# 原始观测大小: 78 # 原始观测大小: 78
obs_size = 78 obs_size = 78
self.obs = np.zeros(obs_size, np.float32) self.obs = np.zeros(obs_size, np.float32)
self.observation_space = spaces.Box( self.observation_space = spaces.Box(
low=-10.0, low=-10.0,
high=10.0, high=10.0,
shape=(obs_size,), shape=(obs_size,),
dtype=np.float32 dtype=np.float32
) )
action_dim = len(self.Player.robot.ROBOT_MOTORS) action_dim = len(self.Player.robot.ROBOT_MOTORS)
self.no_of_actions = action_dim self.no_of_actions = action_dim
self.action_space = spaces.Box( self.action_space = spaces.Box(
low=-1.0, low=-10.0,
high=1.0, high=10.0,
shape=(action_dim,), shape=(action_dim,),
dtype=np.float32 dtype=np.float32
) )
# 中立姿态 # 中立姿态
self.joint_nominal_position = np.array( self.joint_nominal_position = np.array(
[ [
@@ -106,26 +116,25 @@ class WalkEnv(gym.Env):
0.0, 0.0,
] ]
) )
self.reference_joint_nominal_position = self.joint_nominal_position.copy() self.joint_nominal_position = np.zeros(self.no_of_actions)
self.train_sim_flip = np.array( self.train_sim_flip = np.array(
[ [
1.0, # 0: Head_yaw (he1) 1.0, # 0: Head_yaw (he1)
-1.0, # 1: Head_pitch (he2) -1.0, # 1: Head_pitch (he2)
1.0, # 2: Left_Shoulder_Pitch (lae1) 1.0, # 2: Left_Shoulder_Pitch (lae1)
-1.0, # 3: Left_Shoulder_Roll (lae2) -1.0, # 3: Left_Shoulder_Roll (lae2)
1.0, # 4: Left_Elbow_Pitch (lae3) -1.0, # 4: Left_Elbow_Pitch (lae3)
1.0, # 5: Left_Elbow_Yaw (lae4) 1.0, # 5: Left_Elbow_Yaw (lae4)
-1.0, # 6: Right_Shoulder_Pitch (rae1) -1.0, # 6: Right_Shoulder_Pitch (rae1)
1.0, # 7: Right_Shoulder_Roll (rae2) -1.0, # 7: Right_Shoulder_Roll (rae2)
1.0, # 8: Right_Elbow_Pitch (rae3) 1.0, # 8: Right_Elbow_Pitch (rae3)
1.0, # 9: Right_Elbow_Yaw (rae4) 1.0, # 9: Right_Elbow_Yaw (rae4)
1.0, # 10: Waist (te1) 1.0, # 10: Waist (te1)
1.0, # 11: Left_Hip_Pitch (lle1) 1.0, # 11: Left_Hip_Pitch (lle1)
-1.0, # 12: Left_Hip_Roll (lle2) -1.0, # 12: Left_Hip_Roll (lle2)
-1.0, # 13: Left_Hip_Yaw (lle3) -1.0, # 13: Left_Hip_Yaw (lle3)
1.0, # 14: Left_Knee_Pitch (lle4) 1.0, # 14: Left_Knee_Pitch (lle4)
1.0, # 15: Left_Ankle_Pitch (lle5) 1.0, # 15: Left_Ankle_Pitch (lle5)
-1.0, # 16: Left_Ankle_Roll (lle6) -1.0, # 16: Left_Ankle_Roll (lle6)
-1.0, # 17: Right_Hip_Pitch (rle1) -1.0, # 17: Right_Hip_Pitch (rle1)
-1.0, # 18: Right_Hip_Roll (rle2) -1.0, # 18: Right_Hip_Roll (rle2)
@@ -136,15 +145,51 @@ class WalkEnv(gym.Env):
] ]
) )
self.scaling_factor = 0.5 self.scaling_factor = 0.3
# self.scaling_factor = 1
# Small reset perturbations for robustness training.
self.enable_reset_perturb = True
self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait
self.reset_joint_noise_rad = 0.015
self.reset_perturb_steps = 3
self.reset_recover_steps = 8
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.previous_pos = np.array([0.0, 0.0]) # Track previous position self.previous_pos = np.array([0.0, 0.0]) # Track previous position
self.Player.server.connect() self.Player.server.connect()
sleep(2.0) # Longer wait for connection to establish completely # sleep(2.0) # Longer wait for connection to establish completely
self.Player.server.send_immediate( self.Player.server.send_immediate(
f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})" f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})"
) )
self.start_time = time.time()
def _reconnect_server(self):
try:
self.Player.server.shutdown()
except Exception:
pass
self.Player.server.connect()
self.Player.server.send_immediate(
f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})"
)
def _safe_receive_world_update(self, retries=1):
last_exc = None
for attempt in range(retries + 1):
try:
self.Player.server.receive()
self.Player.world.update()
return
except (ConnectionResetError, OSError) as exc:
last_exc = exc
if attempt >= retries:
raise
self._reconnect_server()
if last_exc is not None:
raise last_exc
def debug_log(self, message): def debug_log(self, message):
print(message) print(message)
@@ -155,70 +200,30 @@ class WalkEnv(gym.Env):
except OSError: except OSError:
pass pass
def calibrate_train_sim_flip_from_neutral(self, neutral_joint_positions):
updated_flip = self.train_sim_flip.copy()
changed = []
for idx, (reference_value, observed_value) in enumerate(
zip(self.reference_joint_nominal_position, neutral_joint_positions)
):
if idx >= 10:
continue
if abs(reference_value) < 0.15 or abs(observed_value) < 0.15:
continue
inferred_flip = 1.0 if np.sign(reference_value) == np.sign(observed_value) else -1.0
if updated_flip[idx] != inferred_flip:
changed.append((idx, updated_flip[idx], inferred_flip))
updated_flip[idx] = inferred_flip
self.train_sim_flip = updated_flip
if changed:
self.debug_log(
"[FlipDebug] "
f"changes={[(idx, old, new) for idx, old, new in changed]}"
)
def is_reliable_neutral_pose(self, neutral_joint_positions):
leg_positions = neutral_joint_positions[11:]
leg_norm = float(np.linalg.norm(leg_positions))
leg_max = float(np.max(np.abs(leg_positions)))
height = float(self.Player.world.global_position[2])
reliable = (
leg_norm > 0.8
and leg_max > 0.35
and 0.12 < height < 0.8
)
return reliable, leg_norm, leg_max, height
def observe(self, init=False): def observe(self, init=False):
"""获取当前观测值""" """获取当前观测值"""
robot = self.Player.robot robot = self.Player.robot
world = self.Player.world world = self.Player.world
# Safety check: ensure data is available # Safety check: ensure data is available
# 计算目标速度 # 计算目标速度
raw_target = self.target_position - world.global_position[:2] raw_target = self.target_position - world.global_position[:2]
velocity = MathOps.rotate_2d_vec( velocity = MathOps.rotate_2d_vec(
raw_target, raw_target,
-robot.global_orientation_euler[2], -robot.global_orientation_euler[2],
is_rad=False is_rad=False
) )
# 计算相对方向 # 计算相对方向
rel_orientation = MathOps.vector_angle(velocity) * 0.3 rel_orientation = MathOps.vector_angle(velocity) * 0.3
rel_orientation = np.clip(rel_orientation, -0.25, 0.25) rel_orientation = np.clip(rel_orientation, -0.25, 0.25)
velocity = np.concatenate([velocity, np.array([rel_orientation])]) velocity = np.concatenate([velocity, np.array([rel_orientation])])
velocity[0] = np.clip(velocity[0], -0.5, 0.5) velocity[0] = np.clip(velocity[0], -0.5, 0.5)
velocity[1] = np.clip(velocity[1], -0.25, 0.25) velocity[1] = np.clip(velocity[1], -0.25, 0.25)
# 关节状态 # 关节状态
radian_joint_positions = np.deg2rad( radian_joint_positions = np.deg2rad(
[robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS] [robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS]
@@ -226,21 +231,20 @@ class WalkEnv(gym.Env):
radian_joint_speeds = np.deg2rad( radian_joint_speeds = np.deg2rad(
[robot.motor_speeds[motor] for motor in robot.ROBOT_MOTORS] [robot.motor_speeds[motor] for motor in robot.ROBOT_MOTORS]
) )
qpos_qvel_previous_action = np.concatenate([ qpos_qvel_previous_action = np.concatenate([
(radian_joint_positions * self.train_sim_flip - self.joint_nominal_position) / 4.6, (radian_joint_positions * self.train_sim_flip - self.joint_nominal_position) / 4.6,
radian_joint_speeds / 110.0 * self.train_sim_flip, radian_joint_speeds / 110.0 * self.train_sim_flip,
self.previous_action / 10.0, self.previous_action / 10.0,
]) ])
# 角速度 # 角速度
ang_vel = np.clip(np.deg2rad(robot.gyroscope) / 50.0, -1.0, 1.0) ang_vel = np.clip(np.deg2rad(robot.gyroscope) / 50.0, -1.0, 1.0)
# 投影的重力方向 # 投影的重力方向
orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv() orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv()
projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0])) projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
# 组合观测 # 组合观测
observation = np.concatenate([ observation = np.concatenate([
qpos_qvel_previous_action, qpos_qvel_previous_action,
@@ -249,17 +253,25 @@ class WalkEnv(gym.Env):
projected_gravity, projected_gravity,
]) ])
observation = np.clip(observation, -10.0, 10.0) observation = np.clip(observation, -10.0, 10.0)
return observation.astype(np.float32) return observation.astype(np.float32)
def sync(self): def sync(self):
''' Run a single simulation step ''' ''' Run a single simulation step '''
self.Player.server.receive() self._safe_receive_world_update(retries=1)
self.Player.world.update()
self.Player.robot.commit_motor_targets_pd() self.Player.robot.commit_motor_targets_pd()
self.Player.server.send() self.Player.server.send()
if self._target_dt > 0.0:
now = time.time()
if self._last_sync_time is None:
self._last_sync_time = now
return
elapsed = now - self._last_sync_time
remaining = self._target_dt - elapsed
if remaining > 0.0:
time.sleep(remaining)
now = time.time()
self._last_sync_time = now
def debug_joint_status(self): def debug_joint_status(self):
robot = self.Player.robot robot = self.Player.robot
@@ -284,6 +296,7 @@ class WalkEnv(gym.Env):
f"err_norm={float(np.linalg.norm(joint_error)):.4f} " f"err_norm={float(np.linalg.norm(joint_error)):.4f} "
f"fallen={self.Player.world.global_position[2] < 0.3}" f"fallen={self.Player.world.global_position[2] < 0.3}"
) )
print(f"waist target={target_joint_positions[10]:.3f}, actual={actual_joint_positions[10]:.3f}")
def reset(self, seed=None, options=None): def reset(self, seed=None, options=None):
''' '''
@@ -295,83 +308,82 @@ class WalkEnv(gym.Env):
if seed is not None: if seed is not None:
np.random.seed(seed) np.random.seed(seed)
length1 = np.random.uniform(10, 20) # randomize target distance length1 = 2 # randomize target distance
length2 = np.random.uniform(10, 20) # randomize target distance length2 = np.random.uniform(0.6, 1) # randomize target distance
length3 = np.random.uniform(10, 20) # randomize target distance length3 = np.random.uniform(0.6, 1) # randomize target distance
angle2 = np.random.uniform(-30, 30) # randomize initial orientation angle2 = np.random.uniform(-30, 30) # randomize initial orientation
angle3 = np.random.uniform(-30, 30) # randomize target direction angle3 = np.random.uniform(-30, 30) # randomize target direction
self.step_counter = 0 self.step_counter = 0
self.waypoint_index = 0 self.waypoint_index = 0
self.route_completed = False self.route_completed = False
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
self.walk_cycle_step = 0 self.walk_cycle_step = 0
# 随机 beam 目标位置和朝向,增加训练多样性 # 随机 beam 目标位置和朝向,增加训练多样性
beam_x = (random() - 0.5) * 10 beam_x = (random() - 0.5) * 10
beam_y = (random() - 0.5) * 10 beam_y = (random() - 0.5) * 10
beam_yaw = uniform(-self.reset_beam_yaw_range_deg, self.reset_beam_yaw_range_deg)
for _ in range(5): for _ in range(5):
self.Player.server.receive() self._safe_receive_world_update(retries=2)
self.Player.world.update()
self.Player.robot.commit_motor_targets_pd() self.Player.robot.commit_motor_targets_pd()
self.Player.server.commit_beam(pos2d=(beam_x, beam_y), rotation=0) self.Player.server.commit_beam(pos2d=(beam_x, beam_y), rotation=beam_yaw)
self.Player.server.send() self.Player.server.send()
# 执行 Neutral 技能直到完成,给机器人足够时间在 beam 位置稳定站立 # 执行 Neutral 技能直到完成,给机器人足够时间在 beam 位置稳定站立
finished_count = 0 finished_count = 0
for _ in range(20): for _ in range(50):
finished = self.Player.skills_manager.execute("Neutral") finished = self.Player.skills_manager.execute("Neutral")
self.sync() self.sync()
if finished: if finished:
finished_count += 1 finished_count += 1
if finished_count >= 2: # 假设需要连续2次完成才算成功 if finished_count >= 20: # 假设需要连续20次完成才算成功
break break
# neutral_joint_positions = np.deg2rad( if self.enable_reset_perturb and self.reset_joint_noise_rad > 0.0:
# [self.Player.robot.motor_positions[motor] for motor in self.Player.robot.ROBOT_MOTORS] perturb_action = np.zeros(self.no_of_actions, dtype=np.float32)
# ) # Perturb waist + lower body only (10:), keep head/arms stable.
# reliable_neutral, neutral_leg_norm, neutral_leg_max, neutral_height = self.is_reliable_neutral_pose(neutral_joint_positions) perturb_action[10:] = np.random.uniform(
-self.reset_joint_noise_rad,
self.reset_joint_noise_rad,
size=(self.no_of_actions - 10,)
)
# if self.auto_calibrate_train_sim_flip and reliable_neutral and not self.flip_calibrated_once: for _ in range(self.reset_perturb_steps):
# self.calibrate_train_sim_flip_from_neutral(neutral_joint_positions) target_joint_positions = (self.joint_nominal_position + perturb_action) * self.train_sim_flip
# self.flip_calibrated_once = True for idx, target in enumerate(target_joint_positions):
# if self.calibrate_nominal_from_neutral and reliable_neutral and not self.nominal_calibrated_once: r.set_motor_target_position(
# self.joint_nominal_position = neutral_joint_positions * self.train_sim_flip r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=25, kd=0.6
# self.nominal_calibrated_once = True )
# self.debug_log( self.sync()
# "[ResetDebug] "
# f"neutral_pos={np.round(self.Player.world.global_position, 3).tolist()} "
# f"shoulders={np.round(neutral_joint_positions[2:10], 3).tolist()} "
# f"legs={np.round(neutral_joint_positions[11:], 3).tolist()} "
# f"flip={self.train_sim_flip.tolist()} "
# f"nominal_legs={np.round(self.joint_nominal_position[11:], 3).tolist()} "
# f"calibrated_once={(self.flip_calibrated_once, self.nominal_calibrated_once)} "
# f"reliable_neutral={reliable_neutral} "
# f"leg_norm={neutral_leg_norm:.3f} leg_max={neutral_leg_max:.3f} height={neutral_height:.3f}"
# )
# reset_action_noise = np.random.uniform(-0.015, 0.015, size=(len(self.Player.robot.ROBOT_MOTORS),))
# self.target_joint_positions = (self.joint_nominal_position + reset_action_noise) * self.train_sim_flip
# for idx, target in enumerate(self.target_joint_positions):
# r.set_motor_target_position(
# r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6
# )
for i in range(self.reset_recover_steps):
# Linearly fade perturbation to help policy start from near-neutral.
alpha = 1.0 - float(i + 1) / float(self.reset_recover_steps)
target_joint_positions = (self.joint_nominal_position + alpha * perturb_action) * self.train_sim_flip
for idx, target in enumerate(target_joint_positions):
r.set_motor_target_position(
r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=25, kd=0.6
)
self.sync()
# memory variables # memory variables
self.sync()
self.initial_position = np.array(self.Player.world.global_position[:2]) self.initial_position = np.array(self.Player.world.global_position[:2])
self.previous_pos = self.initial_position.copy() # Critical: set to actual position self.previous_pos = self.initial_position.copy() # Critical: set to actual position
self.act = np.zeros(self.no_of_actions ,np.float32) self.act = np.zeros(self.no_of_actions, np.float32)
point1 = self.initial_position + np.array([length1, 0]) # Build target in the robot's current forward direction instead of fixed global +x.
heading_deg = float(r.global_orientation_euler[2])
forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False)
point1 = self.initial_position + forward_offset
point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False) point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False)
point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False) point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False)
self.point_list = [point1, point2, point3] self.point_list = [point1]
self.target_position = self.point_list[self.waypoint_index] self.target_position = self.point_list[self.waypoint_index]
self.initial_height = self.Player.world.global_position[2]
return self.observe(True), {} return self.observe(True), {}
@@ -379,101 +391,136 @@ class WalkEnv(gym.Env):
return return
def compute_reward(self, previous_pos, current_pos, action): def compute_reward(self, previous_pos, current_pos, action):
velocity = current_pos - previous_pos height = float(self.Player.world.global_position[2])
velocity_magnitude = np.linalg.norm(velocity)
direction_to_target = self.target_position - current_pos
prev_direction_to_target = self.target_position - previous_pos
distance_to_target = np.linalg.norm(direction_to_target)
prev_distance_to_target = np.linalg.norm(prev_direction_to_target)
progress_reward = np.clip((prev_distance_to_target - distance_to_target) * 30.0, -2.0, 4.0) orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
ang_vel = np.deg2rad(self.Player.robot.gyroscope)
ang_vel_mag = float(np.linalg.norm(ang_vel))
velocity_in_m_per_sec = velocity_magnitude / 0.05 is_fallen = height < 0.3
speed_reward = np.clip(velocity_in_m_per_sec * 1.5, 0.0, 1.5) if is_fallen:
# remain = max(0, 800 - self.step_counter)
# return -8.0 - 0.01 * remain
return -1.0
if velocity_magnitude > 1e-4 and distance_to_target > 1e-4:
directional_alignment = np.dot(velocity, direction_to_target) / (velocity_magnitude * distance_to_target)
directional_alignment = np.clip(directional_alignment, -1.0, 1.0)
direction_reward = max(0.0, directional_alignment)
else:
direction_reward = 0.0
alive_bonus = 0.05 # # 目标方向
# to_target = self.target_position - current_pos
# dist_to_target = float(np.linalg.norm(to_target))
# if dist_to_target < 0.5:
# return 15.0
height = self.Player.world.global_position[2] # forward_dir = to_target / dist_to_target if dist_to_target > 0.1 else np.array([1.0, 0.0])
if 0.45 <= height <= 1.2: # delta_pos = current_pos - previous_pos
height_reward = 1.5 # forward_step = float(np.dot(delta_pos, forward_dir))
else: # lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
height_reward = -6.0
motionless_penalty = -1.5 if velocity_magnitude < 0.003 else 0.0 # 奖励项
# progress_reward = 2 * forward_step
# lateral_penalty = -0.1 * lateral_step
alive_bonus = 2.0
waypoint_bonus = 0.0 # action_penalty = -0.01 * float(np.linalg.norm(action))
if distance_to_target < 0.5: smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward))
waypoint_bonus = 25.0
if self.waypoint_index < len(self.point_list) - 1: posture_penalty = -0.3 * (tilt_mag)
self.waypoint_index += 1 ang_vel_penalty = -0.02 * ang_vel_mag
self.target_position = self.point_list[self.waypoint_index]
else: target_height = self.initial_height
waypoint_bonus = 100.0 height_error = height - target_height
self.route_completed = True height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调
# # 在 compute_reward 开头附近,添加高度变化率计算
# if not hasattr(self, 'last_height'):
# self.last_height = height
# self.last_height_time = self.step_counter # 可选,用于时间间隔
# height_rate = height - self.last_height # 正为上升,负为下降
# self.last_height = height
# 惩罚高度下降(负变化率)
# height_down_penalty = -5.0 * max(0, -height_rate) # 系数可调,-height_rate 为正表示下降幅度
# # 在 compute_reward 中
# if self.step_counter > 50:
# avg_prev_action = np.mean(self.prev_action_history, axis=0)
# novelty = float(np.linalg.norm(action - avg_prev_action))
# exploration_bonus = 0.05 * novelty
# else:
# exploration_bonus = 0
# self.prev_action_history[self.history_idx] = action
# self.history_idx = (self.history_idx + 1) % 50
total = (
# progress_reward +
alive_bonus +
# lateral_penalty +
# action_penalty +
smoothness_penalty +
posture_penalty
+ ang_vel_penalty
+ height_penalty
# + exploration_bonus
# + height_down_penalty
)
if time.time() - self.start_time >= 1200:
self.start_time = time.time()
print(
# f"progress_reward:{progress_reward:.4f}",
# f"lateral_penalty:{lateral_penalty:.4f}",
# f"action_penalty:{action_penalty:.4f}"s,
f"height_penalty:{height_penalty:.4f}",
f"smoothness_penalty:{smoothness_penalty:.4f},",
f"posture_penalty:{posture_penalty:.4f}",
# f"ang_vel_penalty:{ang_vel_penalty:.4f}",
# f"height_down_penalty:{height_down_penalty:.4f}",
# f"exploration_bonus:{exploration_bonus:.4f}"
)
return total
action_magnitude = np.linalg.norm(action[11:])
action_penalty = -0.08 * action_magnitude
tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0
return (
progress_reward
+ speed_reward
+ direction_reward
+ alive_bonus
+ height_reward
+ motionless_penalty
+ waypoint_bonus
+ action_penalty
+ tilt_penalty
)
def step(self, action): def step(self, action):
r = self.Player.robot r = self.Player.robot
self.previous_action = action self.previous_action = action
self.target_joint_positions = ( self.target_joint_positions = (
self.joint_nominal_position # self.joint_nominal_position +
+ self.scaling_factor * action self.scaling_factor * action
) )
self.target_joint_positions *= self.train_sim_flip self.target_joint_positions *= self.train_sim_flip
for idx, target in enumerate(self.target_joint_positions): for idx, target in enumerate(self.target_joint_positions):
r.set_motor_target_position( r.set_motor_target_position(
r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6 r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=40, kd=1.0
) )
self.previous_action = action
self.sync() # run simulation step
self.sync() # run simulation step
self.step_counter += 1 self.step_counter += 1
# if self.step_counter % self.debug_every_n_steps == 0: if self.enable_debug_joint_status and self.step_counter % self.debug_every_n_steps == 0:
# self.debug_joint_status() self.debug_joint_status()
current_pos = np.array(self.Player.world.global_position[:2], dtype=np.float32) current_pos = np.array(self.Player.world.global_position[:2], dtype=np.float32)
# Compute reward based on movement from previous step # Compute reward based on movement from previous step
reward = self.compute_reward(self.previous_pos, current_pos, action) reward = self.compute_reward(self.previous_pos, current_pos, action)
# Update previous position # Update previous position
self.previous_pos = current_pos.copy() self.previous_pos = current_pos.copy()
self.last_action_for_reward = action.copy()
# Fall detection and penalty # Fall detection and penalty
is_fallen = self.Player.world.global_position[2] < 0.3 is_fallen = self.Player.world.global_position[2] < 0.3
# terminal state: the robot is falling or timeout # terminal state: the robot is falling or timeout
terminated = is_fallen or self.step_counter > 800 or self.route_completed terminated = is_fallen or self.step_counter > 800 or self.route_completed
truncated = False truncated = False
@@ -481,94 +528,119 @@ class WalkEnv(gym.Env):
return self.observe(), reward, terminated, truncated, {} return self.observe(), reward, terminated, truncated, {}
class Train(Train_Base): class Train(Train_Base):
def __init__(self, script) -> None: def __init__(self, script) -> None:
super().__init__(script) super().__init__(script)
def train(self, args): def train(self, args):
#--------------------------------------- Learning parameters # --------------------------------------- Learning parameters
n_envs = 8 # Reduced from 8 to decrease CPU/network pressure during init n_envs = int(os.environ.get("GYM_CPU_N_ENVS", "20"))
n_steps_per_env = 512 # RolloutBuffer is of size (n_steps_per_env * n_envs) if n_envs < 1:
minibatch_size = 64 # should be a factor of (n_steps_per_env * n_envs) raise ValueError("GYM_CPU_N_ENVS must be >= 1")
server_warmup_sec = float(os.environ.get("GYM_CPU_SERVER_WARMUP_SEC", "3.0"))
n_steps_per_env = int(os.environ.get("GYM_CPU_TRAIN_STEPS_PER_ENV", "256")) # RolloutBuffer is of size (n_steps_per_env * n_envs)
minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512")) # should be a factor of (n_steps_per_env * n_envs)
total_steps = 30000000 total_steps = 30000000
learning_rate = 3e-4 learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
folder_name = f'Walk_R{self.robot_type}' folder_name = f'Walk_R{self.robot_type}'
model_path = f'./scripts/gyms/logs/{folder_name}/' model_path = f'./scripts/gyms/logs/{folder_name}/'
print(f"Model path: {model_path}") print(f"Model path: {model_path}")
print(f"Using {n_envs} parallel environments") print(f"Using {n_envs} parallel environments")
#--------------------------------------- Run algorithm # --------------------------------------- Run algorithm
def init_env(i_env): def init_env(i_env, monitor=False):
def thunk(): def thunk():
return WalkEnv( self.ip , self.server_p + i_env) env = WalkEnv(self.ip, self.server_p + i_env)
if monitor:
env = Monitor(env)
return env
return thunk return thunk
servers = Train_Server( self.server_p, self.monitor_p_1000, n_envs+1 ) #include 1 extra server for testing server_log_dir = os.path.join(model_path, "server_logs")
os.makedirs(server_log_dir, exist_ok=True)
servers = Train_Server(self.server_p, self.monitor_p_1000, n_envs + 1, no_render=True, no_realtime=True) # include 1 extra server for testing
# Wait for servers to start # Wait for servers to start
print(f"Starting {n_envs+1} rcssservermj servers...") print(f"Starting {n_envs + 1} rcssservermj servers...")
if server_warmup_sec > 0:
print(f"Waiting {server_warmup_sec:.1f}s for server warmup...")
sleep(server_warmup_sec)
print("Servers started, creating environments...") print("Servers started, creating environments...")
env = SubprocVecEnv( [init_env(i) for i in range(n_envs)] ) env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)])
eval_env = SubprocVecEnv( [init_env(n_envs)] ) # Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
try: try:
# Custom policy network architecture # Custom policy network architecture
policy_kwargs = dict( policy_kwargs = dict(
net_arch=dict( net_arch=dict(
pi=[256, 256, 128], # Policy network: 3 layers pi=[512, 256, 128], # Policy network: 3 layers
vf=[256, 256, 128] # Value network: 3 layers vf=[512, 256, 128] # Value network: 3 layers
), ),
activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU, activation_fn=__import__('torch.nn', fromlist=['ELU']).ELU,
) )
if "model_file" in args: # retrain if "model_file" in args: # retrain
model = PPO.load( args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env, batch_size=minibatch_size, learning_rate=learning_rate ) model = PPO.load(args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env,
else: # train new model batch_size=minibatch_size, learning_rate=learning_rate)
model = PPO( else: # train new model
"MlpPolicy", model = PPO(
env=env, "MlpPolicy",
verbose=1, env=env,
n_steps=n_steps_per_env, verbose=1,
batch_size=minibatch_size, n_steps=n_steps_per_env,
learning_rate=learning_rate, batch_size=minibatch_size,
learning_rate=learning_rate,
device="cpu", device="cpu",
policy_kwargs=policy_kwargs, policy_kwargs=policy_kwargs,
ent_coef=0.01, # Entropy coefficient for exploration ent_coef=float(os.environ.get("GYM_CPU_TRAIN_ENT_COEF", "0.05")), # Entropy coefficient for exploration
clip_range=0.2, # PPO clipping parameter clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")), # PPO clipping parameter
gae_lambda=0.95, # GAE lambda gae_lambda=0.95, # GAE lambda
gamma=0.99 # Discount factor gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")), # Discount factor
target_kl=0.03,
n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")),
# tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
) )
model_path = self.learn_model( model, total_steps, model_path, eval_env=eval_env, eval_freq=n_steps_per_env*20, save_freq=n_steps_per_env*20, backup_env_file=__file__ ) model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20,
backup_env_file=__file__)
except KeyboardInterrupt: except KeyboardInterrupt:
sleep(1) # wait for child processes sleep(1) # wait for child processes
print("\nctrl+c pressed, aborting...\n") print("\nctrl+c pressed, aborting...\n")
servers.kill() servers.kill()
return return
env.close() env.close()
eval_env.close() eval_env.close()
servers.kill() servers.kill()
def test(self, args): def test(self, args):
# Uses different server and monitor ports # Uses different server and monitor ports
server = Train_Server( self.server_p-1, self.monitor_p, 1 ) server_log_dir = os.path.join(args["folder_dir"], "server_logs")
env = WalkEnv( self.ip, self.server_p-1 ) os.makedirs(server_log_dir, exist_ok=True)
model = PPO.load( args["model_file"], env=env ) test_no_render = os.environ.get("GYM_CPU_TEST_NO_RENDER", "0") == "1"
test_no_realtime = os.environ.get("GYM_CPU_TEST_NO_REALTIME", "0") == "1"
server = Train_Server(
self.server_p - 1,
self.monitor_p,
1,
no_render=test_no_render,
no_realtime=test_no_realtime,
)
env = WalkEnv(self.ip, self.server_p - 1)
model = PPO.load(args["model_file"], env=env)
try: try:
self.export_model( args["model_file"], args["model_file"]+".pkl", False ) # Export to pkl to create custom behavior self.export_model(args["model_file"], args["model_file"] + ".pkl",
self.test_model( model, env, log_path=args["folder_dir"], model_path=args["folder_dir"] ) False) # Export to pkl to create custom behavior
self.test_model(model, env, log_path=args["folder_dir"], model_path=args["folder_dir"])
except KeyboardInterrupt: except KeyboardInterrupt:
print() print()
@@ -578,18 +650,30 @@ class Train(Train_Base):
if __name__ == "__main__": if __name__ == "__main__":
from types import SimpleNamespace from types import SimpleNamespace
# 创建默认参数 # 创建默认参数
script_args = SimpleNamespace( script_args = SimpleNamespace(
args=SimpleNamespace( args=SimpleNamespace(
i='127.0.0.1', # Server IP i='127.0.0.1', # Server IP
p=3100, # Server port p=3100, # Server port
m=3200, # Monitor port m=3200, # Monitor port
r=0, # Robot type r=0, # Robot type
t='Gym', # Team name t='Gym', # Team name
u=1 # Uniform number u=1 # Uniform number
) )
) )
trainer = Train(script_args) trainer = Train(script_args)
trainer.train({})
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
if run_mode == "test":
test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip")
test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/")
trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
else:
retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
if retrain_model:
trainer.train({"model_file": retrain_model})
else:
trainer.train({})

Binary file not shown.

126
train.sh Executable file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env bash
set -euo pipefail
# ------------------------------
# 资源限制配置cgroup v2 + systemd-run
# ------------------------------
# 说明:
# 1) 这个脚本会把训练进程放进一个临时的 systemd scope 中,并施加 CPU/内存上限。
# 2) 仅限制“本次训练进程”,不会永久改系统配置。
# 3) 下面变量都支持“环境变量覆盖”,即你可以在命令前临时指定。
#
# CPU 核数基准(默认 20
# 例如你的机器按 20 核预算来算,可保持默认。
CORES="${CORES:-20}"
# CPU 占用百分比(默认 95
# 最终会与 CORES 相乘得到 CPUQuota。
# 例CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900%(约 19 核等效)
UTIL_PERCENT="${UTIL_PERCENT:-95}"
CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
# 内存上限(默认 28G
# 可改成 16G、24G 等,避免训练把系统内存吃满。
MEMORY_MAX="${MEMORY_MAX:-28G}"
# ------------------------------
# 训练运行参数(由 scripts/gyms/Walk.py 读取)
# ------------------------------
# 运行模式train 或 test
GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
# 并行环境数量:越大通常吞吐越高,但也更容易触发服务器连接不稳定。
# 建议从 8~12 起步,稳定后再升到 16/20。
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"
# 服务器预热时间(秒):
# 在批量拉起 rcssserver 后等待一段时间,再创建 SubprocVecEnv
# 可降低 ConnectionReset/EOFError 概率。
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC:-10}"
# 训练专用参数
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV:-256}"
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE:-512}"
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR:-1e-4}"
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF:-0.03}"
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE:-0.13}"
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA:-0.95}"
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS:-5}"
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL:-}"
# 测试专用参数
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL:-scripts/gyms/logs/Walk_R0_004/best_model.zip}"
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER:-scripts/gyms/logs/Walk_R0_004/}"
# 测试默认实时且显示画面:默认均为 0
# 设为 1 表示关闭对应能力
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER:-0}"
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME:-0}"
# Python 解释器选择策略:
# 1) 优先使用你手动传入的 PYTHON_BIN
# 2) 其次用当前激活 conda 环境CONDA_PREFIX/bin/python
# 3) 再回退到默认 mujoco 环境路径
# 4) 最后尝试系统 python / python3
DEFAULT_PYTHON="/home/solren/Downloads/Anaconda/envs/mujoco/bin/python"
CONDA_PYTHON="${CONDA_PREFIX:-}/bin/python"
# 安全保护:不要用 sudo 运行。
# 原因sudo 可能导致 conda 环境与用户会话环境不一致,
# 会引发 python 路径丢失、systemd --user 会话不可见等问题。
if [[ "${EUID}" -eq 0 ]]; then
echo "Do not run this script with sudo; run as your normal user in conda env 'mujoco'."
exit 1
fi
# 解析最终使用的 Python 可执行文件。
if [[ -n "${PYTHON_BIN:-}" ]]; then
PYTHON_EXEC="${PYTHON_BIN}"
elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PYTHON}" ]]; then
PYTHON_EXEC="${CONDA_PYTHON}"
elif [[ -x "${DEFAULT_PYTHON}" ]]; then
PYTHON_EXEC="${DEFAULT_PYTHON}"
elif command -v python >/dev/null 2>&1; then
PYTHON_EXEC="$(command -v python)"
elif command -v python3 >/dev/null 2>&1; then
PYTHON_EXEC="$(command -v python3)"
else
echo "No Python executable found. Set PYTHON_BIN=/abs/path/to/python and retry."
exit 1
fi
# 脚本所在目录(绝对路径),便于后续定位模块/相对路径。
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# 打印当前生效配置,方便排障和复现实验。
echo "Starting training with limits: CPU=${CPU_QUOTA}, Memory=${MEMORY_MAX}"
echo "Mode: ${GYM_CPU_MODE}"
echo "Runtime knobs: GYM_CPU_N_ENVS=${GYM_CPU_N_ENVS}, GYM_CPU_SERVER_WARMUP_SEC=${GYM_CPU_SERVER_WARMUP_SEC}"
echo "Using Python: ${PYTHON_EXEC}"
if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
fi
# 使用 systemd-run --user --scope 启动“受限资源”的训练进程:
# - CPUQuota: 总 CPU 配额
# - MemoryMax: 最大内存
# - env ... : 显式传递训练参数到 Python 进程
# - python -m scripts.gyms.Walk: 以模块方式启动训练入口
systemd-run --user --scope \
-p CPUQuota="${CPU_QUOTA}" \
-p MemoryMax="${MEMORY_MAX}" \
env \
GYM_CPU_MODE="${GYM_CPU_MODE}" \
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC}" \
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV}" \
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE}" \
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR}" \
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF}" \
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE}" \
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA}" \
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS}" \
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL}" \
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL}" \
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER}" \
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER}" \
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME}" \
"${PYTHON_EXEC}" "-m" "scripts.gyms.Walk"

View File

@@ -47,6 +47,7 @@ class World:
self.their_team_players: list[OtherRobot] = [OtherRobot(is_teammate=False) for _ in self.their_team_players: list[OtherRobot] = [OtherRobot(is_teammate=False) for _ in
range(self.MAX_PLAYERS_PER_TEAM)] range(self.MAX_PLAYERS_PER_TEAM)]
self.field: Field = self.__initialize_field(field_name=field_name) self.field: Field = self.__initialize_field(field_name=field_name)
self.WORLD_STEPTIME: float = 0.005 # Time step of the world in seconds
def update(self) -> None: def update(self) -> None:
""" """