diff --git a/communication/server.py b/communication/server.py index 03eab86..c956d0a 100644 --- a/communication/server.py +++ b/communication/server.py @@ -72,37 +72,37 @@ class Server: self.commit(msg) self.send() - def receive(self) -> None: - """ - Receive the next message from the TCP/IP socket and updates world - """ + def receive(self): - # Receive message length information - if ( - self.__socket.recv_into( - self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL + while True: + + if ( + self.__socket.recv_into( + self.__rcv_buffer, nbytes=4, flags=socket.MSG_WAITALL + ) != 4 + ): + raise ConnectionResetError + + msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False) + + if msg_size > self.__rcv_buffer_size: + self.__rcv_buffer_size = msg_size + self.__rcv_buffer = bytearray(self.__rcv_buffer_size) + + if ( + self.__socket.recv_into( + self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL + ) != msg_size + ): + raise ConnectionResetError + + self.world_parser.parse( + message=self.__rcv_buffer[:msg_size].decode() ) - != 4 - ): - raise ConnectionResetError - msg_size = int.from_bytes(self.__rcv_buffer[:4], byteorder="big", signed=False) - - # Ensure receive buffer is large enough to hold the message - if msg_size > self.__rcv_buffer_size: - self.__rcv_buffer_size = msg_size - self.__rcv_buffer = bytearray(self.__rcv_buffer_size) - - # Receive message with the specified length - if ( - self.__socket.recv_into( - self.__rcv_buffer, nbytes=msg_size, flags=socket.MSG_WAITALL - ) - != msg_size - ): - raise ConnectionResetError - - self.world_parser.parse(message=self.__rcv_buffer[:msg_size].decode()) + # 如果socket没有更多数据就退出 + if len(select([self.__socket], [], [], 0.0)[0]) == 0: + break def commit_beam(self, pos2d: list, rotation: float) -> None: assert len(pos2d) == 2 diff --git a/scripts/commons/Server.py b/scripts/commons/Server.py index 17070e6..f238432 100644 --- a/scripts/commons/Server.py +++ b/scripts/commons/Server.py @@ -9,7 +9,7 @@ class Server(): self.check_running_servers(psutil, first_server_p, first_monitor_p, n_servers) except ModuleNotFoundError: print("Info: Cannot check if the server is already running, because the psutil module was not found") - + self.first_server_p = first_server_p self.n_servers = n_servers self.rcss_processes = [] @@ -18,29 +18,38 @@ class Server(): # makes it easier to kill test servers without affecting train servers cmd = "rcssservermj" for i in range(n_servers): + port = first_server_p + i + mport = first_monitor_p + i + + server_cmd = f"{cmd} --aport {port} --mport {mport} --no-render --no-realtime" + self.rcss_processes.append( - subprocess.Popen((f"{cmd} --aport {first_server_p+i} --mport {first_monitor_p+i}").split(), - stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, start_new_session=True) + subprocess.Popen( + server_cmd.split(), + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True + ) ) def check_running_servers(self, psutil, first_server_p, first_monitor_p, n_servers): ''' Check if any server is running on chosen ports ''' found = False p_list = [p for p in psutil.process_iter() if p.cmdline() and "rcssservermj" in " ".join(p.cmdline())] - range1 = (first_server_p, first_server_p + n_servers) - range2 = (first_monitor_p,first_monitor_p + n_servers) + range1 = (first_server_p, first_server_p + n_servers) + range2 = (first_monitor_p, first_monitor_p + n_servers) bad_processes = [] - for p in p_list: + for p in p_list: # currently ignoring remaining default port when only one of the ports is specified (uncommon scenario) ports = [int(arg) for arg in p.cmdline()[1:] if arg.isdigit()] if len(ports) == 0: - ports = [60000,60100] # default server ports (changing this is unlikely) + ports = [60000, 60100] # default server ports (changing this is unlikely) conflicts = [str(port) for port in ports if ( - (range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]) )] + (range1[0] <= port < range1[1]) or (range2[0] <= port < range2[1]))] - if len(conflicts)>0: + if len(conflicts) > 0: if not found: print("\nThere are already servers running on the same port(s)!") found = True @@ -55,9 +64,8 @@ class Server(): for p in bad_processes: p.kill() return - def kill(self): for p in self.rcss_processes: p.kill() - print(f"Killed {self.n_servers} rcssservermj processes starting at {self.first_server_p}") \ No newline at end of file + print(f"Killed {self.n_servers} rcssservermj processes starting at {self.first_server_p}") diff --git a/scripts/commons/Train_Base.py b/scripts/commons/Train_Base.py index 8f4203f..617f5b2 100644 --- a/scripts/commons/Train_Base.py +++ b/scripts/commons/Train_Base.py @@ -31,31 +31,29 @@ class Train_Base(): args = script.args self.script = script self.ip = args.i - self.server_p = args.p # (initial) server port - self.monitor_p = args.m + 100 # monitor port when testing - self.monitor_p_1000 = args.m + 1100 # initial monitor port when training + self.server_p = args.p # (initial) server port + self.monitor_p = args.m + 100 # monitor port when testing + self.monitor_p_1000 = args.m + 1100 # initial monitor port when training self.robot_type = args.r self.team = args.t self.uniform = args.u self.cf_last_time = 0 self.cf_delay = 0 # self.cf_target_period = World.STEPTIME # target simulation speed while testing (default: real-time) - - @staticmethod def prompt_user_for_model(self): - gyms_logs_path = "./mujococodebase/scripts/gyms/logs/" + gyms_logs_path = "./scripts/gyms/logs/" folders = [f for f in listdir(gyms_logs_path) if isdir(join(gyms_logs_path, f))] - folders.sort(key=lambda f: os.path.getmtime(join(gyms_logs_path, f)), reverse=True) # sort by modification date + folders.sort(key=lambda f: os.path.getmtime(join(gyms_logs_path, f)), reverse=True) # sort by modification date while True: try: - folder_name = UI.print_list(folders,prompt="Choose folder (ctrl+c to return): ")[1] + folder_name = UI.print_list(folders, prompt="Choose folder (ctrl+c to return): ")[1] except KeyboardInterrupt: print() - return None # ctrl+c + return None # ctrl+c folder_dir = os.path.join(gyms_logs_path, folder_name) models = [m[:-4] for m in listdir(folder_dir) if isfile(join(folder_dir, m)) and m.endswith(".zip")] @@ -64,16 +62,17 @@ class Train_Base(): print("The chosen folder does not contain any .zip file!") continue - models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m+".zip")), reverse=True) # sort by modification date - + models.sort(key=lambda m: os.path.getmtime(join(folder_dir, m + ".zip")), + reverse=True) # sort by modification date + try: - model_name = UI.print_list(models,prompt="Choose model (ctrl+c to return): ")[1] + model_name = UI.print_list(models, prompt="Choose model (ctrl+c to return): ")[1] break except KeyboardInterrupt: print() - return {"folder_dir":folder_dir, "folder_name":folder_name, "model_file":os.path.join(folder_dir, model_name+".zip")} - + return {"folder_dir": folder_dir, "folder_name": folder_name, + "model_file": os.path.join(folder_dir, model_name + ".zip")} # def control_fps(self, read_input = False): # ''' Add delay to control simulation speed ''' @@ -87,7 +86,7 @@ class Train_Base(): # if speed == '0': # inp = input("Paused. Set new speed or '' to use previous speed:") # if inp != '': - # speed = inp + # speed = inp # try: # speed = int(speed) @@ -95,7 +94,7 @@ class Train_Base(): # self.cf_target_period = World.STEPTIME * 100 / speed # print(f"Changed simulation speed to {speed}%") # except: - # print("""Train_Base.py: + # print("""Train_Base.py: # Error: To control the simulation speed, enter a non-negative integer. # To disable this control module, use test_model(..., enable_FPS_control=False) in your gyms environment.""") @@ -108,15 +107,15 @@ class Train_Base(): # else: # self.cf_delay = 0 - - def test_model(self, model:BaseAlgorithm, env, log_path:str=None, model_path:str=None, max_episodes=0, enable_FPS_control=True, verbose=1): + def test_model(self, model: BaseAlgorithm, env, log_path: str = None, model_path: str = None, max_episodes=0, + enable_FPS_control=True, verbose=1): ''' Test model and log results Parameters ---------- model : BaseAlgorithm - Trained model + Trained model env : Env Gym-like environment log_path : str @@ -147,12 +146,12 @@ class Train_Base(): break else: log_path += "/test.csv" - + with open(log_path, 'w') as f: f.write("reward,ep. length,rew. cumulative avg., ep. len. cumulative avg.\n") print("Train statistics are saved to:", log_path) - if enable_FPS_control: # control simulation speed (using non blocking user input) + if enable_FPS_control: # control simulation speed (using non blocking user input) print("\nThe simulation speed can be changed by sending a non-negative integer\n" "(e.g. '50' sets speed to 50%, '0' pauses the simulation, '' sets speed to MAX)\n") @@ -172,8 +171,8 @@ class Train_Base(): ep_reward += reward ep_length += 1 - if enable_FPS_control: # control simulation speed (using non blocking user input) - self.control_fps(select.select([sys.stdin], [], [], 0)[0]) + if enable_FPS_control: # control simulation speed (using non blocking user input) + self.control_fps(select.select([sys.stdin], [], [], 0)[0]) if done: obs, _ = env.reset() @@ -182,25 +181,28 @@ class Train_Base(): reward_max = max(ep_reward, reward_max) reward_min = min(ep_reward, reward_min) ep_no += 1 - avg_ep_lengths = ep_lengths_sum/ep_no - avg_rewards = rewards_sum/ep_no + avg_ep_lengths = ep_lengths_sum / ep_no + avg_rewards = rewards_sum / ep_no if verbose > 0: - print( f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n", - end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})", flush=True) - + print( + f"\rEpisode: {ep_no:<3} Ep.Length: {ep_length:<4.0f} Reward: {ep_reward:<6.2f} \n", + end=f"--AVERAGE-- Ep.Length: {avg_ep_lengths:<4.0f} Reward: {avg_rewards:<6.2f} (Min: {reward_min:<6.2f} Max: {reward_max:<6.2f})", + flush=True) + if log_path is not None: with open(log_path, 'a') as f: writer = csv.writer(f) writer.writerow([ep_reward, ep_length, avg_rewards, avg_ep_lengths]) - + if ep_no == max_episodes: return ep_reward = 0 ep_length = 0 - def learn_model(self, model:BaseAlgorithm, total_steps:int, path:str, eval_env=None, eval_freq=None, eval_eps=5, save_freq=None, backup_env_file=None, export_name=None): + def learn_model(self, model: BaseAlgorithm, total_steps: int, path: str, eval_env=None, eval_freq=None, eval_eps=5, + save_freq=None, backup_env_file=None, export_name=None): ''' Learn Model for a specific number of time steps @@ -251,7 +253,7 @@ class Train_Base(): # If path already exists, add suffix to avoid overwriting if os.path.isdir(path): for i in count(): - p = path.rstrip("/")+f'_{i:03}/' + p = path.rstrip("/") + f'_{i:03}/' if not os.path.isdir(p): path = p break @@ -265,22 +267,28 @@ class Train_Base(): evaluate = bool(eval_env is not None and eval_freq is not None) # Create evaluation callback - eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq, log_path=path, - best_model_save_path=path, deterministic=True, render=False) + eval_callback = None if not evaluate else EvalCallback(eval_env, n_eval_episodes=eval_eps, eval_freq=eval_freq, + log_path=path, + best_model_save_path=path, deterministic=True, + render=False) # Create custom callback to display evaluations - custom_callback = None if not evaluate else Cyclic_Callback(eval_freq, lambda:self.display_evaluations(path,True)) + custom_callback = None if not evaluate else Cyclic_Callback(eval_freq, + lambda: self.display_evaluations(path, True)) # Create checkpoint callback - checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path, name_prefix="model", verbose=1) + checkpoint_callback = None if save_freq is None else CheckpointCallback(save_freq=save_freq, save_path=path, + name_prefix="model", verbose=1) # Create custom callback to export checkpoint models - export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path, export_name) + export_callback = None if save_freq is None or export_name is None else Export_Callback(save_freq, path, + export_name) - callbacks = CallbackList([c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None]) + callbacks = CallbackList( + [c for c in [eval_callback, custom_callback, checkpoint_callback, export_callback] if c is not None]) - model.learn( total_timesteps=total_steps, callback=callbacks ) - model.save( os.path.join(path, "last_model") ) + model.learn(total_timesteps=total_steps, callback=callbacks) + model.save(os.path.join(path, "last_model")) # Display evaluations if they exist if evaluate: @@ -288,18 +296,18 @@ class Train_Base(): # Display timestamps + Model path end_date = datetime.now().strftime('%d/%m/%Y %H:%M:%S') - duration = timedelta(seconds=int(time.time()-start)) + duration = timedelta(seconds=int(time.time() - start)) print(f"Train start: {start_date}") print(f"Train end: {end_date}") print(f"Train duration: {duration}") print(f"Model path: {path}") - + # Append timestamps to backup environment file if backup_env_file is not None: with open(backup_file, 'a') as f: f.write(f"\n# Train start: {start_date}\n") - f.write( f"# Train end: {end_date}\n") - f.write( f"# Train duration: {duration}") + f.write(f"# Train end: {end_date}\n") + f.write(f"# Train duration: {duration}") return path @@ -318,50 +326,57 @@ class Train_Base(): with np.load(eval_npz) as data: time_steps = data["timesteps"] - results_raw = np.mean(data["results"],axis=1) - ep_lengths_raw = np.mean(data["ep_lengths"],axis=1) + results_raw = np.mean(data["results"], axis=1) + ep_lengths_raw = np.mean(data["ep_lengths"], axis=1) sample_no = len(results_raw) - xvals = np.linspace(0, sample_no-1, 80) - results = np.interp(xvals, range(sample_no), results_raw) + xvals = np.linspace(0, sample_no - 1, 80) + results = np.interp(xvals, range(sample_no), results_raw) ep_lengths = np.interp(xvals, range(sample_no), ep_lengths_raw) - results_limits = np.min(results), np.max(results) + results_limits = np.min(results), np.max(results) ep_lengths_limits = np.min(ep_lengths), np.max(ep_lengths) - results_discrete = np.digitize(results, np.linspace(results_limits[0]-1e-5, results_limits[1]+1e-5, console_height+1))-1 - ep_lengths_discrete = np.digitize(ep_lengths, np.linspace(0, ep_lengths_limits[1]+1e-5, console_height+1))-1 + results_discrete = np.digitize(results, np.linspace(results_limits[0] - 1e-5, results_limits[1] + 1e-5, + console_height + 1)) - 1 + ep_lengths_discrete = np.digitize(ep_lengths, + np.linspace(0, ep_lengths_limits[1] + 1e-5, console_height + 1)) - 1 matrix = np.zeros((console_height, console_width, 2), int) - matrix[results_discrete[0] ][0][0] = 1 # draw 1st column - matrix[ep_lengths_discrete[0]][0][1] = 1 # draw 1st column + matrix[results_discrete[0]][0][0] = 1 # draw 1st column + matrix[ep_lengths_discrete[0]][0][1] = 1 # draw 1st column rng = [[results_discrete[0], results_discrete[0]], [ep_lengths_discrete[0], ep_lengths_discrete[0]]] # Create continuous line for both plots for k in range(2): - for i in range(1,console_width): + for i in range(1, console_width): x = [results_discrete, ep_lengths_discrete][k][i] if x > rng[k][1]: - rng[k] = [rng[k][1]+1, x] + rng[k] = [rng[k][1] + 1, x] elif x < rng[k][0]: - rng[k] = [x, rng[k][0]-1] + rng[k] = [x, rng[k][0] - 1] else: - rng[k] = [x,x] - for j in range(rng[k][0],rng[k][1]+1): + rng[k] = [x, x] + for j in range(rng[k][0], rng[k][1] + 1): matrix[j][i][k] = 1 - print(f'{"-"*console_width}') + print(f'{"-" * console_width}') for l in reversed(range(console_height)): for c in range(console_width): - if np.all(matrix[l][c] == 0): print(end=" ") - elif np.all(matrix[l][c] == 1): print(end=symb_xo) - elif matrix[l][c][0] == 1: print(end=symb_x) - else: print(end=symb_o) + if np.all(matrix[l][c] == 0): + print(end=" ") + elif np.all(matrix[l][c] == 1): + print(end=symb_xo) + elif matrix[l][c][0] == 1: + print(end=symb_x) + else: + print(end=symb_o) print() - print(f'{"-"*console_width}') + print(f'{"-" * console_width}') print(f"({symb_x})-reward min:{results_limits[0]:11.2f} max:{results_limits[1]:11.2f}") - print(f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1]/1000:15.0f}k steps") - print(f'{"-"*console_width}') + print( + f"({symb_o})-ep. length min:{ep_lengths_limits[0]:11.0f} max:{ep_lengths_limits[1]:11.0f} {time_steps[-1] / 1000:15.0f}k steps") + print(f'{"-" * console_width}') # save CSV if save_csv: @@ -370,8 +385,7 @@ class Train_Base(): writer = csv.writer(f) if sample_no == 1: writer.writerow(["time_steps", "reward ep.", "length"]) - writer.writerow([time_steps[-1],results_raw[-1],ep_lengths_raw[-1]]) - + writer.writerow([time_steps[-1], results_raw[-1], ep_lengths_raw[-1]]) # def generate_slot_behavior(self, path, slots, auto_head:bool, XML_name): # ''' @@ -394,7 +408,7 @@ class Train_Base(): # xml_pretty = minidom.parseString(xml_rough).toprettyxml(indent=" ") # with open(file, "w") as x: # x.write(xml_pretty) - + # print(file, "was created!") # @staticmethod @@ -406,7 +420,7 @@ class Train_Base(): # ---------- # initial_value : float # Initial learning rate - + # Returns # ------- # schedule : Callable[[float], float] @@ -420,7 +434,7 @@ class Train_Base(): # ---------- # progress_remaining : float # Progress will decrease from 1 (beginning) to 0 - + # Returns # ------- # learning_rate : float @@ -452,28 +466,28 @@ class Train_Base(): if not os.path.isfile(f): output_file = f break - - model = PPO.load(input_file) - weights = model.policy.state_dict() # dictionary containing network layers - w = lambda name : weights[name].detach().cpu().numpy() # extract weights from policy + model = PPO.load(input_file) + weights = model.policy.state_dict() # dictionary containing network layers + + w = lambda name: weights[name].detach().cpu().numpy() # extract weights from policy var_list = [] - for i in count(0,2): # add hidden layers (step=2 because that's how SB3 works) + for i in count(0, 2): # add hidden layers (step=2 because that's how SB3 works) if f"mlp_extractor.policy_net.{i}.bias" not in weights: break - var_list.append([w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"]) + var_list.append( + [w(f"mlp_extractor.policy_net.{i}.bias"), w(f"mlp_extractor.policy_net.{i}.weight"), "tanh"]) - var_list.append( [w("action_net.bias"), w("action_net.weight"), "none"] ) # add final layer - - with open(output_file,"wb") as f: - pickle.dump(var_list, f, protocol=4) # protocol 4 is backward compatible with Python 3.4 + var_list.append([w("action_net.bias"), w("action_net.weight"), "none"]) # add final layer + + with open(output_file, "wb") as f: + pickle.dump(var_list, f, protocol=4) # protocol 4 is backward compatible with Python 3.4 - def print_list(data, numbering=True, prompt=None, divider=" | ", alignment="<", min_per_col=6): ''' Print list - prints list, using as many columns as possible - + Parameters ---------- data : `list` @@ -488,7 +502,7 @@ class Train_Base(): f-string style alignment ( '<', '>', '^' ) min_per_col : int avoid splitting columns with fewer items - + Returns ------- item : `int`, item @@ -496,65 +510,67 @@ class Train_Base(): or `None` (if `numbering` or `prompt` are `None`) ''' - + WIDTH = shutil.get_terminal_size()[0] - data_size = len(data) + data_size = len(data) items = [] items_len = [] - #--------------------------------------------- Add numbers, margins and divider + # --------------------------------------------- Add numbers, margins and divider for i in range(data_size): number = f"{i}-" if numbering else "" - items.append( f"{divider}{number}{data[i]}" ) - items_len.append( len(items[-1]) ) + items.append(f"{divider}{number}{data[i]}") + items_len.append(len(items[-1])) - max_cols = np.clip((WIDTH+len(divider)) // min(items_len),1,math.ceil(data_size/max(min_per_col,1))) # width + len(divider) because it is not needed in last col + max_cols = np.clip((WIDTH + len(divider)) // min(items_len), 1, math.ceil( + data_size / max(min_per_col, 1))) # width + len(divider) because it is not needed in last col - #--------------------------------------------- Check maximum number of columns, considering content width (min:1) - for i in range(max_cols,0,-1): + # --------------------------------------------- Check maximum number of columns, considering content width (min:1) + for i in range(max_cols, 0, -1): cols_width = [] cols_items = [] table_width = 0 - a,b = divmod(data_size,i) + a, b = divmod(data_size, i) for col in range(i): - start = a*col + min(b,col) - end = start+a+(1 if col row else divider # print divider when there are no items + content = cols_items[col][row] if len( + cols_items[col]) > row else divider # print divider when there are no items if col == 0: l = len(divider) - print(end=f"{content[l:]:{alignment}{cols_width[col]-l}}") # remove divider from 1st col + print(end=f"{content[l:]:{alignment}{cols_width[col] - l}}") # remove divider from 1st col else: - print(end=f"{content :{alignment}{cols_width[col] }}") - print() - print("="*table_width) + print(end=f"{content :{alignment}{cols_width[col]}}") + print() + print("=" * table_width) - #--------------------------------------------- Prompt + # --------------------------------------------- Prompt if prompt is None: return None if numbering is None: return None else: - idx = UI.read_int( prompt, 0, data_size ) + idx = UI.read_int(prompt, 0, data_size) return idx, data[idx] - class Cyclic_Callback(BaseCallback): ''' Stable baselines custom callback ''' + def __init__(self, freq, function): super(Cyclic_Callback, self).__init__(1) self.freq = freq @@ -563,10 +579,12 @@ class Cyclic_Callback(BaseCallback): def _on_step(self) -> bool: if self.n_calls % self.freq == 0: self.function() - return True # If the callback returns False, training is aborted early + return True # If the callback returns False, training is aborted early + class Export_Callback(BaseCallback): ''' Stable baselines custom callback ''' + def __init__(self, freq, load_path, export_name): super(Export_Callback, self).__init__(1) self.freq = freq @@ -577,8 +595,7 @@ class Export_Callback(BaseCallback): if self.n_calls % self.freq == 0: path = os.path.join(self.load_path, f"model_{self.num_timesteps}_steps.zip") Train_Base.export_model(path, f"./scripts/gyms/export/{self.export_name}") - return True # If the callback returns False, training is aborted early - + return True # If the callback returns False, training is aborted early + - \ No newline at end of file diff --git a/scripts/gyms/Walk.py b/scripts/gyms/Walk.py index 4447dde..dd2f42a 100644 --- a/scripts/gyms/Walk.py +++ b/scripts/gyms/Walk.py @@ -1,11 +1,11 @@ import os import numpy as np import math -from time import sleep +import time +from time import sleep from random import random from random import uniform - from stable_baselines3 import PPO from stable_baselines3.common.vec_env import SubprocVecEnv @@ -28,25 +28,24 @@ Learn how to run forward using step primitive - class Train: implements algorithms to train a new model or test an existing model ''' + class WalkEnv(gym.Env): def __init__(self, ip, server_p) -> None: - - # Args: Server IP, Agent Port, Monitor Port, Uniform No., Robot Type, Team Name, Enable Log, Enable Draw self.Player = player = Base_Agent( - team_name="Gym", - number=1, - host=ip, - port=server_p + team_name="Gym", + number=1, + host=ip, + port=server_p ) - self.robot_type = self.Player.robot - self.step_counter = 0 # to limit episode size + self.robot_type = self.Player.robot + self.step_counter = 0 # to limit episode size self.force_play_on = True - self.target_position = np.array([0.0, 0.0]) # target position in the x-y plane - self.initial_position = np.array([0.0, 0.0]) # initial position in the x-y plane - self.target_direction = 0.0 # target direction in the x-y plane (relative to the robot's orientation) + self.target_position = np.array([0.0, 0.0]) # target position in the x-y plane + self.initial_position = np.array([0.0, 0.0]) # initial position in the x-y plane + self.target_direction = 0.0 # target direction in the x-y plane (relative to the robot's orientation) self.isfallen = False self.waypoint_index = 0 self.route_completed = False @@ -55,16 +54,26 @@ class WalkEnv(gym.Env): self.auto_calibrate_train_sim_flip = True self.nominal_calibrated_once = False self.flip_calibrated_once = False - + self._target_hz = 0.0 + self._target_dt = 0.0 + self._last_sync_time = None + target_hz_env = 24 + if target_hz_env: + try: + self._target_hz = float(target_hz_env) + except ValueError: + self._target_hz = 0.0 + if self._target_hz > 0.0: + self._target_dt = 1.0 / self._target_hz # State space # 原始观测大小: 78 obs_size = 78 self.obs = np.zeros(obs_size, np.float32) self.observation_space = spaces.Box( - low=-10.0, - high=10.0, - shape=(obs_size,), + low=-10.0, + high=10.0, + shape=(obs_size,), dtype=np.float32 ) @@ -73,11 +82,10 @@ class WalkEnv(gym.Env): self.action_space = spaces.Box( low=-1.0, high=1.0, - shape=(action_dim,), + shape=(action_dim,), dtype=np.float32 ) - # 中立姿态 self.joint_nominal_position = np.array( [ @@ -110,22 +118,22 @@ class WalkEnv(gym.Env): self.train_sim_flip = np.array( [ - 1.0, # 0: Head_yaw (he1) + 1.0, # 0: Head_yaw (he1) -1.0, # 1: Head_pitch (he2) - 1.0, # 2: Left_Shoulder_Pitch (lae1) + 1.0, # 2: Left_Shoulder_Pitch (lae1) -1.0, # 3: Left_Shoulder_Roll (lae2) - 1.0, # 4: Left_Elbow_Pitch (lae3) - 1.0, # 5: Left_Elbow_Yaw (lae4) + 1.0, # 4: Left_Elbow_Pitch (lae3) + 1.0, # 5: Left_Elbow_Yaw (lae4) -1.0, # 6: Right_Shoulder_Pitch (rae1) - 1.0, # 7: Right_Shoulder_Roll (rae2) - 1.0, # 8: Right_Elbow_Pitch (rae3) - 1.0, # 9: Right_Elbow_Yaw (rae4) - 1.0, # 10: Waist (te1) - 1.0, # 11: Left_Hip_Pitch (lle1) + 1.0, # 7: Right_Shoulder_Roll (rae2) + 1.0, # 8: Right_Elbow_Pitch (rae3) + 1.0, # 9: Right_Elbow_Yaw (rae4) + 1.0, # 10: Waist (te1) + 1.0, # 11: Left_Hip_Pitch (lle1) -1.0, # 12: Left_Hip_Roll (lle2) -1.0, # 13: Left_Hip_Yaw (lle3) - 1.0, # 14: Left_Knee_Pitch (lle4) - 1.0, # 15: Left_Ankle_Pitch (lle5) + 1.0, # 14: Left_Knee_Pitch (lle4) + 1.0, # 15: Left_Ankle_Pitch (lle5) -1.0, # 16: Left_Ankle_Roll (lle6) -1.0, # 17: Right_Hip_Pitch (rle1) -1.0, # 18: Right_Hip_Roll (rle2) @@ -141,7 +149,7 @@ class WalkEnv(gym.Env): self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_pos = np.array([0.0, 0.0]) # Track previous position self.Player.server.connect() - sleep(2.0) # Longer wait for connection to establish completely + # sleep(2.0) # Longer wait for connection to establish completely self.Player.server.send_immediate( f"(init {self.Player.robot.name} {self.Player.world.team_name} {self.Player.world.number})" ) @@ -160,7 +168,7 @@ class WalkEnv(gym.Env): changed = [] for idx, (reference_value, observed_value) in enumerate( - zip(self.reference_joint_nominal_position, neutral_joint_positions) + zip(self.reference_joint_nominal_position, neutral_joint_positions) ): if idx >= 10: continue @@ -187,38 +195,37 @@ class WalkEnv(gym.Env): height = float(self.Player.world.global_position[2]) reliable = ( - leg_norm > 0.8 - and leg_max > 0.35 - and 0.12 < height < 0.8 + leg_norm > 0.8 + and leg_max > 0.35 + and 0.12 < height < 0.8 ) return reliable, leg_norm, leg_max, height - def observe(self, init=False): """获取当前观测值""" robot = self.Player.robot world = self.Player.world - + # Safety check: ensure data is available - + # 计算目标速度 raw_target = self.target_position - world.global_position[:2] velocity = MathOps.rotate_2d_vec( - raw_target, - -robot.global_orientation_euler[2], + raw_target, + -robot.global_orientation_euler[2], is_rad=False ) - + # 计算相对方向 rel_orientation = MathOps.vector_angle(velocity) * 0.3 rel_orientation = np.clip(rel_orientation, -0.25, 0.25) - + velocity = np.concatenate([velocity, np.array([rel_orientation])]) velocity[0] = np.clip(velocity[0], -0.5, 0.5) velocity[1] = np.clip(velocity[1], -0.25, 0.25) - + # 关节状态 radian_joint_positions = np.deg2rad( [robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS] @@ -226,21 +233,20 @@ class WalkEnv(gym.Env): radian_joint_speeds = np.deg2rad( [robot.motor_speeds[motor] for motor in robot.ROBOT_MOTORS] ) - + qpos_qvel_previous_action = np.concatenate([ (radian_joint_positions * self.train_sim_flip - self.joint_nominal_position) / 4.6, radian_joint_speeds / 110.0 * self.train_sim_flip, self.previous_action / 10.0, ]) - + # 角速度 ang_vel = np.clip(np.deg2rad(robot.gyroscope) / 50.0, -1.0, 1.0) - + # 投影的重力方向 orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv() projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0])) - - + # 组合观测 observation = np.concatenate([ qpos_qvel_previous_action, @@ -249,8 +255,6 @@ class WalkEnv(gym.Env): projected_gravity, ]) - - observation = np.clip(observation, -10.0, 10.0) return observation.astype(np.float32) @@ -260,6 +264,17 @@ class WalkEnv(gym.Env): self.Player.world.update() self.Player.robot.commit_motor_targets_pd() self.Player.server.send() + if self._target_dt > 0.0: + now = time.time() + if self._last_sync_time is None: + self._last_sync_time = now + return + elapsed = now - self._last_sync_time + remaining = self._target_dt - elapsed + if remaining > 0.0: + time.sleep(remaining) + now = time.time() + self._last_sync_time = now def debug_joint_status(self): robot = self.Player.robot @@ -295,12 +310,11 @@ class WalkEnv(gym.Env): if seed is not None: np.random.seed(seed) - length1 = np.random.uniform(10, 20) # randomize target distance - length2 = np.random.uniform(10, 20) # randomize target distance - length3 = np.random.uniform(10, 20) # randomize target distance - angle2 = np.random.uniform(-30, 30) # randomize initial orientation - angle3 = np.random.uniform(-30, 30) # randomize target direction - + length1 = np.random.uniform(10, 20) # randomize target distance + length2 = np.random.uniform(10, 20) # randomize target distance + length3 = np.random.uniform(10, 20) # randomize target distance + angle2 = np.random.uniform(-30, 30) # randomize initial orientation + angle3 = np.random.uniform(-30, 30) # randomize target direction self.step_counter = 0 self.waypoint_index = 0 @@ -322,12 +336,12 @@ class WalkEnv(gym.Env): # 执行 Neutral 技能直到完成,给机器人足够时间在 beam 位置稳定站立 finished_count = 0 - for _ in range(20): + for _ in range(10): finished = self.Player.skills_manager.execute("Neutral") self.sync() if finished: finished_count += 1 - if finished_count >= 2: # 假设需要连续2次完成才算成功 + if finished_count >= 3: # 假设需要连续3次完成才算成功 break # neutral_joint_positions = np.deg2rad( @@ -356,17 +370,15 @@ class WalkEnv(gym.Env): # reset_action_noise = np.random.uniform(-0.015, 0.015, size=(len(self.Player.robot.ROBOT_MOTORS),)) # self.target_joint_positions = (self.joint_nominal_position + reset_action_noise) * self.train_sim_flip - # for idx, target in enumerate(self.target_joint_positions): # r.set_motor_target_position( # r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6 # ) - # memory variables self.initial_position = np.array(self.Player.world.global_position[:2]) self.previous_pos = self.initial_position.copy() # Critical: set to actual position - self.act = np.zeros(self.no_of_actions ,np.float32) + self.act = np.zeros(self.no_of_actions, np.float32) point1 = self.initial_position + np.array([length1, 0]) point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False) point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False) @@ -423,57 +435,51 @@ class WalkEnv(gym.Env): tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0 return ( - progress_reward - + speed_reward - + direction_reward - + alive_bonus - + height_reward - + motionless_penalty - + waypoint_bonus - + action_penalty - + tilt_penalty + progress_reward + + speed_reward + + direction_reward + + alive_bonus + + height_reward + + motionless_penalty + + waypoint_bonus + + action_penalty + + tilt_penalty ) def step(self, action): - - r = self.Player.robot + r = self.Player.robot self.previous_action = action self.target_joint_positions = ( - self.joint_nominal_position - + self.scaling_factor * action + self.joint_nominal_position + + self.scaling_factor * action ) self.target_joint_positions *= self.train_sim_flip - for idx, target in enumerate(self.target_joint_positions): r.set_motor_target_position( - r.ROBOT_MOTORS[idx], target*180/math.pi, kp=25, kd=0.6 + r.ROBOT_MOTORS[idx], target * 180 / math.pi, kp=25, kd=0.6 ) - - - - self.sync() # run simulation step + self.sync() # run simulation step self.step_counter += 1 # if self.step_counter % self.debug_every_n_steps == 0: # self.debug_joint_status() - + current_pos = np.array(self.Player.world.global_position[:2], dtype=np.float32) - + # Compute reward based on movement from previous step reward = self.compute_reward(self.previous_pos, current_pos, action) - - + # Update previous position self.previous_pos = current_pos.copy() - + # Fall detection and penalty is_fallen = self.Player.world.global_position[2] < 0.3 - + # terminal state: the robot is falling or timeout terminated = is_fallen or self.step_counter > 800 or self.route_completed truncated = False @@ -481,64 +487,65 @@ class WalkEnv(gym.Env): return self.observe(), reward, terminated, truncated, {} - - - class Train(Train_Base): def __init__(self, script) -> None: super().__init__(script) - def train(self, args): - #--------------------------------------- Learning parameters + # --------------------------------------- Learning parameters n_envs = 8 # Reduced from 8 to decrease CPU/network pressure during init + if n_envs < 1: + raise ValueError("GYM_CPU_N_ENVS must be >= 1") n_steps_per_env = 512 # RolloutBuffer is of size (n_steps_per_env * n_envs) - minibatch_size = 64 # should be a factor of (n_steps_per_env * n_envs) + minibatch_size = 128 # should be a factor of (n_steps_per_env * n_envs) total_steps = 30000000 learning_rate = 3e-4 folder_name = f'Walk_R{self.robot_type}' - model_path = f'./scripts/gyms/logs/{folder_name}/' + model_path = f'./scripts/gyms/logs/{folder_name}/' print(f"Model path: {model_path}") print(f"Using {n_envs} parallel environments") - #--------------------------------------- Run algorithm + # --------------------------------------- Run algorithm def init_env(i_env): def thunk(): - return WalkEnv( self.ip , self.server_p + i_env) + return WalkEnv(self.ip, self.server_p + i_env) + return thunk - servers = Train_Server( self.server_p, self.monitor_p_1000, n_envs+1 ) #include 1 extra server for testing - + server_log_dir = os.path.join(model_path, "server_logs") + os.makedirs(server_log_dir, exist_ok=True) + servers = Train_Server(self.server_p, self.monitor_p_1000, n_envs + 1) # include 1 extra server for testing + # Wait for servers to start - print(f"Starting {n_envs+1} rcssservermj servers...") + print(f"Starting {n_envs + 1} rcssservermj servers...") print("Servers started, creating environments...") - env = SubprocVecEnv( [init_env(i) for i in range(n_envs)] ) - eval_env = SubprocVecEnv( [init_env(n_envs)] ) - + env = SubprocVecEnv([init_env(i) for i in range(n_envs)]) + eval_env = SubprocVecEnv([init_env(n_envs)]) try: # Custom policy network architecture policy_kwargs = dict( net_arch=dict( pi=[256, 256, 128], # Policy network: 3 layers - vf=[256, 256, 128] # Value network: 3 layers + vf=[256, 256, 128] # Value network: 3 layers ), activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU, ) - - if "model_file" in args: # retrain - model = PPO.load( args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env, batch_size=minibatch_size, learning_rate=learning_rate ) - else: # train new model - model = PPO( - "MlpPolicy", - env=env, - verbose=1, - n_steps=n_steps_per_env, - batch_size=minibatch_size, - learning_rate=learning_rate, + + if "model_file" in args: # retrain + model = PPO.load(args["model_file"], env=env, device="cpu", n_envs=n_envs, n_steps=n_steps_per_env, + batch_size=minibatch_size, learning_rate=learning_rate) + else: # train new model + model = PPO( + "MlpPolicy", + env=env, + verbose=1, + n_steps=n_steps_per_env, + batch_size=minibatch_size, + learning_rate=learning_rate, device="cpu", policy_kwargs=policy_kwargs, ent_coef=0.01, # Entropy coefficient for exploration @@ -547,28 +554,32 @@ class Train(Train_Base): gamma=0.99 # Discount factor ) - model_path = self.learn_model( model, total_steps, model_path, eval_env=eval_env, eval_freq=n_steps_per_env*20, save_freq=n_steps_per_env*20, backup_env_file=__file__ ) + model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env, + eval_freq=n_steps_per_env * 10, save_freq=n_steps_per_env * 10, + backup_env_file=__file__) except KeyboardInterrupt: - sleep(1) # wait for child processes + sleep(1) # wait for child processes print("\nctrl+c pressed, aborting...\n") servers.kill() return - + env.close() eval_env.close() servers.kill() - def test(self, args): # Uses different server and monitor ports - server = Train_Server( self.server_p-1, self.monitor_p, 1 ) - env = WalkEnv( self.ip, self.server_p-1 ) - model = PPO.load( args["model_file"], env=env ) + server_log_dir = os.path.join(args["folder_dir"], "server_logs") + os.makedirs(server_log_dir, exist_ok=True) + server = Train_Server(self.server_p - 1, self.monitor_p, 1, log_dir=server_log_dir) + env = WalkEnv(self.ip, self.server_p - 1) + model = PPO.load(args["model_file"], env=env) try: - self.export_model( args["model_file"], args["model_file"]+".pkl", False ) # Export to pkl to create custom behavior - self.test_model( model, env, log_path=args["folder_dir"], model_path=args["folder_dir"] ) + self.export_model(args["model_file"], args["model_file"] + ".pkl", + False) # Export to pkl to create custom behavior + self.test_model(model, env, log_path=args["folder_dir"], model_path=args["folder_dir"]) except KeyboardInterrupt: print() @@ -578,18 +589,18 @@ class Train(Train_Base): if __name__ == "__main__": from types import SimpleNamespace - + # 创建默认参数 script_args = SimpleNamespace( args=SimpleNamespace( i='127.0.0.1', # Server IP - p=3100, # Server port - m=3200, # Monitor port - r=0, # Robot type - t='Gym', # Team name - u=1 # Uniform number + p=3100, # Server port + m=3200, # Monitor port + r=0, # Robot type + t='Gym', # Team name + u=1 # Uniform number ) ) - + trainer = Train(script_args) - trainer.train({}) \ No newline at end of file + trainer.train({"model_file": "scripts/gyms/logs/Walk_R0_000/model_245760_steps.zip"})