diff --git a/command.md b/command.md
index 14ba73a..4097301 100644
--- a/command.md
+++ b/command.md
@@ -11,4 +11,4 @@
 retrain (resume training)
 GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh
 retrain + modified training hyperparameters
-GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh
\ No newline at end of file
+GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_CLIP_RANGE=0.13 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_GAMMA=0.95 GYM_CPU_TRAIN_ENT_COEF=0.05 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh
\ No newline at end of file
diff --git a/scripts/gyms/Walk.py b/scripts/gyms/Walk.py
index 8a2cfeb..4983f04 100755
--- a/scripts/gyms/Walk.py
+++ b/scripts/gyms/Walk.py
@@ -5,6 +5,7 @@ import time
 from time import sleep
 from random import random
 from random import uniform
+from itertools import count
 
 from stable_baselines3 import PPO
 from stable_baselines3.common.monitor import Monitor
@@ -148,11 +149,14 @@ class WalkEnv(gym.Env):
         self.scaling_factor = 0.3
         # self.scaling_factor = 1
 
+        # Encourage a minimum lateral stance so the policy avoids feet overlap.
+        self.min_stance_rad = 0.10
+
         # Small reset perturbations for robustness training.
         self.enable_reset_perturb = True
         self.reset_beam_yaw_range_deg = 180  # randomize target direction fully to encourage learning a real walk instead of a fixed gait
-        self.reset_joint_noise_rad = 0.015
-        self.reset_perturb_steps = 3
+        self.reset_joint_noise_rad = 0.025
+        self.reset_perturb_steps = 4
         self.reset_recover_steps = 8
 
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
@@ -392,14 +396,15 @@
     def compute_reward(self, previous_pos, current_pos, action):
         height = float(self.Player.world.global_position[2])
+        robot = self.Player.robot
 
-        orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
+        orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv()
         projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
         tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
 
-        ang_vel = np.deg2rad(self.Player.robot.gyroscope)
+        ang_vel = np.deg2rad(robot.gyroscope)
         ang_vel_mag = float(np.linalg.norm(ang_vel))
 
-        is_fallen = height < 0.3
+        is_fallen = height < 0.55
         if is_fallen:
             # remain = max(0, 800 - self.step_counter)
             # return -8.0 - 0.01 * remain
@@ -429,6 +434,25 @@
         posture_penalty = -0.3 * (tilt_mag)
         ang_vel_penalty = -0.02 * ang_vel_mag
 
+        # Use simulator joint readings in training frame to shape lateral stance.
+        joint_pos = np.deg2rad(
+            [robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS]
+        ) * self.train_sim_flip
+        left_hip_roll = float(joint_pos[12])
+        right_hip_roll = float(joint_pos[18])
+        left_ankle_roll = float(joint_pos[16])
+        right_ankle_roll = float(joint_pos[22])
+
+        hip_spread = left_hip_roll - right_hip_roll
+        ankle_spread = left_ankle_roll - right_ankle_roll
+        stance_metric = 0.6 * abs(hip_spread) + 0.4 * abs(ankle_spread)
+
+        # Penalize narrow stance (feet too close) and scissoring (cross-leg pattern).
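+        # The collapse term is a linear hinge, active only while stance_metric < min_stance_rad;
+        # the cross term is nonzero only when hip_spread and ankle_spread have opposite signs.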
+        stance_collapse_penalty = -4.0 * max(0.0, self.min_stance_rad - stance_metric)
+        cross_leg_penalty = -1.2 * max(0.0, -(hip_spread * ankle_spread))
+
         target_height = self.initial_height
         height_error = height - target_height
         height_penalty = -0.5 * abs(height_error)  # penalize deviation from target height; coefficient is tunable
@@ -464,10 +488,12 @@
             posture_penalty
             + ang_vel_penalty
             + height_penalty
+            + stance_collapse_penalty
+            + cross_leg_penalty
             # + exploration_bonus
             # + height_down_penalty
         )
-        if time.time() - self.start_time >= 1200:
+        if time.time() - self.start_time >= 600:
             self.start_time = time.time()
             print(
                 # f"progress_reward:{progress_reward:.4f}",
@@ -476,6 +502,8 @@
                 f"height_penalty:{height_penalty:.4f}",
                 f"smoothness_penalty:{smoothness_penalty:.4f}",
                 f"posture_penalty:{posture_penalty:.4f}",
+                f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
+                f"cross_leg_penalty:{cross_leg_penalty:.4f}",
                 # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
                 # f"height_down_penalty:{height_down_penalty:.4f}",
                 # f"exploration_bonus:{exploration_bonus:.4f}"
@@ -519,7 +547,7 @@
         self.last_action_for_reward = action.copy()
 
         # Fall detection and penalty
-        is_fallen = self.Player.world.global_position[2] < 0.3
+        is_fallen = self.Player.world.global_position[2] < 0.55
 
         # terminal state: the robot is falling or timeout
         terminated = is_fallen or self.step_counter > 800 or self.route_completed
@@ -601,13 +629,13 @@ class Train(Train_Base):
             clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")),  # PPO clipping parameter
             gae_lambda=0.95,  # GAE lambda
             gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")),  # Discount factor
-            target_kl=0.03,
+            # target_kl=0.03,
             n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")),
-            # tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
+            tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
         )
 
         model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
-                                      eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20,
+                                      eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
                                       backup_env_file=__file__)
     except KeyboardInterrupt:
         sleep(1)  # wait for child processes
diff --git a/scripts/gyms/logs/stand_stable_final.zip b/scripts/gyms/logs/stand_stable_final.zip
new file mode 100644
index 0000000..985c7f4
Binary files /dev/null and b/scripts/gyms/logs/stand_stable_final.zip differ
diff --git a/train.sh b/train.sh
index a4b910a..184f9bb 100755
--- a/train.sh
+++ b/train.sh
@@ -12,15 +12,16 @@ set -euo pipefail
 # CPU core budget (default 20):
 # E.g. if your machine is budgeted at 20 cores, keep the default.
 CORES="${CORES:-20}"
-# CPU utilization percentage (default 95):
+# CPU utilization percentage (default 100):
 # Multiplied by CORES to produce the final CPUQuota.
-# E.g. CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900% (roughly 19 cores' worth)
-UTIL_PERCENT="${UTIL_PERCENT:-95}"
+# E.g. CORES=20, UTIL_PERCENT=100 -> CPUQuota=2000% (roughly 20 cores' worth)
+UTIL_PERCENT="${UTIL_PERCENT:-100}"
 CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
-# Memory cap (default 28G):
-# Can be set to 16G, 24G, etc. to keep training from exhausting system memory.
-MEMORY_MAX="${MEMORY_MAX:-28G}"
+# Memory cap (disabled by default):
+# Set a concrete value (e.g. 24G/28G) to cap how much memory training may use;
+# set 0/none/off/infinity to skip the cgroup memory cap entirely.
+MEMORY_MAX="${MEMORY_MAX:-0}"
 
 # ------------------------------
 # Training run parameters (read by scripts/gyms/Walk.py)
 # ------------------------------
@@ -28,8 +29,8 @@
 # Run mode: train or test
 GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
-# Number of parallel envs: more usually means higher throughput, but also a higher risk of unstable server connections.
-# Start from 8~12 and move up to 16/20 once stable.
+# Number of parallel envs: more usually means higher throughput, but also a higher risk of OOM or unstable connections.
+# Consider starting from a safer 12 and raising to 16/20 once confirmed stable (the default here remains 20).
 GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"
 
 # Server warm-up time (seconds):
 # After batch-starting rcssserver, wait a while before creating the SubprocVecEnv,
@@ -98,14 +99,23 @@
 if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
     echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
 fi
+SYSTEMD_PROPS=("-p" "CPUQuota=${CPU_QUOTA}")
+case "${MEMORY_MAX,,}" in
+    0|none|off|infinity)
+        echo "MemoryMax is disabled for this run (no cgroup memory cap)."
+        ;;
+    *)
+        SYSTEMD_PROPS+=("-p" "MemoryMax=${MEMORY_MAX}")
+        ;;
+esac
+
 # Launch the resource-limited training process via systemd-run --user --scope:
 # - CPUQuota: total CPU quota
 # - MemoryMax: memory ceiling
 # - env ...  : pass the training parameters explicitly to the Python process
 # - python -m scripts.gyms.Walk: start the training entry point as a module
 systemd-run --user --scope \
-  -p CPUQuota="${CPU_QUOTA}" \
-  -p MemoryMax="${MEMORY_MAX}" \
+  "${SYSTEMD_PROPS[@]}" \
   env \
     GYM_CPU_MODE="${GYM_CPU_MODE}" \
     GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \
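
A quick sanity check of the new stance-shaping terms, runnable outside the simulator. This is a minimal sketch: stance_penalties is a hypothetical helper that mirrors the coefficients in compute_reward above; it is not part of Walk.py.

MIN_STANCE_RAD = 0.10  # mirrors self.min_stance_rad in the diff

def stance_penalties(left_hip, right_hip, left_ankle, right_ankle):
    """Reproduce the stance-shaping terms from compute_reward (all angles in radians)."""
    hip_spread = left_hip - right_hip
    ankle_spread = left_ankle - right_ankle
    stance_metric = 0.6 * abs(hip_spread) + 0.4 * abs(ankle_spread)
    # Linear hinge: active only while the blended spread is below the minimum stance.
    collapse = -4.0 * max(0.0, MIN_STANCE_RAD - stance_metric)
    # Active only when hip and ankle spreads disagree in sign (scissoring).
    cross = -1.2 * max(0.0, -(hip_spread * ankle_spread))
    return collapse, cross

print(stance_penalties(0.0, 0.0, 0.0, 0.0))      # feet together  -> (-0.4, -0.0)
print(stance_penalties(0.1, -0.1, 0.05, -0.05))  # healthy stance -> (-0.0, -0.0)
print(stance_penalties(0.1, -0.1, -0.05, 0.05))  # scissoring     -> approx (-0.0, -0.024)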
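
command.md above also sets GYM_CPU_TRAIN_LR, GYM_CPU_TRAIN_BATCH_SIZE and GYM_CPU_TRAIN_ENT_COEF; presumably Walk.py reads those the same way the diff shows for GYM_CPU_TRAIN_CLIP_RANGE, GYM_CPU_TRAIN_GAMMA and GYM_CPU_TRAIN_EPOCHS. A sketch of that pattern, with illustrative fallback defaults (the real defaults live in Walk.py and are not visible in this diff):

import os

# Assumed reads, mirroring the os.environ.get pattern visible above; defaults are placeholders.
learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
batch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "64"))
ent_coef = float(os.environ.get("GYM_CPU_TRAIN_ENT_COEF", "0.0"))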