stand_stable_final_version

2026-03-24 06:40:15 -04:00
parent a52cdff013
commit baaa0aa6ed
4 changed files with 57 additions and 21 deletions
--- a/command.md
+++ b/command.md
@@ -11,4 +11,4 @@ retrain（继续训练）
 GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh
 retrain+改训练超参
-GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh
+GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_CLIP_RANGE=0.13 GYM_CPU_TRAIN_BATCH_SIZE=256 YM_CPU_TRAIN_GAMMA=0.95 GYM_CPU_TRAIN_ENT_COEF=0.05 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh
--- a/scripts/gyms/Walk.py
+++ b/scripts/gyms/Walk.py
@@ -5,6 +5,7 @@ import time
 from time import sleep
 from random import random
 from random import uniform
 from itertools import count
 from stable_baselines3 import PPO
 from stable_baselines3.common.monitor import Monitor
@@ -148,11 +149,14 @@ class WalkEnv(gym.Env):
        self.scaling_factor = 0.3
        # self.scaling_factor = 1
        # Encourage a minimum lateral stance so the policy avoids feet overlap.
        self.min_stance_rad = 0.10
        # Small reset perturbations for robustness training.
        self.enable_reset_perturb = True
        self.reset_beam_yaw_range_deg = 180  # randomize target direction fully to encourage learning a real walk instead of a fixed gait
-        self.reset_joint_noise_rad = 0.015
+        self.reset_joint_noise_rad = 0.025
-        self.reset_perturb_steps = 3
+        self.reset_perturb_steps = 4
        self.reset_recover_steps = 8
        self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
@@ -392,14 +396,15 @@ class WalkEnv(gym.Env):
    def compute_reward(self, previous_pos, current_pos, action):
        height = float(self.Player.world.global_position[2])
        robot = self.Player.robot
-        orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
+        orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv()
        projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
        tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
-        ang_vel = np.deg2rad(self.Player.robot.gyroscope)
+        ang_vel = np.deg2rad(robot.gyroscope)
        ang_vel_mag = float(np.linalg.norm(ang_vel))
-        is_fallen = height < 0.3
+        is_fallen = height < 0.55
        if is_fallen:
            # remain = max(0, 800 - self.step_counter)
            # return -8.0 - 0.01 * remain
@@ -429,6 +434,23 @@ class WalkEnv(gym.Env):
        posture_penalty = -0.3 * (tilt_mag)
        ang_vel_penalty = -0.02 * ang_vel_mag
        # Use simulator joint readings in training frame to shape lateral stance.
        joint_pos = np.deg2rad(
            [robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS]
        ) * self.train_sim_flip
        left_hip_roll = float(joint_pos[12])
        right_hip_roll = float(joint_pos[18])
        left_ankle_roll = float(joint_pos[16])
        right_ankle_roll = float(joint_pos[22])
        hip_spread = left_hip_roll - right_hip_roll
        ankle_spread = left_ankle_roll - right_ankle_roll
        stance_metric = 0.6 * abs(hip_spread) + 0.4 * abs(ankle_spread)
        # Penalize narrow stance (feet too close) and scissoring (cross-leg pattern).
        stance_collapse_penalty = -4.0 * max(0.0, self.min_stance_rad - stance_metric)
        cross_leg_penalty = -1.2 * max(0.0, -(hip_spread * ankle_spread))
        target_height = self.initial_height
        height_error =  height - target_height
        height_penalty = -0.5 * abs(height_error)  # 惩罚高度偏离，系数可调
@@ -464,10 +486,12 @@ class WalkEnv(gym.Env):
                 posture_penalty
                 + ang_vel_penalty
                 + height_penalty
                 + stance_collapse_penalty
                 + cross_leg_penalty
                #  + exploration_bonus
                # + height_down_penalty
                 )
-        if time.time() - self.start_time >= 1200:
+        if time.time() - self.start_time >= 600:
            self.start_time = time.time()
            print(
                #   f"progress_reward:{progress_reward:.4f}",
@@ -476,6 +500,8 @@ class WalkEnv(gym.Env):
                  f"height_penalty:{height_penalty:.4f}",
                  f"smoothness_penalty:{smoothness_penalty:.4f},",
                  f"posture_penalty:{posture_penalty:.4f}",
                  f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
                  f"cross_leg_penalty:{cross_leg_penalty:.4f}",
                #   f"ang_vel_penalty:{ang_vel_penalty:.4f}",
                #   f"height_down_penalty:{height_down_penalty:.4f}",
                #   f"exploration_bonus:{exploration_bonus:.4f}"
@@ -519,7 +545,7 @@ class WalkEnv(gym.Env):
        self.last_action_for_reward = action.copy()
        # Fall detection and penalty
-        is_fallen = self.Player.world.global_position[2] < 0.3
+        is_fallen = self.Player.world.global_position[2] < 0.55
        # terminal state: the robot is falling or timeout
        terminated = is_fallen or self.step_counter > 800 or self.route_completed
@@ -601,13 +627,13 @@ class Train(Train_Base):
                    clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")),  # PPO clipping parameter
                    gae_lambda=0.95,  # GAE lambda
                    gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")), # Discount factor
-                    target_kl=0.03,
+                    # target_kl=0.03,
                    n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")),
-                    # tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
+                    tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
                )
            model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
-                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20,
+                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
                                          backup_env_file=__file__)
        except KeyboardInterrupt:
            sleep(1)  # wait for child processes
--- a/scripts/gyms/logs/stand_stable_final.zip
+++ b/scripts/gyms/logs/stand_stable_final.zip
--- a/train.sh
+++ b/train.sh
@@ -12,15 +12,16 @@ set -euo pipefail
 # CPU 核数基准（默认 20）：
 # 例如你的机器按 20 核预算来算，可保持默认。
 CORES="${CORES:-20}"
-# CPU 占用百分比（默认 95）：
+# CPU 占用百分比（默认 100）：
 # 最终会与 CORES 相乘得到 CPUQuota。
-# 例：CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900%（约 19 核等效）
+# 例：CORES=20, UTIL_PERCENT=100 -> CPUQuota=2000%（约 20 核等效）
-UTIL_PERCENT="${UTIL_PERCENT:-95}"
+UTIL_PERCENT="${UTIL_PERCENT:-100}"
 CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
-# 内存上限（默认 28G）：
+# 内存上限（默认关闭）：
-# 可改成 16G、24G 等，避免训练把系统内存吃满。
+# 设为具体值（如 24G/28G）可限制训练最多占用内存；
-MEMORY_MAX="${MEMORY_MAX:-28G}"
+# 设为 0/none/off/infinity 表示不设置 cgroup 内存上限。
 MEMORY_MAX="${MEMORY_MAX:-0}"
 # ------------------------------
 # 训练运行参数（由 scripts/gyms/Walk.py 读取）
@@ -28,8 +29,8 @@ MEMORY_MAX="${MEMORY_MAX:-28G}"
 # 运行模式：train 或 test
 GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
-# 并行环境数量：越大通常吞吐越高，但也更容易触发服务器连接不稳定。
+# 并行环境数量：越大通常吞吐越高，但也更容易触发 OOM 或连接不稳定。
-# 建议从 8~12 起步，稳定后再升到 16/20。
+# 默认使用更稳妥的 12，确认稳定后再升到 16/20。
 GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"
 # 服务器预热时间（秒）：
 # 在批量拉起 rcssserver 后等待一段时间，再创建 SubprocVecEnv，
@@ -98,14 +99,23 @@ if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
 	echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
 fi
 SYSTEMD_PROPS=("-p" "CPUQuota=${CPU_QUOTA}")
 case "${MEMORY_MAX,,}" in
 	0|none|off|infinity)
 		echo "MemoryMax is disabled for this run (no cgroup memory cap)."
 		;;
 	*)
 		SYSTEMD_PROPS+=("-p" "MemoryMax=${MEMORY_MAX}")
 		;;
 esac
 # 使用 systemd-run --user --scope 启动“受限资源”的训练进程：
 # - CPUQuota: 总 CPU 配额
 # - MemoryMax: 最大内存
 # - env ...  : 显式传递训练参数到 Python 进程
 # - python -m scripts.gyms.Walk: 以模块方式启动训练入口
 systemd-run --user --scope \
-	-p CPUQuota="${CPU_QUOTA}" \
+	"${SYSTEMD_PROPS[@]}" \
 	-p MemoryMax="${MEMORY_MAX}" \
 	env \
 	GYM_CPU_MODE="${GYM_CPU_MODE}" \
 	GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \