stand_stable_final_version

This commit is contained in:
xxh
2026-03-24 06:40:15 -04:00
parent a52cdff013
commit baaa0aa6ed
4 changed files with 57 additions and 21 deletions

View File

@@ -11,4 +11,4 @@ retrain继续训练
GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_005/best_model.zip bash train.sh
retrain+改训练超参 retrain+改训练超参
GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_BATCH_SIZE=256 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh GYM_CPU_MODE=train GYM_CPU_TRAIN_MODEL=scripts/gyms/logs/Walk_R0_004/best_model.zip GYM_CPU_TRAIN_LR=2e-4 GYM_CPU_TRAIN_CLIP_RANGE=0.13 GYM_CPU_TRAIN_BATCH_SIZE=256 YM_CPU_TRAIN_GAMMA=0.95 GYM_CPU_TRAIN_ENT_COEF=0.05 GYM_CPU_TRAIN_EPOCHS=8 bash train.sh

View File

@@ -5,6 +5,7 @@ import time
from time import sleep from time import sleep
from random import random from random import random
from random import uniform from random import uniform
from itertools import count
from stable_baselines3 import PPO from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
@@ -148,11 +149,14 @@ class WalkEnv(gym.Env):
self.scaling_factor = 0.3 self.scaling_factor = 0.3
# self.scaling_factor = 1 # self.scaling_factor = 1
# Encourage a minimum lateral stance so the policy avoids feet overlap.
self.min_stance_rad = 0.10
# Small reset perturbations for robustness training. # Small reset perturbations for robustness training.
self.enable_reset_perturb = True self.enable_reset_perturb = True
self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait
self.reset_joint_noise_rad = 0.015 self.reset_joint_noise_rad = 0.025
self.reset_perturb_steps = 3 self.reset_perturb_steps = 4
self.reset_recover_steps = 8 self.reset_recover_steps = 8
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
@@ -392,14 +396,15 @@ class WalkEnv(gym.Env):
def compute_reward(self, previous_pos, current_pos, action): def compute_reward(self, previous_pos, current_pos, action):
height = float(self.Player.world.global_position[2]) height = float(self.Player.world.global_position[2])
robot = self.Player.robot
orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv() orientation_quat_inv = R.from_quat(robot._global_cheat_orientation).inv()
projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0])) projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
tilt_mag = float(np.linalg.norm(projected_gravity[:2])) tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
ang_vel = np.deg2rad(self.Player.robot.gyroscope) ang_vel = np.deg2rad(robot.gyroscope)
ang_vel_mag = float(np.linalg.norm(ang_vel)) ang_vel_mag = float(np.linalg.norm(ang_vel))
is_fallen = height < 0.3 is_fallen = height < 0.55
if is_fallen: if is_fallen:
# remain = max(0, 800 - self.step_counter) # remain = max(0, 800 - self.step_counter)
# return -8.0 - 0.01 * remain # return -8.0 - 0.01 * remain
@@ -429,6 +434,23 @@ class WalkEnv(gym.Env):
posture_penalty = -0.3 * (tilt_mag) posture_penalty = -0.3 * (tilt_mag)
ang_vel_penalty = -0.02 * ang_vel_mag ang_vel_penalty = -0.02 * ang_vel_mag
# Use simulator joint readings in training frame to shape lateral stance.
joint_pos = np.deg2rad(
[robot.motor_positions[motor] for motor in robot.ROBOT_MOTORS]
) * self.train_sim_flip
left_hip_roll = float(joint_pos[12])
right_hip_roll = float(joint_pos[18])
left_ankle_roll = float(joint_pos[16])
right_ankle_roll = float(joint_pos[22])
hip_spread = left_hip_roll - right_hip_roll
ankle_spread = left_ankle_roll - right_ankle_roll
stance_metric = 0.6 * abs(hip_spread) + 0.4 * abs(ankle_spread)
# Penalize narrow stance (feet too close) and scissoring (cross-leg pattern).
stance_collapse_penalty = -4.0 * max(0.0, self.min_stance_rad - stance_metric)
cross_leg_penalty = -1.2 * max(0.0, -(hip_spread * ankle_spread))
target_height = self.initial_height target_height = self.initial_height
height_error = height - target_height height_error = height - target_height
height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调 height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调
@@ -464,10 +486,12 @@ class WalkEnv(gym.Env):
posture_penalty posture_penalty
+ ang_vel_penalty + ang_vel_penalty
+ height_penalty + height_penalty
+ stance_collapse_penalty
+ cross_leg_penalty
# + exploration_bonus # + exploration_bonus
# + height_down_penalty # + height_down_penalty
) )
if time.time() - self.start_time >= 1200: if time.time() - self.start_time >= 600:
self.start_time = time.time() self.start_time = time.time()
print( print(
# f"progress_reward:{progress_reward:.4f}", # f"progress_reward:{progress_reward:.4f}",
@@ -476,6 +500,8 @@ class WalkEnv(gym.Env):
f"height_penalty:{height_penalty:.4f}", f"height_penalty:{height_penalty:.4f}",
f"smoothness_penalty:{smoothness_penalty:.4f},", f"smoothness_penalty:{smoothness_penalty:.4f},",
f"posture_penalty:{posture_penalty:.4f}", f"posture_penalty:{posture_penalty:.4f}",
f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
f"cross_leg_penalty:{cross_leg_penalty:.4f}",
# f"ang_vel_penalty:{ang_vel_penalty:.4f}", # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
# f"height_down_penalty:{height_down_penalty:.4f}", # f"height_down_penalty:{height_down_penalty:.4f}",
# f"exploration_bonus:{exploration_bonus:.4f}" # f"exploration_bonus:{exploration_bonus:.4f}"
@@ -519,7 +545,7 @@ class WalkEnv(gym.Env):
self.last_action_for_reward = action.copy() self.last_action_for_reward = action.copy()
# Fall detection and penalty # Fall detection and penalty
is_fallen = self.Player.world.global_position[2] < 0.3 is_fallen = self.Player.world.global_position[2] < 0.55
# terminal state: the robot is falling or timeout # terminal state: the robot is falling or timeout
terminated = is_fallen or self.step_counter > 800 or self.route_completed terminated = is_fallen or self.step_counter > 800 or self.route_completed
@@ -601,13 +627,13 @@ class Train(Train_Base):
clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")), # PPO clipping parameter clip_range=float(os.environ.get("GYM_CPU_TRAIN_CLIP_RANGE", "0.2")), # PPO clipping parameter
gae_lambda=0.95, # GAE lambda gae_lambda=0.95, # GAE lambda
gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")), # Discount factor gamma=float(os.environ.get("GYM_CPU_TRAIN_GAMMA", "0.95")), # Discount factor
target_kl=0.03, # target_kl=0.03,
n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")), n_epochs=int(os.environ.get("GYM_CPU_TRAIN_EPOCHS", "5")),
# tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/" tensorboard_log=f"./scripts/gyms/logs/{folder_name}/tensorboard/"
) )
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env, model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
backup_env_file=__file__) backup_env_file=__file__)
except KeyboardInterrupt: except KeyboardInterrupt:
sleep(1) # wait for child processes sleep(1) # wait for child processes

Binary file not shown.

View File

@@ -12,15 +12,16 @@ set -euo pipefail
# CPU 核数基准(默认 20 # CPU 核数基准(默认 20
# 例如你的机器按 20 核预算来算,可保持默认。 # 例如你的机器按 20 核预算来算,可保持默认。
CORES="${CORES:-20}" CORES="${CORES:-20}"
# CPU 占用百分比(默认 95 # CPU 占用百分比(默认 100
# 最终会与 CORES 相乘得到 CPUQuota。 # 最终会与 CORES 相乘得到 CPUQuota。
# 例CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900%(约 19 核等效) # 例CORES=20, UTIL_PERCENT=100 -> CPUQuota=2000%(约 20 核等效)
UTIL_PERCENT="${UTIL_PERCENT:-95}" UTIL_PERCENT="${UTIL_PERCENT:-100}"
CPU_QUOTA="$((CORES * UTIL_PERCENT))%" CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
# 内存上限(默认 28G # 内存上限(默认关闭
# 可改成 16G、24G 等,避免训练把系统内存吃满。 # 设为具体值(如 24G/28G可限制训练最多占用内存
MEMORY_MAX="${MEMORY_MAX:-28G}" # 设为 0/none/off/infinity 表示不设置 cgroup 内存上限。
MEMORY_MAX="${MEMORY_MAX:-0}"
# ------------------------------ # ------------------------------
# 训练运行参数(由 scripts/gyms/Walk.py 读取) # 训练运行参数(由 scripts/gyms/Walk.py 读取)
@@ -28,8 +29,8 @@ MEMORY_MAX="${MEMORY_MAX:-28G}"
# 运行模式train 或 test # 运行模式train 或 test
GYM_CPU_MODE="${GYM_CPU_MODE:-train}" GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
# 并行环境数量:越大通常吞吐越高,但也更容易触发服务器连接不稳定。 # 并行环境数量:越大通常吞吐越高,但也更容易触发 OOM 或连接不稳定。
# 建议从 8~12 起步,稳定后再升到 16/20。 # 默认使用更稳妥的 12确认稳定后再升到 16/20。
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}" GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"
# 服务器预热时间(秒): # 服务器预热时间(秒):
# 在批量拉起 rcssserver 后等待一段时间,再创建 SubprocVecEnv # 在批量拉起 rcssserver 后等待一段时间,再创建 SubprocVecEnv
@@ -98,14 +99,23 @@ if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
echo "Detected conda env: ${CONDA_DEFAULT_ENV}" echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
fi fi
SYSTEMD_PROPS=("-p" "CPUQuota=${CPU_QUOTA}")
case "${MEMORY_MAX,,}" in
0|none|off|infinity)
echo "MemoryMax is disabled for this run (no cgroup memory cap)."
;;
*)
SYSTEMD_PROPS+=("-p" "MemoryMax=${MEMORY_MAX}")
;;
esac
# 使用 systemd-run --user --scope 启动“受限资源”的训练进程: # 使用 systemd-run --user --scope 启动“受限资源”的训练进程:
# - CPUQuota: 总 CPU 配额 # - CPUQuota: 总 CPU 配额
# - MemoryMax: 最大内存 # - MemoryMax: 最大内存
# - env ... : 显式传递训练参数到 Python 进程 # - env ... : 显式传递训练参数到 Python 进程
# - python -m scripts.gyms.Walk: 以模块方式启动训练入口 # - python -m scripts.gyms.Walk: 以模块方式启动训练入口
systemd-run --user --scope \ systemd-run --user --scope \
-p CPUQuota="${CPU_QUOTA}" \ "${SYSTEMD_PROPS[@]}" \
-p MemoryMax="${MEMORY_MAX}" \
env \ env \
GYM_CPU_MODE="${GYM_CPU_MODE}" \ GYM_CPU_MODE="${GYM_CPU_MODE}" \
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \ GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \