#!/usr/bin/env bash
set -euo pipefail

# ------------------------------
# Resource limits (cgroup v2 + systemd-run)
# ------------------------------
# Notes:
# 1) This script places the training process in a transient systemd scope and
#    applies a CPU and a memory ceiling to it.
# 2) Only this run is limited; no system configuration is changed permanently.
# 3) Every variable below can be overridden from the environment, for example
#    by prefixing the command line with VAR=value.

#######################################
# Check that a value is a positive decimal integer (no sign, no leading zero).
# Used before feeding environment-supplied text into $(( )), because Bash
# evaluates the contents of a variable as an arithmetic expression there.
# Arguments: $1 - variable name (used in the message only)
#            $2 - value to check
# Outputs:   a diagnostic on stderr when the value is rejected
# Returns:   0 if the value is acceptable, 1 otherwise
#######################################
require_positive_int() {
  local name=$1
  local value=$2
  if [[ ! "${value}" =~ ^[1-9][0-9]*$ ]]; then
    printf '%s must be a positive integer, got: %s\n' "${name}" "${value}" >&2
    return 1
  fi
}

# CPU core budget (default 20).
# Keep the default if the machine is budgeted as 20 cores.
CORES="${CORES:-20}"
require_positive_int CORES "${CORES}" || exit 1

# CPU utilisation percentage (default 95).
# Multiplied by CORES to obtain the CPUQuota value.
# Example: CORES=20, UTIL_PERCENT=95 -> CPUQuota=1900% (about 19 cores).
UTIL_PERCENT="${UTIL_PERCENT:-95}"
require_positive_int UTIL_PERCENT "${UTIL_PERCENT}" || exit 1
CPU_QUOTA="$((CORES * UTIL_PERCENT))%"

# Memory ceiling (default 28G).
# Lower it (16G, 24G, ...) to keep training from exhausting system memory.
MEMORY_MAX="${MEMORY_MAX:-28G}"

# ------------------------------
# Runtime parameters for the workload
# (handed to the scripts.gyms.Walk module through the environment)
# ------------------------------
# Run mode: train or test.
GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
case "${GYM_CPU_MODE}" in
  train | test) ;;
  *)
    # Non-fatal on purpose: the value is still forwarded unchanged, this only
    # makes a probable typo visible before the run starts.
    printf 'Warning: GYM_CPU_MODE=%s is not one of the documented modes (train, test).\n' \
      "${GYM_CPU_MODE}" >&2
    ;;
esac

# Number of parallel environments. Author's note: larger values usually raise
# throughput but make server connections more likely to become unstable;
# start around 8-12 and move up to 16/20 once runs are stable.
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-20}"

# Server warm-up time in seconds. Author's note: waiting after the rcssserver
# instances are launched and before SubprocVecEnv is created lowers the chance
# of ConnectionReset/EOFError.
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC:-10}"

# Training-only parameters.
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV:-256}"
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE:-512}"
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR:-1e-4}"
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF:-0.03}"
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE:-0.13}"
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA:-0.95}"
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS:-5}"
# Empty by default; forwarded as an empty string when not set.
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL:-}"

# Test-only parameters. Both paths are relative, so they resolve against the
# directory the script is started from.
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL:-scripts/gyms/logs/Walk_R0_004/best_model.zip}"
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER:-scripts/gyms/logs/Walk_R0_004/}"
# Testing runs in real time with rendering by default (both flags are 0).
# Set a flag to 1 to turn the corresponding capability off.
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER:-0}"
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME:-0}"

# Python interpreter selection order:
# 1) PYTHON_BIN, when the caller sets it explicitly
# 2) the active conda environment (CONDA_PREFIX/bin/python)
# 3) the default mujoco environment path below
# 4) whatever python / python3 is found on PATH
#
# The default path is machine-specific; it can be replaced from the
# environment like the other settings in this script.
DEFAULT_PYTHON="${DEFAULT_PYTHON:-/home/solren/Downloads/Anaconda/envs/mujoco/bin/python}"
# Left empty when no conda environment is active, so the candidate never
# degrades to the unrelated path /bin/python.
CONDA_PYTHON="${CONDA_PREFIX:+${CONDA_PREFIX}/bin/python}"

# Safety guard: refuse to run as root (for example through sudo).
# Author's rationale: under sudo the conda environment and the user session
# environment can diverge, which leads to a lost python path and to the
# systemd --user session not being reachable.
if ((EUID == 0)); then
  # Diagnostics belong on stderr so they are not mistaken for normal output.
  printf '%s\n' \
    "Do not run this script with sudo; run as your normal user in conda env 'mujoco'." >&2
  exit 1
fi

# Resolve the Python executable that will be launched (PYTHON_EXEC).
# Candidates are tried in this order: PYTHON_BIN, the active conda
# environment, DEFAULT_PYTHON, then python / python3 from PATH.
if [[ -n "${PYTHON_BIN:-}" ]]; then
  # An explicit choice is never silently replaced by a fallback; reject it
  # here instead of letting the launch fail later with a less obvious error.
  if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
    printf 'PYTHON_BIN=%s is neither an executable file nor a command on PATH.\n' \
      "${PYTHON_BIN}" >&2
    exit 1
  fi
  PYTHON_EXEC="${PYTHON_BIN}"
elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PYTHON}" ]]; then
  PYTHON_EXEC="${CONDA_PYTHON}"
elif [[ -x "${DEFAULT_PYTHON}" ]]; then
  PYTHON_EXEC="${DEFAULT_PYTHON}"
elif command -v python >/dev/null 2>&1; then
  PYTHON_EXEC="$(command -v python)"
elif command -v python3 >/dev/null 2>&1; then
  PYTHON_EXEC="$(command -v python3)"
else
  printf '%s\n' \
    "No Python executable found. Set PYTHON_BIN=/abs/path/to/python and retry." >&2
  exit 1
fi

# Absolute path of the directory containing this script.
# CDPATH is cleared for the cd call because, with CDPATH set, cd may print the
# directory it selected, and that text would end up inside SCRIPT_DIR.
# NOTE(review): nothing in the visible script reads SCRIPT_DIR; the launch
# relies on the caller's working directory instead - confirm whether a cd to
# the project root was intended.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"

# Print the effective configuration to make runs easier to debug and repeat.
printf 'Starting training with limits: CPU=%s, Memory=%s\n' \
  "${CPU_QUOTA}" "${MEMORY_MAX}"
printf 'Mode: %s\n' "${GYM_CPU_MODE}"
printf 'Runtime knobs: GYM_CPU_N_ENVS=%s, GYM_CPU_SERVER_WARMUP_SEC=%s\n' \
  "${GYM_CPU_N_ENVS}" "${GYM_CPU_SERVER_WARMUP_SEC}"
printf 'Using Python: %s\n' "${PYTHON_EXEC}"
if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
  printf 'Detected conda env: %s\n' "${CONDA_DEFAULT_ENV}"
fi

# Start the workload under resource limits with systemd-run --user --scope:
#   - CPUQuota:  total CPU allowance for the scope
#   - MemoryMax: memory ceiling for the scope
#   - env ...:   passes the run parameters explicitly to the Python process
#   - python -m scripts.gyms.Walk: starts the entry point as a module, so the
#     package must be importable from the current working directory
if ! command -v systemd-run >/dev/null 2>&1; then
  printf '%s\n' \
    "systemd-run not found; it is required to apply the CPU/memory limits." >&2
  # Same status the shell itself reports for a missing command.
  exit 127
fi

# Properties applied to the transient scope.
scope_properties=(
  -p "CPUQuota=${CPU_QUOTA}"
  -p "MemoryMax=${MEMORY_MAX}"
)

# NAME=value pairs given to env(1) for the child process.
child_env=(
  "GYM_CPU_MODE=${GYM_CPU_MODE}"
  "GYM_CPU_N_ENVS=${GYM_CPU_N_ENVS}"
  "GYM_CPU_SERVER_WARMUP_SEC=${GYM_CPU_SERVER_WARMUP_SEC}"
  "GYM_CPU_TRAIN_STEPS_PER_ENV=${GYM_CPU_TRAIN_STEPS_PER_ENV}"
  "GYM_CPU_TRAIN_BATCH_SIZE=${GYM_CPU_TRAIN_BATCH_SIZE}"
  "GYM_CPU_TRAIN_LR=${GYM_CPU_TRAIN_LR}"
  "GYM_CPU_TRAIN_ENT_COEF=${GYM_CPU_TRAIN_ENT_COEF}"
  "GYM_CPU_TRAIN_CLIP_RANGE=${GYM_CPU_TRAIN_CLIP_RANGE}"
  "GYM_CPU_TRAIN_GAMMA=${GYM_CPU_TRAIN_GAMMA}"
  "GYM_CPU_TRAIN_EPOCHS=${GYM_CPU_TRAIN_EPOCHS}"
  "GYM_CPU_TRAIN_MODEL=${GYM_CPU_TRAIN_MODEL}"
  "GYM_CPU_TEST_MODEL=${GYM_CPU_TEST_MODEL}"
  "GYM_CPU_TEST_FOLDER=${GYM_CPU_TEST_FOLDER}"
  "GYM_CPU_TEST_NO_RENDER=${GYM_CPU_TEST_NO_RENDER}"
  "GYM_CPU_TEST_NO_REALTIME=${GYM_CPU_TEST_NO_REALTIME}"
)

# Last command of the script: its exit status becomes the script's status.
systemd-run --user --scope "${scope_properties[@]}" \
  env "${child_env[@]}" \
  "${PYTHON_EXEC}" -m scripts.gyms.Walk