reward modification and add stage reward

reward modification and change the get_up logic
Amend tiny bug
2026-03-23 10:17:31 -04:00 · 2026-03-23 09:06:36 -04:00 · 2026-03-22 21:21:17 -04:00 · 2026-03-22 21:11:46 -04:00 · 2026-03-22 03:05:24 -04:00 · 2026-03-22 02:57:04 -04:00
7 changed files with 489 additions and 0 deletions
--- a/rl_game/get_up/init.py
+++ b/rl_game/get_up/init.py
@@ -0,0 +1,13 @@
 import gymnasium as gym
 # Use the configuration that lives in this package so the registered ID
 # "Isaac-T1-GetUp-v0" is always backed by the get-up task settings.
 from rl_game.get_up.config.t1_env_cfg import T1EnvCfg
 # Register the environment with Gymnasium.
 gym.register(
    id="Isaac-T1-GetUp-v0",
    entry_point="isaaclab.envs:ManagerBasedRLEnv",  # Isaac Lab's manager-based RL environment entry point
    kwargs={
        # Default configuration instance; callers may override it via gym.make(..., cfg=...).
        "cfg": T1EnvCfg(),
    },
 )
--- a/rl_game/get_up/asset/t1/T1_locomotion_base.usd
+++ b/rl_game/get_up/asset/t1/T1_locomotion_base.usd
--- a/rl_game/get_up/asset/t1/T1_locomotion_physics_lab.usd
+++ b/rl_game/get_up/asset/t1/T1_locomotion_physics_lab.usd
--- a/rl_game/get_up/config/ppo_cfg.yaml
+++ b/rl_game/get_up/config/ppo_cfg.yaml
@@ -0,0 +1,60 @@
 # rl_games agent configuration for the T1 get-up task (loaded by train.py).
 params:
  seed: 42
  algo:
    name: a2c_continuous
  model:
    name: continuous_a2c_logstd
  network:
    name: actor_critic
    # Actor and critic share one MLP trunk.
    separate: False
    space:
      continuous:
        mu_activation: None
        sigma_activation: None
        mu_init:
          name: default
        sigma_init:
          name: const_initializer
          val: 0.5
        fixed_sigma: False
    mlp:
      units: [512, 256, 128]
      activation: relu
      d2rl: False
      initializer:
        name: default
  config:
    # NOTE(review): train.py overwrites this name with "T1_GetUp" before training.
    name: T1_Walking
    env_name: rlgym # Isaac Lab wrapper (train.py registers the wrapped env under this name)
    multi_gpu: False
    ppo: True
    mixed_precision: True
    normalize_input: True
    normalize_value: True
    value_bootstrap: True
    num_actors: 8192 # number of robots trained in parallel
    reward_shaper:
      scale_value: 1.0
    normalize_advantage: True
    gamma: 0.98
    tau: 0.95
    learning_rate: 3e-4
    lr_schedule: adaptive
    kl_threshold: 0.015
    score_to_win: 20000
    max_epochs: 500
    save_best_after: 50
    save_frequency: 100
    grad_norm: 1.0
    entropy_coef: 0.005
    truncate_grads: True
    bounds_loss_coef: 0.001
    e_clip: 0.2
    # Rollout batch = num_actors * horizon_length = 8192 * 256 = 2,097,152 samples,
    # i.e. 32 minibatches of 65,536.
    horizon_length: 256
    minibatch_size: 65536
    mini_epochs: 4
    critic_coef: 1
    clip_value: True
--- a/rl_game/get_up/config/t1_env_cfg.py
+++ b/rl_game/get_up/config/t1_env_cfg.py
@@ -0,0 +1,241 @@
 import torch
 import random
 import numpy as np
 import isaaclab.envs.mdp as mdp
 from isaaclab.assets import ArticulationCfg
 from isaaclab.envs import ManagerBasedRLEnvCfg, ManagerBasedRLEnv
 from isaaclab.managers import ObservationGroupCfg as ObsGroup
 from isaaclab.managers import ObservationTermCfg as ObsTerm
 from isaaclab.managers import RewardTermCfg as RewTerm
 from isaaclab.managers import TerminationTermCfg as DoneTerm
 from isaaclab.managers import EventTermCfg as EventTerm
 from isaaclab.envs.mdp import JointPositionActionCfg
 from isaaclab.managers import SceneEntityCfg
 from isaaclab.utils import configclass
 from rl_game.get_up.env.t1_env import T1SceneCfg
 # --- 1. Custom logic: stage-gated (sequentially unlocked) rewards ---
 def sequenced_getup_reward(
        env: ManagerBasedRLEnv,
        crouch_threshold: float = 0.7,
        target_knee: float = 1.5,
        target_hip: float = 1.2
 ) -> torch.Tensor:
    """Stage-gated get-up reward: crouch first, then earn the height reward.

    The reward has two parts:

    * a dense crouch term, ``5.0 * crouch_score``, where ``crouch_score`` is
      ``exp(-(knee_error + hip_error) / 0.6)`` and the errors are the mean
      absolute distances of the knee / hip pitch joints from their targets;
    * a height term, ``20 * max(trunk_z - 0.3, 0)``, that only counts once the
      environment's ``has_crouched`` latch is set, i.e. once ``crouch_score``
      has exceeded ``crouch_threshold`` during the current episode.

    The latch lives in ``env.extras["has_crouched"]`` (one bool per
    environment). It is cleared *after* the reward has been computed for
    environments flagged in ``env.reset_buf``. This relies on the environment
    computing terminations before rewards within a step, so that
    ``reset_buf`` marks episodes that end with this very step. Clearing the
    latch before use would drop the unlocked reward on the final step and let
    a crouched terminal pose re-arm the latch for the next episode.

    Args:
        env: The running environment; must expose ``scene["robot"]``,
            ``extras``, ``reset_buf``, ``num_envs`` and ``device``.
        crouch_threshold: ``crouch_score`` value above which the height
            reward is unlocked for the rest of the episode.
        target_knee: Target knee-pitch joint position used for the crouch score.
        target_hip: Target hip-pitch joint position used for the crouch score.

    Returns:
        One reward value per environment.
    """
    robot = env.scene["robot"]

    # --- 1. Per-environment latch: has this episode reached the crouch target yet? ---
    if "has_crouched" not in env.extras:
        env.extras["has_crouched"] = torch.zeros(env.num_envs, device=env.device, dtype=torch.bool)
    has_crouched = env.extras["has_crouched"]

    # --- 2. Current crouch quality ---
    knee_indices, _ = robot.find_joints(['Left_Knee_Pitch', 'Right_Knee_Pitch'])
    hip_indices, _ = robot.find_joints(['Left_Hip_Pitch', 'Right_Hip_Pitch'])
    joint_pos = robot.data.joint_pos
    knee_error = torch.mean(torch.abs(joint_pos[:, knee_indices] - target_knee), dim=-1)
    hip_error = torch.mean(torch.abs(joint_pos[:, hip_indices] - target_hip), dim=-1)
    # exp(-x) of a non-negative error: 1.0 at the target pose, decaying towards 0.
    crouch_score = torch.exp(-(knee_error + hip_error) / 0.6)

    # --- 3. Unlock the height reward once the threshold has been exceeded ---
    # In-place OR so the tensor stored in env.extras is updated as well.
    has_crouched |= crouch_score > crouch_threshold

    # --- 4. Height reward, only active while the latch is set ---
    pelvis_idx, _ = robot.find_bodies("Trunk")
    curr_pelvis_h = robot.data.body_state_w[:, pelvis_idx[0], 2]
    standing_reward = torch.clamp(curr_pelvis_h - 0.3, min=0.0) * 20.0
    gated_standing_reward = has_crouched.float() * standing_reward

    # Total = dense crouch guidance + height reward that exists only after unlocking.
    reward = 5.0 * crouch_score + gated_standing_reward

    # --- 5. Re-lock environments whose episode ends with this step ---
    # Done last so the final step keeps its reward and the next episode starts locked.
    has_crouched &= ~env.reset_buf.bool()
    return reward
 def is_standing_still(
        env: ManagerBasedRLEnv,
        min_head_height: float,
        min_pelvis_height: float,
        max_angle_error: float,
        standing_time: float,
        velocity_threshold: float = 0.15
 ) -> torch.Tensor:
    """Report which environments have held a stable standing pose long enough.

    An environment counts as stable at the current step when all of the
    following hold:

    * the world-frame z of body ``"H2"`` is above ``min_head_height``;
    * the world-frame z of body ``"Trunk"`` is above ``min_pelvis_height``;
    * the norm of the x/y components of the projected gravity vector is below
      ``max_angle_error``;
    * the norm of the root linear velocity is below ``velocity_threshold``.

    A per-environment timer accumulates ``physics_dt * decimation`` while the
    pose is stable and drops back to zero as soon as it is not.

    Timers are stored in ``env.extras["stable_timers"]``, one tensor per
    distinct parameter set. The function is used with different thresholds by
    more than one term; a single shared timer would be advanced several times
    per step and would be zeroed by the strictest caller.

    Args:
        env: The running environment.
        min_head_height: Minimum head ("H2") height.
        min_pelvis_height: Minimum pelvis ("Trunk") height.
        max_angle_error: Maximum tilt, measured on projected gravity x/y.
        standing_time: Time the pose must be held continuously.
        velocity_threshold: Maximum root linear speed.

    Returns:
        Bool tensor, one entry per environment, true where the accumulated
        stable time exceeds ``standing_time``.
    """
    robot = env.scene["robot"]
    head_idx, _ = robot.find_bodies("H2")
    pelvis_idx, _ = robot.find_bodies("Trunk")
    current_head_h = robot.data.body_state_w[:, head_idx[0], 2]
    current_pelvis_h = robot.data.body_state_w[:, pelvis_idx[0], 2]
    gravity_error = torch.norm(robot.data.projected_gravity_b[:, :2], dim=-1)
    root_vel_norm = torch.norm(robot.data.root_lin_vel_w, dim=-1)
    is_stable_now = (
            (current_head_h > min_head_height) &
            (current_pelvis_h > min_pelvis_height) &
            (gravity_error < max_angle_error) &
            (root_vel_norm < velocity_threshold)
    )

    # One timer per parameter set so independent callers never share state.
    timers = env.extras.setdefault("stable_timers", {})
    timer_key = (min_head_height, min_pelvis_height, max_angle_error, standing_time, velocity_threshold)
    timer = timers.get(timer_key)
    if timer is None:
        timer = torch.zeros(env.num_envs, device=env.device)

    # Duration of one control step.
    dt = env.physics_dt * env.cfg.decimation
    timer = torch.where(is_stable_now, timer + dt, torch.zeros_like(timer))
    timers[timer_key] = timer
    return timer > standing_time
 # --- 2. Configuration classes ---
 # Joint names referenced by the joint position / velocity observation terms,
 # assembled region by region; the resulting order is head, left arm,
 # right arm, waist, hips, knees, ankles.
 T1_JOINT_NAMES = (
    # head
    ['AAHead_yaw', 'Head_pitch']
    # left arm
    + ['Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw']
    # right arm
    + ['Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw']
    # waist
    + ['Waist']
    # hips (pitch, roll, yaw; left before right)
    + ['Left_Hip_Pitch', 'Right_Hip_Pitch', 'Left_Hip_Roll', 'Right_Hip_Roll']
    + ['Left_Hip_Yaw', 'Right_Hip_Yaw']
    # knees
    + ['Left_Knee_Pitch', 'Right_Knee_Pitch']
    # ankles (pitch, roll; left before right)
    + ['Left_Ankle_Pitch', 'Right_Ankle_Pitch', 'Left_Ankle_Roll', 'Right_Ankle_Roll']
 )
# Observation configuration: a single "policy" group fed to the agent.
@configclass
 class T1ObservationCfg:
    @configclass
    class PolicyCfg(ObsGroup):
        # Terms are declared in the order below; with concatenate_terms enabled
        # they are presumably joined into one flat vector in this order
        # (NOTE(review): confirm against the Isaac Lab ObservationGroupCfg docs).
        concatenate_terms = True
        # Root motion and orientation cues.
        base_lin_vel = ObsTerm(func=mdp.base_lin_vel)
        base_ang_vel = ObsTerm(func=mdp.base_ang_vel)
        projected_gravity = ObsTerm(func=mdp.projected_gravity)
        root_pos = ObsTerm(func=mdp.root_pos_w)
        # Joint state restricted to the joints listed in T1_JOINT_NAMES.
        joint_pos = ObsTerm(func=mdp.joint_pos_rel,
                            params={"asset_cfg": SceneEntityCfg("robot", joint_names=T1_JOINT_NAMES)})
        joint_vel = ObsTerm(func=mdp.joint_vel_rel,
                            params={"asset_cfg": SceneEntityCfg("robot", joint_names=T1_JOINT_NAMES)})
        # Previous action, fed back as an observation.
        actions = ObsTerm(func=mdp.last_action)
    # The only observation group defined for this task.
    policy = PolicyCfg()
def reset_root_state_random_pitch_sign(
        env: ManagerBasedRLEnv,
        env_ids: torch.Tensor,
        pose_range: dict,
        velocity_range: dict,
        asset_cfg: SceneEntityCfg = SceneEntityCfg("robot"),
) -> None:
    """Reset root states uniformly, mirroring the pitch range for a random half.

    Each environment in ``env_ids`` independently draws a pitch sign with
    probability 0.5. Environments that draw "positive" are reset with
    ``pose_range`` as given; the others are reset with the pitch interval
    mirrored around zero, i.e. ``(low, high)`` becomes ``(-high, -low)``.
    Drawing the sign here, inside the event, gives every reset of every
    environment its own sign from torch's random generator. A sign chosen in
    the configuration literal would be fixed once for the whole process.

    The actual sampling and writing is delegated to
    ``mdp.reset_root_state_uniform`` for each of the two subsets.

    Args:
        env: The running environment.
        env_ids: Indices of the environments being reset.
        pose_range: Per-axis ``(low, high)`` ranges; ``"pitch"`` is the entry
            that gets mirrored (treated as ``(0.0, 0.0)`` when absent).
        velocity_range: Passed through unchanged.
        asset_cfg: The asset to reset.
    """
    low, high = pose_range.get("pitch", (0.0, 0.0))
    # Keep the mirrored interval ordered as (low, high).
    mirrored_range = {**pose_range, "pitch": (-high, -low)}
    flip = torch.rand(len(env_ids), device=env_ids.device) < 0.5
    for ids, ranges in ((env_ids[~flip], pose_range), (env_ids[flip], mirrored_range)):
        # Skip empty subsets.
        if len(ids) > 0:
            mdp.reset_root_state_uniform(env, ids, ranges, velocity_range, asset_cfg)


@configclass
class T1EventCfg:
    """Reset events: randomise the robot's initial root pose."""

    # On every reset the robot is placed near the env origin at z in [0.35, 0.45]
    # with random roll and yaw, and a pitch of roughly +/-1.4..1.6 rad whose
    # sign is drawn per environment by reset_root_state_random_pitch_sign.
    reset_robot_rotation = EventTerm(
        func=reset_root_state_random_pitch_sign,
        params={
            "asset_cfg": SceneEntityCfg("robot"),
            "pose_range": {
                "roll": (-1.57, 1.57),
                "pitch": (1.4, 1.6),
                "yaw": (-3.14, 3.14),
                "x": (0.0, 0.0),
                "y": (0.0, 0.0),
                "z": (0.35, 0.45),
            },
            "velocity_range": {},
        },
        mode="reset",
    )
# Action configuration: joint-position targets split into three groups.
@configclass
 class T1ActionCfg:
    # Actions are split into groups to prevent twitching. Since no particular
    # motion is enforced, each body part gets a fairly balanced exploration range.
    # Every group uses use_default_offset=True and its own scale on the raw action.
    arm_action = JointPositionActionCfg(
        asset_name="robot",
        joint_names=[
            'Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw',
            'Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw'
        ],
        scale=1.0,  # gives the arms relatively generous freedom to explore
        use_default_offset=True
    )
    # Waist and head joints.
    torso_action = JointPositionActionCfg(
        asset_name="robot",
        joint_names=['Waist', 'AAHead_yaw', 'Head_pitch'],
        scale=0.7,
        use_default_offset=True
    )
    # Hip, knee and ankle joints; the smallest scale of the three groups.
    leg_action = JointPositionActionCfg(
        asset_name="robot",
        joint_names=[
            'Left_Hip_Pitch', 'Right_Hip_Pitch', 'Left_Hip_Roll', 'Right_Hip_Roll',
            'Left_Hip_Yaw', 'Right_Hip_Yaw', 'Left_Knee_Pitch', 'Right_Knee_Pitch',
            'Left_Ankle_Pitch', 'Right_Ankle_Pitch', 'Left_Ankle_Roll', 'Right_Ankle_Roll'
        ],
        scale=0.5,
        use_default_offset=True
    )
# Reward configuration for the get-up task.
@configclass
 class T1GetUpRewardCfg:
    # Core term: sequential stage reward (crouch first, then height).
    sequenced_task = RewTerm(
        func=sequenced_getup_reward,
        weight=10.0,
        params={"crouch_threshold": 0.75}  # crouch score must exceed 0.75 before the height reward unlocks
    )
    # Orientation penalty: even once the height reward is unlocked, a tilted posture is penalised.
    orientation = RewTerm(
        func=mdp.flat_orientation_l2,
        weight=-2.5
    )
    # Suppress twitching by penalising the action rate.
    action_rate = RewTerm(func=mdp.action_rate_l2, weight=-0.08)
    # Final bonus for holding a stable standing pose.
    # NOTE(review): these thresholds differ from the ones used by the
    # standing_success termination (angle 0.25 vs 0.3, time 0.4 vs 0.3,
    # velocity 0.2 vs 0.4) - confirm this is intentional.
    is_success_maintain = RewTerm(
        func=is_standing_still,
        weight=100.0,
        params={
            "min_head_height": 1.08,
            "min_pelvis_height": 0.72,
            "max_angle_error": 0.25,
            "standing_time": 0.4,
            "velocity_threshold": 0.2
        }
    )
@configclass
class T1GetUpTerminationsCfg:
    """Episode termination conditions for the get-up task."""

    # Episode-length limit. time_out=True flags this term as a time-out
    # (truncation) rather than a terminal state, so the training side can
    # tell the two apart, e.g. for value bootstrapping.
    time_out = DoneTerm(func=mdp.time_out, time_out=True)
    # Success: the robot has held a stable standing pose for the required time.
    standing_success = DoneTerm(
        func=is_standing_still,
        params={
            "min_head_height": 1.08,
            "min_pelvis_height": 0.72,
            "max_angle_error": 0.3,
            "standing_time": 0.3,
            "velocity_threshold": 0.4
        }
    )
# Top-level environment configuration tying together scene, MDP terms and timing.
@configclass
 class T1EnvCfg(ManagerBasedRLEnvCfg):
    # 8192 parallel environments, spaced 2.5 apart.
    scene = T1SceneCfg(num_envs=8192, env_spacing=2.5)
    observations = T1ObservationCfg()
    rewards = T1GetUpRewardCfg()
    terminations = T1GetUpTerminationsCfg()
    events = T1EventCfg()
    actions = T1ActionCfg()
    # Episode length in seconds.
    episode_length_s = 10.0
    # is_standing_still derives its per-step time increment as physics_dt * decimation.
    decimation = 4
--- a/rl_game/get_up/env/t1_env.py
+++ b/rl_game/get_up/env/t1_env.py
@@ -0,0 +1,74 @@
 from isaaclab.assets import ArticulationCfg, AssetBaseCfg
 from isaaclab.scene import InteractiveSceneCfg
 from isaaclab.sensors import ContactSensorCfg
 from isaaclab.utils import configclass
 from isaaclab.actuators import ImplicitActuatorCfg
 from isaaclab import sim as sim_utils
 import os
 # Directory one level above the folder that contains this module.
 _DEMO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 # Absolute path of the T1 robot USD asset: <_DEMO_DIR>/asset/t1/T1_locomotion_physics_lab.usd
 T1_USD_PATH = os.path.join(_DEMO_DIR, *("asset", "t1", "T1_locomotion_physics_lab.usd"))
# Scene definition: ground plane, T1 robot articulation, contact sensor and light.
@configclass
 class T1SceneCfg(InteractiveSceneCfg):
    """Final corrected version: resolves the "Unknown asset config type" error."""
    # 1. Ground: the physics material is defined directly inside spawn.
    ground = AssetBaseCfg(
        prim_path="/World/ground",
        spawn=sim_utils.GroundPlaneCfg(
            physics_material=sim_utils.RigidBodyMaterialCfg(
                static_friction=1.0,
                dynamic_friction=1.0,
                restitution=0.3,
                friction_combine_mode="average",
                restitution_combine_mode="average",
            )
        ),
    )
    # 2. Robot: loaded from T1_USD_PATH, one instance per environment.
    robot = ArticulationCfg(
        prim_path="{ENV_REGEX_NS}/Robot",
        spawn=sim_utils.UsdFileCfg(
            usd_path=T1_USD_PATH,
            activate_contact_sensors=True,
            rigid_props=sim_utils.RigidBodyPropertiesCfg(
                disable_gravity=False,
                max_depenetration_velocity=10.0,
            ),
            articulation_props=sim_utils.ArticulationRootPropertiesCfg(
                enabled_self_collisions=True,
                solver_position_iteration_count=8,
                solver_velocity_iteration_count=4,
            ),
        ),
        init_state=ArticulationCfg.InitialStateCfg(
            pos=(0.0, 0.0, 0.4), # drop height
            joint_pos={".*": 0.0},
        ),
        # A single implicit actuator group covering every joint.
        actuators={
            "t1_joints": ImplicitActuatorCfg(
                joint_names_expr=[".*"],
                effort_limit=800.0,  # doubled so the motors have enough torque
                velocity_limit=20.0,
                stiffness=500.0,  # [key] raised from 150 into the 500-800 range
                damping=40.0,  # [key] raised from 5 into the 30-50 range to suppress jitter
            ),
        },
    )
    # Contact sensor matching every prim under the robot.
    contact_sensor = ContactSensorCfg(
        prim_path="{ENV_REGEX_NS}/Robot/.*",
        update_period=0.0,
        history_length=3,
    )
    # 3. Lighting
    light = AssetBaseCfg(
        prim_path="/World/light",
        spawn=sim_utils.DistantLightCfg(color=(0.75, 0.75, 0.75), intensity=3000.0),
    )
 # Reference list of T1 body names (presumably as reported by the articulation; e.g. "Trunk" and "H2" are used in find_bodies lookups): ['Trunk', 'H1', 'H2', 'AL1', 'AL2', 'AL3', 'left_hand_link', 'AR1', 'AR2', 'AR3', 'right_hand_link', 'Waist', 'Hip_Pitch_Left', 'Hip_Roll_Left', 'Hip_Yaw_Left', 'Shank_Left', 'Ankle_Cross_Left', 'left_foot_link', 'Hip_Pitch_Right', 'Hip_Roll_Right', 'Hip_Yaw_Right', 'Shank_Right', 'Ankle_Cross_Right', 'right_foot_link']
--- a/rl_game/get_up/train.py
+++ b/rl_game/get_up/train.py
@@ -0,0 +1,101 @@
 import sys
 import os
 import argparse
 # Add this script's own directory to sys.path so that `config.t1_env_cfg`
 # (imported further down) can be resolved.
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from isaaclab.app import AppLauncher
 # 1. Command-line arguments
 parser = argparse.ArgumentParser(description="Train T1 robot to Get-Up with RL-Games.")
 # NOTE(review): the help text recommends 4096 parallel envs while the default is 8192.
 parser.add_argument("--num_envs", type=int, default=8192, help="起身任务建议并行 4096 即可")
 parser.add_argument("--task", type=str, default="Isaac-T1-GetUp-v0", help="任务 ID")
 parser.add_argument("--seed", type=int, default=42, help="随机种子")
 AppLauncher.add_app_launcher_args(parser)
 args_cli = parser.parse_args()
 # 2. Launch the simulator (must happen before importing other isaaclab modules)
 app_launcher = AppLauncher(args_cli)
 simulation_app = app_launcher.app
 import torch
 import gymnasium as gym
 import yaml
 from isaaclab_rl.rl_games import RlGamesVecEnvWrapper
 from rl_games.torch_runner import Runner
 from rl_games.common import env_configurations, vecenv
 # Environment configuration class, defined in config/t1_env_cfg.py.
 from config.t1_env_cfg import T1EnvCfg
 # 3. Register the environment
 gym.register(
    id="Isaac-T1-GetUp-v0",
    entry_point="isaaclab.envs:ManagerBasedRLEnv",
    kwargs={
        "cfg": T1EnvCfg(),  # default config instance (reset events, rewards, terminations, ...)
    },
 )
 def main():
    """Train the T1 get-up policy with rl_games, resuming from a checkpoint if one exists.

    Steps:

    1. Look for ``logs/T1_GetUp/nn/T1_GetUp.pth`` next to this script; if it
       exists, training resumes from it.
    2. Build the environment from a fresh ``T1EnvCfg`` with ``--num_envs`` and
       ``--seed`` applied, and wrap it for rl_games. The values are written
       into the config because keyword arguments such as ``num_envs`` passed
       to ``gym.make`` do not change the scene size.
    3. Load ``config/ppo_cfg.yaml``, point logging at ``logs/``, and align the
       agent's seed and ``num_actors`` with the environment that was created.
    4. Run training, then close the environment and the simulation app.
    """
    # --- Checkpoint handling ---
    checkpoint_path = os.path.join(os.path.dirname(__file__), "logs/T1_GetUp/nn/T1_GetUp.pth")
    should_retrain = os.path.exists(checkpoint_path)

    # --- Environment ---
    # Apply the CLI values to the config itself so they actually take effect.
    env_cfg = T1EnvCfg()
    env_cfg.scene.num_envs = args_cli.num_envs
    env_cfg.seed = args_cli.seed
    env = gym.make("Isaac-T1-GetUp-v0", cfg=env_cfg)
    # rl_device must be args_cli.device (typically 'cuda:0').
    wrapped_env = RlGamesVecEnvWrapper(
        env,
        rl_device=args_cli.device,
        clip_obs=5.0,
        clip_actions=1.0
    )
    # Hand the already-created environment to rl_games under the name used in ppo_cfg.yaml.
    vecenv.register('as_is', lambda config_name, num_actors, **kwargs: wrapped_env)
    env_configurations.register('rlgym', {
        'vecenv_type': 'as_is',
        'env_creator': lambda **kwargs: wrapped_env
    })

    # --- Agent configuration ---
    config_path = os.path.join(os.path.dirname(__file__), "config", "ppo_cfg.yaml")
    with open(config_path, "r") as f:
        rl_config = yaml.safe_load(f)
    # Log directory and experiment name.
    rl_game_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
    log_dir = os.path.join(rl_game_dir, "logs")
    rl_config['params']['config']['train_dir'] = log_dir
    rl_config['params']['config']['name'] = "T1_GetUp"
    # Keep the agent consistent with the CLI and with the environment that was built.
    rl_config['params']['seed'] = args_cli.seed
    rl_config['params']['config']['num_actors'] = env.unwrapped.num_envs
    # Inject the checkpoint path when resuming.
    if should_retrain:
        print(f"[INFO]: 检测到预训练模型，正在从 {checkpoint_path} 恢复训练...")
        rl_config['params']['config']['load_path'] = checkpoint_path
    else:
        print("[INFO]: 未找到预训练模型，将从零开始训练。")

    # --- Training ---
    runner = Runner()
    runner.load(rl_config)
    runner.run({
        "train": True,
        "play": False,
        # Restore from the checkpoint when one was found.
        "checkpoint": checkpoint_path if should_retrain else None,
        "vec_env": wrapped_env
    })

    # --- Shutdown ---
    wrapped_env.close()
    simulation_app.close()
 if __name__ == "__main__":
    main()
Author	SHA1	Message	Date
ChenXi	f1bd15d434	reward modification and add stage reward	2026-03-23 10:17:31 -04:00
ChenXi	4bc205399c	reward modification and change the get_up logic	2026-03-23 09:06:36 -04:00
ChenXi	af42087bd8	Amend tiny bug	2026-03-22 21:21:17 -04:00
ChenXi	7f7ec781c5	Add weighting function, change the reward logic	2026-03-22 21:11:46 -04:00
ChenXi	a642274fa6	Amend symbol to save video memory	2026-03-22 03:05:24 -04:00
ChenXi	20c961936d	Amend symbol	2026-03-22 02:57:04 -04:00
ChenXi	0315b4cb99	prevent gradient explosion	2026-03-22 02:55:07 -04:00
ChenXi	616dd06e78	Amend success rewards	2026-03-22 02:32:58 -04:00
ChenXi	2e2d68a933	change the reward remove arm disturbance	2026-03-22 02:26:16 -04:00
ChenXi	f7c8e6e325	Amend bugs	2026-03-22 02:20:17 -04:00
ChenXi	a8199fd056	Amend arm push reward	2026-03-22 02:19:29 -04:00
ChenXi	0e70d34e81	Amend bugs	2026-03-22 00:01:21 -04:00
ChenXi	905e998596	change model	2026-03-21 23:46:59 -04:00
ChenXi	4833ba33c8	change parameter	2026-03-21 10:16:01 -04:00
ChenXi	fd8238dc41	Amend arm reward to get reward difficultly	2026-03-21 09:30:43 -04:00
ChenXi	72a22bd78a	change arm to push the ground reward function	2026-03-21 08:38:17 -04:00
ChenXi	d78fdeda0d	change reward function	2026-03-21 07:00:49 -04:00
ChenXi	6d2ad9846a	change parameter	2026-03-20 10:51:07 -04:00
ChenXi	1fbc9dccac	change parameter	2026-03-20 09:53:34 -04:00
ChenXi	49da77db51	change parameter	2026-03-20 08:55:29 -04:00
ChenXi	c0088ebac3	Amend tiny bug	2026-03-20 08:12:08 -04:00
ChenXi	00d3be8e7a	Amend tiny bug	2026-03-20 08:00:51 -04:00
ChenXi	ad2255bc18	change parameter	2026-03-20 07:06:42 -04:00
ChenXi	14f2151014	Amend bugs	2026-03-20 07:03:41 -04:00
ChenXi	31a9fa9965	change T1EventCfg to add more initial state	2026-03-20 05:20:17 -04:00
ChenXi	2ae7210062	Amend for standing	2026-03-20 03:37:56 -04:00
ChenXi	9cfc127694	Amend bug	2026-03-19 09:36:32 -04:00
ChenXi	af3ba4704f	Add feet_airtime loss	2026-03-19 09:25:20 -04:00
ChenXi	5df147b0b1	Add arm link rewards	2026-03-19 09:08:57 -04:00
ChenXi	6ca671dce5	change rewards	2026-03-19 06:29:30 -04:00
ChenXi	d4089b103e	change init nums	2026-03-18 06:36:40 -04:00
ChenXi	118d39f4bc	change env num	2026-03-18 06:32:06 -04:00
ChenXi	fdfd962fbc	Amend a tiny bug	2026-03-18 06:18:29 -04:00
ChenXi	08d1bb539b	Amend a tiny bug	2026-03-18 06:11:30 -04:00
ChenXi	9f3ec9d67a	Amend some codes to init training for get up better	2026-03-18 06:05:30 -04:00
ChenXi	4933567ef8	change reward add punishment of joint_vel and root_vel_z_penalty	2026-03-17 05:54:20 -04:00
ChenXi	c1e3d9382f	Add reward to maintain an upright and stable position	2026-03-16 09:23:22 -04:00
ChenXi	6510cb0bfc	Amend some bugs and make it training	2026-03-16 05:46:49 -04:00
ChenXi	4b0b1fac8d	The demo of get up	2026-03-16 05:00:20 -04:00