modify the reward and change the get_up logic

2026-03-23 09:06:36 -04:00
parent af42087bd8
commit 4bc205399c


@@ -1,7 +1,7 @@
import random
import numpy
import numpy as np
import torch
import random
import numpy as np
import isaaclab.envs.mdp as mdp
from isaaclab.assets import ArticulationCfg
from isaaclab.envs import ManagerBasedRLEnvCfg, ManagerBasedRLEnv
from isaaclab.managers import ObservationGroupCfg as ObsGroup
@@ -13,7 +13,6 @@ from isaaclab.envs.mdp import JointPositionActionCfg
from isaaclab.managers import SceneEntityCfg
from isaaclab.utils import configclass
from rl_game.get_up.env.t1_env import T1SceneCfg
import isaaclab.envs.mdp as mdp
# --- 1. Custom MDP logic functions ---
@@ -52,72 +51,70 @@ def standing_with_feet_reward(
return height_reward * (0.5 + 0.5 * force_weight * vel_penalty)
def arm_tuck_incremental_reward(
env: ManagerBasedRLEnv,
pitch_threshold: float = 1.4,
shaping_weight: float = 0.2
) -> torch.Tensor:
"""增量式收手奖励:鼓励向弯曲方向运动,达到阈值给大奖"""
joint_names = ["Left_Elbow_Pitch", "Right_Elbow_Pitch"]
joint_ids, _ = env.scene["robot"].find_joints(joint_names)
elbow_pos = env.scene["robot"].data.joint_pos[:, joint_ids]
elbow_vel = env.scene["robot"].data.joint_vel[:, joint_ids]
# 1. Velocity shaping: small reward while flexing (positive velocity), penalty while extending
avg_vel = torch.mean(elbow_vel, dim=-1)
shaping_reward = torch.tanh(avg_vel) * shaping_weight
# 2. Threshold trigger: once fully tucked, grant a stable static bonus
is_tucked = torch.all(elbow_pos > pitch_threshold, dim=-1).float()
goal_bonus = is_tucked * 1.5
return shaping_reward + goal_bonus
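# Worked example (illustrative, not part of the diff): at a mean elbow velocity of
# +1.0 rad/s the shaping term is tanh(1.0) * 0.2 ≈ 0.15, and once both elbows pass
# 1.4 rad the static bonus adds a further 1.5 per step, so holding the tucked pose
# dominates the velocity shaping.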
def dynamic_getup_strategy_reward(env: ManagerBasedRLEnv) -> torch.Tensor:
"""
State-machine reward switching logic
- While supine: focus on rolling over + tucking the arms
- While prone: focus on pushing off the ground to stand up
Posture-symmetric get-up strategy
1. Core crouch (spring loading): supine or prone, while the body is low the legs must be pulled in
2. Supine push (back-pushing): while supine, encourage the arms to push backward and lift the pelvis
3. Coordinated burst (explosive jump): upward momentum generated from the crouch earns the highest reward multiplier.
"""
# Projected-gravity Z component; > 0 means supine
gravity_z = env.scene["robot"].data.projected_gravity_b[:, 2]
# --- 1. Read the physical state ---
gravity_z = env.scene["robot"].data.projected_gravity_b[:, 2]  # 1: supine, -1: prone
pelvis_idx, _ = env.scene["robot"].find_bodies("Trunk")
curr_pelvis_h = env.scene["robot"].data.body_state_w[:, pelvis_idx[0], 2]
root_vel_z = env.scene["robot"].data.root_lin_vel_w[:, 2]
# State masks
is_on_back = (gravity_z > 0.2).float()
is_on_belly = (gravity_z < -0.2).float()
is_transition = (1.0 - is_on_back - is_on_belly)
# Joint indices: 11,12 = hips, 17,18 = knees (make sure these match the T1 model)
knee_joints = [17, 18]
hip_pitch_joints = [11, 12]
joint_pos = env.scene["robot"].data.joint_pos
# 1. Roll-over shaping: drive gravity_z toward -1.0
flip_shaping = torch.clamp(-gravity_z, min=-1.0, max=1.0)
# --- 2. Core crouch score ---
# Supine or prone, crouching is the absolute prerequisite for standing up. The goal is to bring the feet as close to the center of mass as possible.
# The knee flexion target is raised to 1.5 rad to guide a deeper fold
knee_flex_err = torch.abs(joint_pos[:, knee_joints] - 1.5).sum(dim=-1)
hip_flex_err = torch.abs(joint_pos[:, hip_pitch_joints] - 1.2).sum(dim=-1)
crouch_score = torch.exp(-(knee_flex_err + hip_flex_err) * 0.6)
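# Illustrative values (added for clarity): the kernel peaks at 1.0 when the knees sit
# at 1.5 rad and the hips at 1.2 rad; a combined error of 1.0 rad decays it to
# exp(-0.6) ≈ 0.55.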
# 2. Arm tuck
tuck_rew = arm_tuck_incremental_reward(env)
# Base crouch reward (spring base), weight increased
crouch_trigger = torch.clamp(0.6 - curr_pelvis_h, min=0.0)
base_crouch_reward = crouch_trigger * crouch_score * 40.0
# 3. Ground push (reuses the original logic, but the internal height decay is removed; the state machine handles it uniformly)
# --- 3. Support-force reward ---
push_reward = torch.zeros_like(curr_pelvis_h)
contact_sensor = env.scene.sensors.get("contact_sensor")
max_arm_force = torch.zeros(env.num_envs, device=env.device)
if contact_sensor is not None:
# Assumed forces on the arm/hand links
# Monitor the forces on the non-foot links (hands, arms)
# Face up or face down, any hand that can provide a vertical upward push is a good hand
arm_forces_z = contact_sensor.data.net_forces_w[:, :, 2]
max_arm_force = torch.max(arm_forces_z, dim=-1)[0]
push_reward = torch.tanh(torch.max(arm_forces_z, dim=-1)[0] / 30.0)
push_rew = torch.tanh(torch.clamp(max_arm_force - 15.0, min=0.0) / 40.0)
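# Illustrative values (added for clarity): forces inside the 15 N deadband give zero;
# a 55 N push yields tanh((55 - 15) / 40) = tanh(1.0) ≈ 0.76 before weighting.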
# --- 4. Posture-specific guidance (orientation-neutral) ---
is_back = torch.clamp(gravity_z, min=0.0)  # degree of supine
is_belly = torch.clamp(-gravity_z, min=0.0)  # degree of prone
# --- Dynamic weight blending ---
# Supine zone: roll over (8.0) + arm tuck (4.0)
back_strategy = is_on_back * (8.0 * flip_shaping + 4.0 * tuck_rew)
# A. Direct supine get-up logic:
# While supine, heavily reward pushing the pelvis up (increasing curr_pelvis_h)
# Combined with crouch_score, this encourages the tuck-legs / push-ground / thrust-hips action chain
back_lift_reward = is_back * torch.clamp(curr_pelvis_h - 0.15, min=0.0) * crouch_score * 50.0
# Prone zone: ground push (25.0) + maintained arm tuck (1.0)
# The push weight far exceeds the roll-over weight here, so the robot prefers to stay prone and attempt to stand
belly_strategy = is_on_belly * (25.0 * push_rew + 1.0 * tuck_rew)
# B. Prone / roll-over assist logic (keeps some roll-over tendency, but it is no longer the only path)
flip_reward = is_back * (1.0 - gravity_z) * 5.0  # weight reduced; fallback only
# Transition zone
trans_strategy = is_transition * (4.0 * flip_shaping + 10.0 * push_rew + 2.0 * tuck_rew)
# --- 5. Final burst term (the jump) ---
# Core formula: crouch depth * upward velocity * sensed support force
# A generic "jump-up" reward: face up or face down, tuck tight, rise fast, and push with the hands, and this reward explodes
explosion_reward = crouch_score * torch.clamp(root_vel_z, min=0.0) * (0.5 + 0.5 * push_reward) * 80.0
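# Illustrative values (added for clarity): a fully crouched robot (crouch_score 1.0)
# rising at 0.5 m/s with a saturated hand push (push_reward 1.0) earns
# 1.0 * 0.5 * (0.5 + 0.5) * 80 = 40 from this term.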
return back_strategy + belly_strategy + trans_strategy
# --- 6. Sum up ---
total_reward = (
base_crouch_reward +  # legs must tuck
back_lift_reward +  # supine hip thrust
flip_reward +  # roll-over attempt
explosion_reward  # final burst
)
return total_reward
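# Composition note (illustrative): every component above is a per-env tensor of shape
# (num_envs,). For example, a pelvis at 0.3 m with a perfect crouch earns a base term
# of clamp(0.6 - 0.3) * 1.0 * 40.0 = 12.0 before the other terms are added.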
def is_standing_still(
env: ManagerBasedRLEnv,
@@ -127,7 +124,6 @@ def is_standing_still(
standing_time: float,
velocity_threshold: float = 0.15
) -> torch.Tensor:
"""判定逻辑:双高度达标 + 躯干垂直 + 全身静止"""
head_idx, _ = env.scene["robot"].find_bodies("H2")
pelvis_idx, _ = env.scene["robot"].find_bodies("Trunk")
@@ -193,7 +189,7 @@ class T1EventCfg:
"asset_cfg": SceneEntityCfg("robot"),
"pose_range": {
"roll": (-1.57, 1.57),
"pitch": tuple(numpy.array([1.4, 1.6], dtype=np.float32) * random.choice([-1 , 1])), # 仰卧/俯卧
"pitch": tuple(np.array([1.4, 1.6], dtype=np.float32) * random.choice([-1 , 1])),
"yaw": (-3.14, 3.14),
"x": (0.0, 0.0),
"y": (0.0, 0.0),
@@ -207,50 +203,88 @@ class T1EventCfg:
@configclass
class T1ActionCfg:
joint_pos = JointPositionActionCfg(
asset_name="robot", joint_names=T1_JOINT_NAMES, scale=0.5, use_default_offset=True
# Split the actions into groups to prevent twitching. Since no specific motion is prescribed, each body part can be given a fairly balanced exploration range.
arm_action = JointPositionActionCfg(
asset_name="robot",
joint_names=[
'Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw',
'Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw'
],
scale=1.2,  # give the arms relatively generous freedom to explore
use_default_offset=True
)
torso_action = JointPositionActionCfg(
asset_name="robot",
joint_names=['Waist', 'AAHead_yaw', 'Head_pitch'],
scale=0.8,
use_default_offset=True
)
leg_action = JointPositionActionCfg(
asset_name="robot",
joint_names=[
'Left_Hip_Pitch', 'Right_Hip_Pitch', 'Left_Hip_Roll', 'Right_Hip_Roll',
'Left_Hip_Yaw', 'Right_Hip_Yaw', 'Left_Knee_Pitch', 'Right_Knee_Pitch',
'Left_Ankle_Pitch', 'Right_Ankle_Pitch', 'Left_Ankle_Roll', 'Right_Ankle_Roll'
],
scale=0.6,
use_default_offset=True
)
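# Scaling note (illustrative, based on Isaac Lab's documented behavior): with
# use_default_offset=True, JointPositionAction maps a raw policy action a to a joint
# position target of roughly default_joint_pos + scale * a, so scale=1.2 on the arms
# permits wider targets than scale=0.6 on the legs.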
@configclass
class T1GetUpRewardCfg:
# --- 1. Integrated dynamic-strategy reward (covers the switching between roll-over, arm-tuck, and push logic) ---
adaptive_strategy = RewTerm(
# 1. Core staged guidance (roll over -> crouch -> support)
dynamic_strategy = RewTerm(
func=dynamic_getup_strategy_reward,
weight=1.0  # fine-grained sub-weights live inside the function
weight=1.5
)
# --- 2. Core height objective (kept at top priority) ---
# 2. Standing-quality reward (reinforces loading on both feet)
height_with_feet = RewTerm(
func=standing_with_feet_reward,
weight=15.0,
weight=40.0,  # large weight
params={
"min_head_height": 1.1,
"min_pelvis_height": 0.7,
"sensor_cfg": SceneEntityCfg("contact_sensor", body_names=[".*_foot_link"]),
"force_threshold": 30.0,
"max_v_z": 0.3
"force_threshold": 40.0, # 必须达到一定压力,防止脚尖点地作弊
"max_v_z": 0.2
}
)
# --- 3. Auxiliary constraints and penalties ---
upright = RewTerm(func=mdp.flat_orientation_l2, weight=1.0)
joint_limits = RewTerm(func=mdp.joint_pos_limits, weight=-20.0, params={"asset_cfg": SceneEntityCfg("robot")})
action_rate = RewTerm(func=mdp.action_rate_l2, weight=-0.01)
# --- 4. Success bonus ---
is_success_bonus = RewTerm(
func=is_standing_still,
weight=1000.0,
# 3. Penalty terms: prevent exploits
# Heavy penalty: large deduction if the trunk (Trunk) or head (H2) contacts the ground directly
body_contact_penalty = RewTerm(
func=mdp.contact_forces,
weight=-20.0,
params={
"min_head_height": 1.05,
"min_pelvis_height": 0.75,
"max_angle_error": 0.3,
"standing_time": 0.2,
"velocity_threshold": 0.5
"sensor_cfg": SceneEntityCfg("contact_sensor", body_names=["Trunk", "H2"]),
"threshold": 1.0
}
)
# 4. Actuation-cost penalty (prevents high-frequency twitching)
action_rate = RewTerm(
func=mdp.action_rate_l2,
weight=-0.01
)
# 5. Sustained-success reward
is_success_maintain = RewTerm(
func=is_standing_still,
weight=1000.0,  # huge success bonus
params={
"min_head_height": 1.08,
"min_pelvis_height": 0.72,
"max_angle_error": 0.2,
"standing_time": 0.4, # 必须站稳 0.4s
"velocity_threshold": 0.3
}
)
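# Weighting note (illustrative, based on Isaac Lab's RewardManager): the per-step
# reward is accumulated as term(env, **params) * weight * dt, so the magnitudes above
# are additionally scaled by the env step time (decimation * physics dt).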
@configclass
class T1GetUpTerminationsCfg:
time_out = DoneTerm(func=mdp.time_out)
@@ -270,15 +304,11 @@ class T1GetUpTerminationsCfg:
class T1EnvCfg(ManagerBasedRLEnvCfg):
scene = T1SceneCfg(num_envs=8192, env_spacing=2.5)
def __post_init__(self):
super().__post_init__()
self.scene.robot.init_state.pos = (0.0, 0.0, 0.2)
observations = T1ObservationCfg()
rewards = T1GetUpRewardCfg()
terminations = T1GetUpTerminationsCfg()
events = T1EventCfg()
actions = T1ActionCfg()
episode_length_s = 6.0
episode_length_s = 10.0
decimation = 4
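
A minimal smoke test for this config might look like the sketch below. It is illustrative only: the module path rl_game.get_up.env.t1_get_up_env is a guess (the real path of the edited file is not shown in this diff), and it assumes Isaac Lab's standard AppLauncher workflow rather than anything from this repo.

from isaaclab.app import AppLauncher

# Isaac Sim must be running before env classes are used to build a scene
app = AppLauncher(headless=True).app

import torch
from isaaclab.envs import ManagerBasedRLEnv
from rl_game.get_up.env.t1_get_up_env import T1EnvCfg  # hypothetical module path

cfg = T1EnvCfg()
cfg.scene.num_envs = 16  # shrink the scene for a quick check
env = ManagerBasedRLEnv(cfg=cfg)
obs, _ = env.reset()
for _ in range(10):
    # uniform random actions in [-1, 1] across all three action groups
    actions = 2.0 * torch.rand(
        env.num_envs, env.action_manager.total_action_dim, device=env.device
    ) - 1.0
    obs, rew, terminated, truncated, extras = env.step(actions)
env.close()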