prevent gradient explosion

2026-03-22 02:55:07 -04:00
parent 616dd06e78
commit 0315b4cb99
2 changed files with 36 additions and 25 deletions
--- a/rl_game/get_up/config/ppo_cfg.yaml
+++ b/rl_game/get_up/config/ppo_cfg.yaml
@@ -17,7 +17,7 @@ params:
          name: default
        sigma_init:
          name: const_initializer
-          val: 1.2
+          val: 0.5
        fixed_sigma: False
    mlp:
      units: [512, 256, 128]
@@ -39,21 +39,21 @@ params:
    reward_shaper:
      scale_value: 1.0
    normalize_advantage: True
-    gamma: 0.98
+    gamma: 0.96
    tau: 0.95
-    learning_rate: 5e-4
+    learning_rate: 3e-4
    lr_schedule: adaptive
-    kl_threshold: 0.008
+    kl_threshold: 0.015
    score_to_win: 20000
    max_epochs: 500
    save_best_after: 50
    save_frequency: 100
-    grad_norm: 0.5
+    grad_norm: 1.0
-    entropy_coef: 0.008
+    entropy_coef: 0.005
    truncate_grads: True
    bounds_loss_coef: 0.001
    e_clip: 0.2
-    horizon_length: 128
+    horizon_length: 65536
    minibatch_size: 8192
    mini_epochs: 4
    critic_coef: 1
--- a/rl_game/get_up/config/t1_env_cfg.py
+++ b/rl_game/get_up/config/t1_env_cfg.py
@@ -26,27 +26,35 @@ def standing_with_feet_reward(
        force_threshold: float = 20.0,
        max_v_z: float = 0.5
 ) -> torch.Tensor:
-
+    # 增加防护：从场景中安全获取 body 索引
    head_idx, _ = env.scene["robot"].find_bodies("H2")
    pelvis_idx, _ = env.scene["robot"].find_bodies("Trunk")
-    curr_head_h = torch.clamp(env.scene["robot"].data.body_state_w[:, head_idx[0], 2], 0.0, 2.0)
+    # 1. 高度奖励：使用更稳定的归一化，限制范围在 [0, 1]
-    curr_pelvis_h = torch.clamp(env.scene["robot"].data.body_state_w[:, pelvis_idx[0], 2], 0.0, 2.0)
+    curr_head_h = env.scene["robot"].data.body_state_w[:, head_idx[0], 2]
    curr_pelvis_h = env.scene["robot"].data.body_state_w[:, pelvis_idx[0], 2]
-    head_score = torch.tanh(curr_head_h / (min_head_height + 1e-6) * 2.0)
+    # 使用 sigmoid 或简单的 min-max 映射，避免除以极小值
-    pelvis_score = torch.tanh(curr_pelvis_h / (min_pelvis_height + 1e-6) * 2.0)
+    head_score = torch.clamp(curr_head_h / min_head_height, 0.0, 1.2)
    pelvis_score = torch.clamp(curr_pelvis_h / min_pelvis_height, 0.0, 1.2)
    height_reward = (head_score + pelvis_score) / 2.0
    # 2. 足部受力：增加对 NaN 的防御
    contact_sensor = env.scene.sensors.get(sensor_cfg.name)
    # 某些步数传感器可能未初始化，加个判空
    if contact_sensor is None: return torch.zeros(env.num_envs, device=env.device)
    foot_forces_z = torch.sum(contact_sensor.data.net_forces_w[:, :, 2], dim=-1)
    # 对巨大的冲击力做剪裁，防止 sigmoid 输入过大
    foot_forces_z = torch.clamp(foot_forces_z, 0.0, 500.0)
    force_weight = torch.sigmoid((foot_forces_z - force_threshold) / 5.0)
    # 3. 垂直速度惩罚：使用更平滑的惩罚
    root_vel_z = env.scene["robot"].data.root_lin_vel_w[:, 2]
-    vel_penalty = torch.exp(-2.0 * torch.clamp(torch.abs(root_vel_z) - max_v_z, min=0.0))
+    vel_penalty = torch.exp(-torch.abs(root_vel_z) / max_v_z)
-    influence_weight = torch.clamp((curr_pelvis_h - 0.2) / 0.4, min=0.0, max=1.0)
+    # 逻辑组合：高度 * 稳定性
-    combined_reward = height_reward * ((1.0 - influence_weight) + influence_weight * force_weight * vel_penalty)
+    return height_reward * (0.5 + 0.5 * force_weight * vel_penalty)
    return combined_reward
 def universal_arm_support_reward(
@@ -138,15 +146,18 @@ def is_standing_still(
 # --- 2. 配置类 ---
 T1_JOINT_NAMES = [
-    # 腿部
+
    'Head_yaw', 'Head_pitch'
    'Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw',
    'Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw',
    'Waist'
    'Left_Hip_Pitch', 'Right_Hip_Pitch', 'Left_Hip_Roll', 'Right_Hip_Roll',
    'Left_Hip_Yaw', 'Right_Hip_Yaw', 'Left_Knee_Pitch', 'Right_Knee_Pitch',
    'Left_Ankle_Pitch', 'Right_Ankle_Pitch', 'Left_Ankle_Roll', 'Right_Ankle_Roll',
-    # 手臂
+
    'Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw',
    'Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw',
    # 腰部
    'Waist'
 ]
@@ -180,7 +191,7 @@ class T1EventCfg:
                "yaw": (-3.14, 3.14), # 全向旋转
                "x": (0.0, 0.0),
                "y": (0.0, 0.0),
-                "z": (0.1, 0.2),
+                "z": (0.3, 0.4),
            },
            "velocity_range": {},
        },
@@ -244,7 +255,7 @@ class T1GetUpRewardCfg:
    # 6. 成功终极大奖
    is_success = RewTerm(
        func=is_standing_still,
-        weight=2000.0,
+        weight=800.0,
        params={
            "min_head_height": 1.05,
            "min_pelvis_height": 0.75,