modify the reward and change the get_up logic

2026-03-23 09:06:36 -04:00
parent af42087bd8
commit 4bc205399c


@@ -1,7 +1,7 @@
import random
import numpy
import numpy as np
import torch
import random
import numpy as np
import isaaclab.envs.mdp as mdp
from isaaclab.assets import ArticulationCfg
from isaaclab.envs import ManagerBasedRLEnvCfg, ManagerBasedRLEnv
from isaaclab.managers import ObservationGroupCfg as ObsGroup
@@ -13,7 +13,6 @@ from isaaclab.envs.mdp import JointPositionActionCfg
from isaaclab.managers import SceneEntityCfg
from isaaclab.utils import configclass
from rl_game.get_up.env.t1_env import T1SceneCfg
import isaaclab.envs.mdp as mdp
# --- 1. Custom MDP logic functions ---
@@ -52,72 +51,70 @@ def standing_with_feet_reward(
return height_reward * (0.5 + 0.5 * force_weight * vel_penalty)
def arm_tuck_incremental_reward(
env: ManagerBasedRLEnv,
pitch_threshold: float = 1.4,
shaping_weight: float = 0.2
) -> torch.Tensor:
"""增量式收手奖励:鼓励向弯曲方向运动,达到阈值给大奖"""
joint_names = ["Left_Elbow_Pitch", "Right_Elbow_Pitch"]
joint_ids, _ = env.scene["robot"].find_joints(joint_names)
elbow_pos = env.scene["robot"].data.joint_pos[:, joint_ids]
elbow_vel = env.scene["robot"].data.joint_vel[:, joint_ids]
# 1. Velocity shaping: small reward while flexing (positive velocity), penalty while extending
avg_vel = torch.mean(elbow_vel, dim=-1)
shaping_reward = torch.tanh(avg_vel) * shaping_weight
# 2. Threshold trigger: once fully tucked, grant a stable static bonus
is_tucked = torch.all(elbow_pos > pitch_threshold, dim=-1).float()
goal_bonus = is_tucked * 1.5
return shaping_reward + goal_bonus
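# Worked example (illustrative, not part of the diff): at a mean elbow velocity of
# +1.0 rad/s the shaping term is tanh(1.0) * 0.2 ≈ 0.15, and once both elbows pass
# 1.4 rad the static bonus adds a further 1.5 per step, so holding the tucked pose
# dominates the velocity shaping.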
def dynamic_getup_strategy_reward(env: ManagerBasedRLEnv) -> torch.Tensor:
"""
State-machine reward switching logic
- While supine: focus on rolling over + tucking the arms
- While prone: focus on pushing off the ground to stand up
Posture-symmetric get-up strategy
1. Core crouch (spring loading): supine or prone, while the body is low the legs must be pulled in
2. Supine push (back-pushing): while supine, encourage the arms to push backward and lift the pelvis
3. Coordinated burst (explosive jump): upward momentum generated from the crouch earns the highest reward multiplier.
"""
# Projected-gravity Z component; > 0 means supine
gravity_z = env.scene["robot"].data.projected_gravity_b[:, 2]
# --- 1. Read the physical state ---
gravity_z = env.scene["robot"].data.projected_gravity_b[:, 2]  # 1: supine, -1: prone
pelvis_idx, _ = env.scene["robot"].find_bodies("Trunk")
curr_pelvis_h = env.scene["robot"].data.body_state_w[:, pelvis_idx[0], 2]
root_vel_z = env.scene["robot"].data.root_lin_vel_w[:, 2]
# State masks
is_on_back = (gravity_z > 0.2).float()
is_on_belly = (gravity_z < -0.2).float()
is_transition = (1.0 - is_on_back - is_on_belly)
# Joint indices: 11,12 = hips, 17,18 = knees (make sure these match the T1 model)
knee_joints = [17, 18]
hip_pitch_joints = [11, 12]
joint_pos = env.scene["robot"].data.joint_pos
# 1. Roll-over shaping: drive gravity_z toward -1.0
flip_shaping = torch.clamp(-gravity_z, min=-1.0, max=1.0)
# --- 2. Core crouch score ---
# Supine or prone, crouching is the absolute prerequisite for standing up. The goal is to bring the feet as close to the center of mass as possible.
# The knee flexion target is raised to 1.5 rad to guide a deeper fold
knee_flex_err = torch.abs(joint_pos[:, knee_joints] - 1.5).sum(dim=-1)
hip_flex_err = torch.abs(joint_pos[:, hip_pitch_joints] - 1.2).sum(dim=-1)
crouch_score = torch.exp(-(knee_flex_err + hip_flex_err) * 0.6)
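# Illustrative values (added for clarity): the kernel peaks at 1.0 when the knees sit
# at 1.5 rad and the hips at 1.2 rad; a combined error of 1.0 rad decays it to
# exp(-0.6) ≈ 0.55.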
# 2. Arm tuck
tuck_rew = arm_tuck_incremental_reward(env)
# Base crouch reward (spring base), weight increased
crouch_trigger = torch.clamp(0.6 - curr_pelvis_h, min=0.0)
base_crouch_reward = crouch_trigger * crouch_score * 40.0
# 3. Ground push (reuses the original logic, but the internal height decay is removed; the state machine handles it uniformly)
# --- 3. Support-force reward ---
push_reward = torch.zeros_like(curr_pelvis_h)
contact_sensor = env.scene.sensors.get("contact_sensor")
max_arm_force = torch.zeros(env.num_envs, device=env.device)
if contact_sensor is not None:
# Assumed forces on the arm/hand links
# Monitor the forces on the non-foot links (hands, arms)
# Face up or face down, any hand that can provide a vertical upward push is a good hand
arm_forces_z = contact_sensor.data.net_forces_w[:, :, 2]
max_arm_force = torch.max(arm_forces_z, dim=-1)[0]
push_reward = torch.tanh(torch.max(arm_forces_z, dim=-1)[0] / 30.0)
push_rew = torch.tanh(torch.clamp(max_arm_force - 15.0, min=0.0) / 40.0)
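# Illustrative values (added for clarity): forces inside the 15 N deadband give zero;
# a 55 N push yields tanh((55 - 15) / 40) = tanh(1.0) ≈ 0.76 before weighting.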
# --- 4. Posture-specific guidance (orientation-neutral) ---
is_back = torch.clamp(gravity_z, min=0.0)  # degree of supine
is_belly = torch.clamp(-gravity_z, min=0.0)  # degree of prone
# --- Dynamic weight blending ---
# Supine zone: roll over (8.0) + arm tuck (4.0)
back_strategy = is_on_back * (8.0 * flip_shaping + 4.0 * tuck_rew)
# A. Direct supine get-up logic:
# While supine, heavily reward pushing the pelvis up (increasing curr_pelvis_h)
# Combined with crouch_score, this encourages the tuck-legs / push-ground / thrust-hips action chain
back_lift_reward = is_back * torch.clamp(curr_pelvis_h - 0.15, min=0.0) * crouch_score * 50.0
# Prone zone: ground push (25.0) + maintained arm tuck (1.0)
# The push weight far exceeds the roll-over weight here, so the robot prefers to stay prone and attempt to stand
belly_strategy = is_on_belly * (25.0 * push_rew + 1.0 * tuck_rew)
# B. Prone / roll-over assist logic (keeps some roll-over tendency, but it is no longer the only path)
flip_reward = is_back * (1.0 - gravity_z) * 5.0  # weight reduced; fallback only
# Transition zone
trans_strategy = is_transition * (4.0 * flip_shaping + 10.0 * push_rew + 2.0 * tuck_rew)
# --- 5. Final burst term (the jump) ---
# Core formula: crouch depth * upward velocity * sensed support force
# A generic "jump-up" reward: face up or face down, tuck tight, rise fast, and push with the hands, and this reward explodes
explosion_reward = crouch_score * torch.clamp(root_vel_z, min=0.0) * (0.5 + 0.5 * push_reward) * 80.0
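# Illustrative values (added for clarity): a fully crouched robot (crouch_score 1.0)
# rising at 0.5 m/s with a saturated hand push (push_reward 1.0) earns
# 1.0 * 0.5 * (0.5 + 0.5) * 80 = 40 from this term.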
return back_strategy + belly_strategy + trans_strategy
# --- 6. Sum up ---
total_reward = (
base_crouch_reward +  # legs must tuck
back_lift_reward +  # supine hip thrust
flip_reward +  # roll-over attempt
explosion_reward  # final burst
)
return total_reward
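# Composition note (illustrative): every component above is a per-env tensor of shape
# (num_envs,). For example, a pelvis at 0.3 m with a perfect crouch earns a base term
# of clamp(0.6 - 0.3) * 1.0 * 40.0 = 12.0 before the other terms are added.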
def is_standing_still(
env: ManagerBasedRLEnv,
@@ -127,7 +124,6 @@ def is_standing_still(
standing_time: float,
velocity_threshold: float = 0.15
) -> torch.Tensor:
"""判定逻辑:双高度达标 + 躯干垂直 + 全身静止"""
head_idx, _ = env.scene["robot"].find_bodies("H2")
pelvis_idx, _ = env.scene["robot"].find_bodies("Trunk")
@@ -193,7 +189,7 @@ class T1EventCfg:
"asset_cfg": SceneEntityCfg("robot"),
"pose_range": {
"roll": (-1.57, 1.57),
"pitch": tuple(numpy.array([1.4, 1.6], dtype=np.float32) * random.choice([-1 , 1])), # 仰卧/俯卧
"pitch": tuple(np.array([1.4, 1.6], dtype=np.float32) * random.choice([-1 , 1])),
"yaw": (-3.14, 3.14),
"x": (0.0, 0.0),
"y": (0.0, 0.0),
@@ -207,50 +203,88 @@ class T1EventCfg:
@configclass
class T1ActionCfg:
joint_pos = JointPositionActionCfg(
asset_name="robot", joint_names=T1_JOINT_NAMES, scale=0.5, use_default_offset=True
# Split the actions into groups to prevent twitching. Since no specific motion is prescribed, each body part can be given a fairly balanced exploration range.
arm_action = JointPositionActionCfg(
asset_name="robot",
joint_names=[
'Left_Shoulder_Pitch', 'Left_Shoulder_Roll', 'Left_Elbow_Pitch', 'Left_Elbow_Yaw',
'Right_Shoulder_Pitch', 'Right_Shoulder_Roll', 'Right_Elbow_Pitch', 'Right_Elbow_Yaw'
],
scale=1.2,  # give the arms relatively generous freedom to explore
use_default_offset=True
)
torso_action = JointPositionActionCfg(
asset_name="robot",
joint_names=['Waist', 'AAHead_yaw', 'Head_pitch'],
scale=0.8,
use_default_offset=True
)
leg_action = JointPositionActionCfg(
asset_name="robot",
joint_names=[
'Left_Hip_Pitch', 'Right_Hip_Pitch', 'Left_Hip_Roll', 'Right_Hip_Roll',
'Left_Hip_Yaw', 'Right_Hip_Yaw', 'Left_Knee_Pitch', 'Right_Knee_Pitch',
'Left_Ankle_Pitch', 'Right_Ankle_Pitch', 'Left_Ankle_Roll', 'Right_Ankle_Roll'
],
scale=0.6,
use_default_offset=True
)
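# Scaling note (illustrative, based on Isaac Lab's documented behavior): with
# use_default_offset=True, JointPositionAction maps a raw policy action a to a joint
# position target of roughly default_joint_pos + scale * a, so scale=1.2 on the arms
# permits wider targets than scale=0.6 on the legs.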
@configclass
class T1GetUpRewardCfg:
# --- 1. Integrated dynamic-strategy reward (covers the switching between roll-over, arm-tuck, and push logic) ---
adaptive_strategy = RewTerm(
# 1. Core staged guidance (roll over -> crouch -> support)
dynamic_strategy = RewTerm(
func=dynamic_getup_strategy_reward,
weight=1.0  # fine-grained sub-weights live inside the function
weight=1.5
)
# --- 2. Core height objective (kept at top priority) ---
# 2. Standing-quality reward (reinforces loading on both feet)
height_with_feet = RewTerm(
func=standing_with_feet_reward,
weight=15.0,
weight=40.0,  # large weight
params={
"min_head_height": 1.1,
"min_pelvis_height": 0.7,
"sensor_cfg": SceneEntityCfg("contact_sensor", body_names=[".*_foot_link"]),
"force_threshold": 30.0,
"max_v_z": 0.3
"force_threshold": 40.0, # 必须达到一定压力,防止脚尖点地作弊
"max_v_z": 0.2
}
)
# --- 3. Auxiliary constraints and penalties ---
upright = RewTerm(func=mdp.flat_orientation_l2, weight=1.0)
joint_limits = RewTerm(func=mdp.joint_pos_limits, weight=-20.0, params={"asset_cfg": SceneEntityCfg("robot")})
action_rate = RewTerm(func=mdp.action_rate_l2, weight=-0.01)
# --- 4. Success bonus ---
is_success_bonus = RewTerm(
func=is_standing_still,
weight=1000.0,
# 3. Penalty terms: prevent exploits
# Heavy penalty: large deduction if the trunk (Trunk) or head (H2) contacts the ground directly
body_contact_penalty = RewTerm(
func=mdp.contact_forces,
weight=-20.0,
params={
"min_head_height": 1.05,
"min_pelvis_height": 0.75,
"max_angle_error": 0.3,
"standing_time": 0.2,
"velocity_threshold": 0.5
"sensor_cfg": SceneEntityCfg("contact_sensor", body_names=["Trunk", "H2"]),
"threshold": 1.0
}
)
# 4. Actuation-cost penalty (prevents high-frequency twitching)
action_rate = RewTerm(
func=mdp.action_rate_l2,
weight=-0.01
)
# 5. Sustained-success reward
is_success_maintain = RewTerm(
func=is_standing_still,
weight=1000.0,  # huge success bonus
params={
"min_head_height": 1.08,
"min_pelvis_height": 0.72,
"max_angle_error": 0.2,
"standing_time": 0.4, # 必须站稳 0.4s
"velocity_threshold": 0.3
}
)
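# Weighting note (illustrative, based on Isaac Lab's RewardManager): the per-step
# reward is accumulated as term(env, **params) * weight * dt, so the magnitudes above
# are additionally scaled by the env step time (decimation * physics dt).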
@configclass
class T1GetUpTerminationsCfg:
time_out = DoneTerm(func=mdp.time_out)
@@ -270,15 +304,11 @@ class T1GetUpTerminationsCfg:
class T1EnvCfg(ManagerBasedRLEnvCfg):
scene = T1SceneCfg(num_envs=8192, env_spacing=2.5)
def __post_init__(self):
super().__post_init__()
self.scene.robot.init_state.pos = (0.0, 0.0, 0.2)
observations = T1ObservationCfg()
rewards = T1GetUpRewardCfg()
terminations = T1GetUpTerminationsCfg()
events = T1EventCfg()
actions = T1ActionCfg()
episode_length_s = 6.0
episode_length_s = 10.0
decimation = 4
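
A minimal smoke test for this config might look like the sketch below. It is illustrative only: the module path rl_game.get_up.env.t1_get_up_env is a guess (the real path of the edited file is not shown in this diff), and it assumes Isaac Lab's standard AppLauncher workflow rather than anything from this repo.

from isaaclab.app import AppLauncher

# Isaac Sim must be running before env classes are used to build a scene
app = AppLauncher(headless=True).app

import torch
from isaaclab.envs import ManagerBasedRLEnv
from rl_game.get_up.env.t1_get_up_env import T1EnvCfg  # hypothetical module path

cfg = T1EnvCfg()
cfg.scene.num_envs = 16  # shrink the scene for a quick check
env = ManagerBasedRLEnv(cfg=cfg)
obs, _ = env.reset()
for _ in range(10):
    # uniform random actions in [-1, 1] across all three action groups
    actions = 2.0 * torch.rand(
        env.num_envs, env.action_manager.total_action_dim, device=env.device
    ) - 1.0
    obs, rew, terminated, truncated, extras = env.step(actions)
env.close()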