Compare commits


3 Commits

Author  SHA1        Message                  Date
xxh     f99fae68f6  change model and policy  2026-03-14 01:08:22 -04:00
xxh     294fe0bd79  restore                  2026-03-13 21:44:59 -04:00
xxh     cf80becd17  restore                  2026-03-13 21:38:44 -04:00
3 changed files with 73 additions and 39 deletions


@@ -21,7 +21,7 @@ class Server():
             port = first_server_p + i
             mport = first_monitor_p + i
-            server_cmd = f"{cmd} --aport {port} --mport {mport} --no-render --no-realtime"
+            server_cmd = f"{cmd} --aport {port} --mport {mport} "
             self.rcss_processes.append(
                 subprocess.Popen(
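Note: judging only by the flag names (an assumption, since these options are project-specific), dropping --no-render --no-realtime re-enables rendering and real-time pacing for every spawned server, which will slow rollout collection; the f-string also now ends with a trailing space. A minimal sketch of the launch pattern, using shlex instead of shell parsing:

    import shlex
    import subprocess

    def launch_server(cmd, port, mport):
        # Hypothetical helper mirroring the diff's Popen call; the flags
        # are assumed to mean what their names suggest.
        server_cmd = f"{cmd} --aport {port} --mport {mport}"
        return subprocess.Popen(shlex.split(server_cmd))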


@@ -171,8 +171,8 @@ class Train_Base():
                 ep_reward += reward
                 ep_length += 1
-                if enable_FPS_control: # control simulation speed (using non blocking user input)
-                    self.control_fps(select.select([sys.stdin], [], [], 0)[0])
+                # if enable_FPS_control: # control simulation speed (using non blocking user input)
+                #     self.control_fps(select.select([sys.stdin], [], [], 0)[0])
                 if done:
                     obs, _ = env.reset()
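The lines commented out here throttled training speed from the keyboard via a non-blocking poll of stdin; disabling them removes the per-step select() call, so the loop runs as fast as the simulator responds. For reference, a minimal sketch of the non-blocking-input pattern the removed code relied on:

    import select
    import sys

    def poll_stdin():
        # select() with timeout 0 returns immediately; stdin is only read
        # when input is already waiting, so the loop never blocks.
        ready, _, _ = select.select([sys.stdin], [], [], 0)
        return sys.stdin.readline().strip() if ready else None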


@@ -57,7 +57,7 @@ class WalkEnv(gym.Env):
         self._target_hz = 0.0
         self._target_dt = 0.0
         self._last_sync_time = None
-        target_hz_env = 24
+        target_hz_env = 1000
         if target_hz_env:
             try:
                 self._target_hz = float(target_hz_env)
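Raising target_hz_env from 24 to 1000 shrinks the per-step wall-clock budget from roughly 41.7 ms to 1 ms, which in practice disables real-time pacing. A sketch of the presumed pacing logic (the helper name and the use of _last_sync_time are assumptions based on the fields initialized above):

    import time

    def _sync_to_rate(self):
        # Hypothetical: sleep so consecutive steps are >= self._target_dt
        # apart. With _target_hz = 1000 the 1 ms budget is almost always
        # already consumed by the step itself, so this rarely sleeps.
        now = time.perf_counter()
        if self._last_sync_time is not None:
            remaining = self._target_dt - (now - self._last_sync_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_sync_time = time.perf_counter()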
@@ -80,8 +80,8 @@ class WalkEnv(gym.Env):
         action_dim = len(self.Player.robot.ROBOT_MOTORS)
         self.no_of_actions = action_dim
         self.action_space = spaces.Box(
-            low=-1.0,
-            high=1.0,
+            low=-10.0,
+            high=10.0,
             shape=(action_dim,),
             dtype=np.float32
         )
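Widening the Box bounds from ±1.0 to ±10.0 matters because stable-baselines3 clips sampled actions to the space's low/high before calling env.step, so the policy's effective authority grows tenfold. Assuming targets are applied as nominal + scaling_factor * action (suggested by the scaling_factor set below), a quick sanity check:

    scaling_factor = 0.5
    old_max_offset = scaling_factor * 1.0    # +/-0.5 joint units with low/high = +/-1
    new_max_offset = scaling_factor * 10.0   # +/-5.0 joint units with low/high = +/-10
    print(old_max_offset, new_max_offset)    # 0.5 5.0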
@@ -115,17 +115,17 @@ class WalkEnv(gym.Env):
             ]
         )
         self.reference_joint_nominal_position = self.joint_nominal_position.copy()
+        self.joint_nominal_position = np.zeros(self.no_of_actions)
         self.train_sim_flip = np.array(
             [
                 1.0,   # 0: Head_yaw (he1)
                 -1.0,  # 1: Head_pitch (he2)
                 1.0,   # 2: Left_Shoulder_Pitch (lae1)
                 -1.0,  # 3: Left_Shoulder_Roll (lae2)
-                1.0,   # 4: Left_Elbow_Pitch (lae3)
+                -1.0,  # 4: Left_Elbow_Pitch (lae3)
                 1.0,   # 5: Left_Elbow_Yaw (lae4)
                 -1.0,  # 6: Right_Shoulder_Pitch (rae1)
-                1.0,   # 7: Right_Shoulder_Roll (rae2)
+                -1.0,  # 7: Right_Shoulder_Roll (rae2)
                 1.0,   # 8: Right_Elbow_Pitch (rae3)
                 1.0,   # 9: Right_Elbow_Yaw (rae4)
                 1.0,   # 10: Waist (te1)
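train_sim_flip looks like a per-joint sign mask reconciling the training convention with the simulator's joint axes; this commit flips Left_Elbow_Pitch (lae3) and Right_Shoulder_Roll (rae2). A sketch of how such a mask is typically applied (the function name is illustrative, not from the source):

    import numpy as np

    def to_sim_convention(action, flip):
        # Element-wise sign flip: same magnitudes, mirrored axes wherever
        # the mask holds -1.0.
        return np.asarray(action, dtype=np.float32) * flip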
@@ -145,8 +145,10 @@ class WalkEnv(gym.Env):
         )
         self.scaling_factor = 0.5
+        # self.scaling_factor = 1
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Track previous position
         self.Player.server.connect()
         # sleep(2.0)  # Longer wait for connection to establish completely
@@ -320,6 +322,7 @@ class WalkEnv(gym.Env):
         self.waypoint_index = 0
         self.route_completed = False
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Initialize for first step
         self.walk_cycle_step = 0
@@ -391,35 +394,60 @@ class WalkEnv(gym.Env):
         return

     def compute_reward(self, previous_pos, current_pos, action):
+        eps = 1e-6
+        dt = 0.05
         velocity = current_pos - previous_pos
-        velocity_magnitude = np.linalg.norm(velocity)
+        speed_step = float(np.linalg.norm(velocity))
+        speed = speed_step / dt
         direction_to_target = self.target_position - current_pos
         prev_direction_to_target = self.target_position - previous_pos
-        distance_to_target = np.linalg.norm(direction_to_target)
-        prev_distance_to_target = np.linalg.norm(prev_direction_to_target)
-        progress_reward = np.clip((prev_distance_to_target - distance_to_target) * 30.0, -2.0, 4.0)
-        velocity_in_m_per_sec = velocity_magnitude / 0.05
-        speed_reward = np.clip(velocity_in_m_per_sec * 1.5, 0.0, 1.5)
-        if velocity_magnitude > 1e-4 and distance_to_target > 1e-4:
-            directional_alignment = np.dot(velocity, direction_to_target) / (velocity_magnitude * distance_to_target)
-            directional_alignment = np.clip(directional_alignment, -1.0, 1.0)
-            direction_reward = max(0.0, directional_alignment)
+        distance_to_target = float(np.linalg.norm(direction_to_target))
+        prev_distance_to_target = float(np.linalg.norm(prev_direction_to_target))
+        # Progress toward waypoint (secondary signal)
+        progress = prev_distance_to_target - distance_to_target
+        progress_reward = np.clip(progress * 2.0, -1.5, 2.5)
+        # Forward speed and lateral drift
+        forward_dir = direction_to_target / max(distance_to_target, eps)
+        forward_speed = float(np.dot(velocity, forward_dir)) / dt
+        target_speed = 1.0
+        speed_error = forward_speed - target_speed
+        speed_reward = 3.0 * math.exp(-1.5 * (speed_error ** 2))
+        lateral_vec = velocity - forward_dir * np.dot(velocity, forward_dir)
+        lateral_speed = float(np.linalg.norm(lateral_vec)) / dt
+        lateral_penalty = -0.6 * np.clip(lateral_speed, 0.0, 2.0)
+        # Heading alignment (small shaping term)
+        if speed_step > 1e-4 and distance_to_target > 1e-4:
+            directional_alignment = np.dot(velocity, direction_to_target) / (speed_step * distance_to_target)
+            directional_alignment = float(np.clip(directional_alignment, -1.0, 1.0))
+            direction_reward = max(0.0, directional_alignment) * 0.3
         else:
             direction_reward = 0.0
         alive_bonus = 0.05
-        height = self.Player.world.global_position[2]
-        if 0.45 <= height <= 1.2:
-            height_reward = 1.5
+        # Height and posture
+        height = float(self.Player.world.global_position[2])
+        if 0.8 <= height <= 1.05:
+            height_reward = 1.0
+        elif 0.40 <= height <= 1.20:
+            height_reward = -1.0
         else:
             height_reward = -6.0
-        motionless_penalty = -1.5 if velocity_magnitude < 0.003 else 0.0
+        orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
+        projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
+        tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
+        posture_penalty = -2.2 * (tilt_mag ** 2)
+        motionless_penalty = -1.5 if speed < 0.1 else 0.0
+        # Waypoint bonus
         waypoint_bonus = 0.0
         if distance_to_target < 0.5:
             waypoint_bonus = 25.0
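The rewrite replaces the old unbounded go-fast bonus with a speed-tracking term, 3.0 * exp(-1.5 * (forward_speed - 1.0)^2), which peaks at the 1.0 m/s target and falls off symmetrically, so overshooting is no longer rewarded. Its shape, evaluated directly:

    import math

    for forward_speed in (0.0, 0.5, 1.0, 1.5, 2.0):
        err = forward_speed - 1.0
        print(forward_speed, round(3.0 * math.exp(-1.5 * err * err), 3))
    # 0.0 -> 0.669, 0.5 -> 2.062, 1.0 -> 3.0, 1.5 -> 2.062, 2.0 -> 0.669

The new posture term follows the usual legged-RL recipe: rotating the world gravity vector into the body frame gives projected_gravity, whose xy-norm is 0 when upright and 1 when horizontal, so -2.2 * tilt^2 grows quadratically with lean.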
@@ -430,26 +458,29 @@ class WalkEnv(gym.Env):
                 waypoint_bonus = 100.0
                 self.route_completed = True
-        action_magnitude = np.linalg.norm(action[11:])
-        action_penalty = -0.08 * action_magnitude
-        tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0
+        # Effort + smoothness
+        action_magnitude = float(np.linalg.norm(action[11:]))
+        action_penalty = -0.05 * action_magnitude
+        action_delta = action - self.last_action_for_reward
+        smoothness_penalty = -0.02 * float(np.linalg.norm(action_delta[11:]))
         return (
             progress_reward
             + speed_reward
+            + lateral_penalty
             + direction_reward
             + alive_bonus
             + height_reward
+            + posture_penalty
             + motionless_penalty
             + waypoint_bonus
             + action_penalty
-            + tilt_penalty
+            + smoothness_penalty
         )

     def step(self, action):
         r = self.Player.robot
         self.previous_action = action
         self.target_joint_positions = (
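The gyroscope-based tilt_penalty is replaced by an action-smoothness term that penalizes the first difference of the action vector from indices 11 onward (presumably the non-arm joints, given the 0-10 head/arm entries in the flip array). A self-contained sketch of the term:

    import numpy as np

    def smoothness_penalty(action, last_action, start=11, coef=0.02):
        # Penalize frame-to-frame change in the commanded action to
        # discourage jitter; only joints from `start` onward count.
        delta = np.asarray(action) - np.asarray(last_action)
        return -coef * float(np.linalg.norm(delta[start:]))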
@@ -476,6 +507,7 @@ class WalkEnv(gym.Env):
 
         # Update previous position
         self.previous_pos = current_pos.copy()
+        self.last_action_for_reward = action.copy()

         # Fall detection and penalty
         is_fallen = self.Player.world.global_position[2] < 0.3
@@ -494,13 +526,13 @@ class Train(Train_Base):
     def train(self, args):
         # --------------------------------------- Learning parameters
-        n_envs = 8  # Reduced from 8 to decrease CPU/network pressure during init
+        n_envs = 20  # Reduced from 8 to decrease CPU/network pressure during init
         if n_envs < 1:
             raise ValueError("GYM_CPU_N_ENVS must be >= 1")
         n_steps_per_env = 512  # RolloutBuffer is of size (n_steps_per_env * n_envs)
         minibatch_size = 128  # should be a factor of (n_steps_per_env * n_envs)
         total_steps = 30000000
-        learning_rate = 3e-4
+        learning_rate = 2e-4
         folder_name = f'Walk_R{self.robot_type}'
         model_path = f'./scripts/gyms/logs/{folder_name}/'
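With n_envs = 20 and n_steps_per_env = 512 the rollout buffer holds 10240 transitions, and minibatch_size = 128 divides it evenly into 80 minibatches, satisfying the factor requirement in the comment. (The inline "Reduced from 8" note is now stale, since 20 is an increase.) Quick check:

    n_envs, n_steps_per_env, minibatch_size = 20, 512, 128
    buffer = n_envs * n_steps_per_env      # 10240 transitions per rollout
    assert buffer % minibatch_size == 0    # 80 minibatches per epoch
    print(buffer, buffer // minibatch_size)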
@@ -529,10 +561,10 @@ class Train(Train_Base):
         # Custom policy network architecture
         policy_kwargs = dict(
             net_arch=dict(
-                pi=[256, 256, 128],  # Policy network: 3 layers
-                vf=[256, 256, 128]   # Value network: 3 layers
+                pi=[512, 256, 128],  # Policy network: 3 layers
+                vf=[512, 256, 128]   # Value network: 3 layers
             ),
-            activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU,
+            activation_fn=__import__('torch.nn', fromlist=['ELU']).ELU,
         )

         if "model_file" in args:  # retrain
@@ -548,10 +580,10 @@ class Train(Train_Base):
                 learning_rate=learning_rate,
                 device="cpu",
                 policy_kwargs=policy_kwargs,
-                ent_coef=0.01,    # Entropy coefficient for exploration
-                clip_range=0.2,   # PPO clipping parameter
-                gae_lambda=0.95,  # GAE lambda
-                gamma=0.99        # Discount factor
+                # ent_coef=0.01,    # Entropy coefficient for exploration
+                # clip_range=0.2,   # PPO clipping parameter
+                # gae_lambda=0.95,  # GAE lambda
+                # gamma=0.99        # Discount factor
             )

         model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
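Commenting out these kwargs falls back to stable-baselines3 defaults. In current SB3 releases clip_range=0.2, gae_lambda=0.95, and gamma=0.99 already are the defaults, so (version permitting) the only behavioral change is ent_coef dropping from 0.01 to its default of 0.0, i.e. no entropy bonus. An equivalent explicit form, shown with a stock env purely for illustration:

    import gymnasium as gym
    from stable_baselines3 import PPO

    # These values match SB3 defaults (version-dependent); only ent_coef
    # differs from the previously hard-coded settings.
    model = PPO("MlpPolicy", gym.make("Pendulum-v1"),
                ent_coef=0.0, clip_range=0.2, gae_lambda=0.95, gamma=0.99)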
@@ -572,7 +604,7 @@ class Train(Train_Base):
         # Uses different server and monitor ports
         server_log_dir = os.path.join(args["folder_dir"], "server_logs")
         os.makedirs(server_log_dir, exist_ok=True)
-        server = Train_Server(self.server_p - 1, self.monitor_p, 1, log_dir=server_log_dir)
+        server = Train_Server(self.server_p - 1, self.monitor_p, 1)
         env = WalkEnv(self.ip, self.server_p - 1)
         model = PPO.load(args["model_file"], env=env)
@@ -603,4 +635,6 @@ if __name__ == "__main__":
     )
     trainer = Train(script_args)
-    trainer.train({"model_file": "scripts/gyms/logs/Walk_R0_000/model_245760_steps.zip"})
+    trainer.train({})
+    # trainer.test({"model_file": "scripts/gyms/logs/Walk_R0_003/best_model.zip",
+    #               "folder_dir": "Walk_R0_003",})