Compare commits


3 Commits

Author  SHA1        Message                  Date
xxh     f99fae68f6  change model and policy  2026-03-14 01:08:22 -04:00
xxh     294fe0bd79  restore                  2026-03-13 21:44:59 -04:00
xxh     cf80becd17  restore                  2026-03-13 21:38:44 -04:00
3 changed files with 73 additions and 39 deletions


@@ -21,7 +21,7 @@ class Server():
             port = first_server_p + i
             mport = first_monitor_p + i
-            server_cmd = f"{cmd} --aport {port} --mport {mport} --no-render --no-realtime"
+            server_cmd = f"{cmd} --aport {port} --mport {mport} "
             self.rcss_processes.append(
                 subprocess.Popen(
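Note: judging only by the flag names (an assumption, since these options are project-specific), dropping --no-render --no-realtime re-enables rendering and real-time pacing for every spawned server, which will slow rollout collection; the f-string also now ends with a trailing space. A minimal sketch of the launch pattern, using shlex instead of shell parsing:

    import shlex
    import subprocess

    def launch_server(cmd, port, mport):
        # Hypothetical helper mirroring the diff's Popen call; the flags
        # are assumed to mean what their names suggest.
        server_cmd = f"{cmd} --aport {port} --mport {mport}"
        return subprocess.Popen(shlex.split(server_cmd))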


@@ -171,8 +171,8 @@ class Train_Base():
                 ep_reward += reward
                 ep_length += 1
-                if enable_FPS_control: # control simulation speed (using non blocking user input)
-                    self.control_fps(select.select([sys.stdin], [], [], 0)[0])
+                # if enable_FPS_control: # control simulation speed (using non blocking user input)
+                #     self.control_fps(select.select([sys.stdin], [], [], 0)[0])
                 if done:
                     obs, _ = env.reset()
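The lines commented out here throttled training speed from the keyboard via a non-blocking poll of stdin; disabling them removes the per-step select() call, so the loop runs as fast as the simulator responds. For reference, a minimal sketch of the non-blocking-input pattern the removed code relied on:

    import select
    import sys

    def poll_stdin():
        # select() with timeout 0 returns immediately; stdin is only read
        # when input is already waiting, so the loop never blocks.
        ready, _, _ = select.select([sys.stdin], [], [], 0)
        return sys.stdin.readline().strip() if ready else None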


@@ -57,7 +57,7 @@ class WalkEnv(gym.Env):
         self._target_hz = 0.0
         self._target_dt = 0.0
         self._last_sync_time = None
-        target_hz_env = 24
+        target_hz_env = 1000
         if target_hz_env:
             try:
                 self._target_hz = float(target_hz_env)
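Raising target_hz_env from 24 to 1000 shrinks the per-step wall-clock budget from roughly 41.7 ms to 1 ms, which in practice disables real-time pacing. A sketch of the presumed pacing logic (the helper name and the use of _last_sync_time are assumptions based on the fields initialized above):

    import time

    def _sync_to_rate(self):
        # Hypothetical: sleep so consecutive steps are >= self._target_dt
        # apart. With _target_hz = 1000 the 1 ms budget is almost always
        # already consumed by the step itself, so this rarely sleeps.
        now = time.perf_counter()
        if self._last_sync_time is not None:
            remaining = self._target_dt - (now - self._last_sync_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_sync_time = time.perf_counter()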
@@ -80,8 +80,8 @@ class WalkEnv(gym.Env):
         action_dim = len(self.Player.robot.ROBOT_MOTORS)
         self.no_of_actions = action_dim
         self.action_space = spaces.Box(
-            low=-1.0,
-            high=1.0,
+            low=-10.0,
+            high=10.0,
             shape=(action_dim,),
             dtype=np.float32
         )
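Widening the Box bounds from ±1.0 to ±10.0 matters because stable-baselines3 clips sampled actions to the space's low/high before calling env.step, so the policy's effective authority grows tenfold. Assuming targets are applied as nominal + scaling_factor * action (suggested by the scaling_factor set below), a quick sanity check:

    scaling_factor = 0.5
    old_max_offset = scaling_factor * 1.0    # +/-0.5 joint units with low/high = +/-1
    new_max_offset = scaling_factor * 10.0   # +/-5.0 joint units with low/high = +/-10
    print(old_max_offset, new_max_offset)    # 0.5 5.0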
@@ -115,17 +115,17 @@ class WalkEnv(gym.Env):
             ]
         )
         self.reference_joint_nominal_position = self.joint_nominal_position.copy()
+        self.joint_nominal_position = np.zeros(self.no_of_actions)
         self.train_sim_flip = np.array(
             [
                 1.0,   # 0: Head_yaw (he1)
                 -1.0,  # 1: Head_pitch (he2)
                 1.0,   # 2: Left_Shoulder_Pitch (lae1)
                 -1.0,  # 3: Left_Shoulder_Roll (lae2)
-                1.0,   # 4: Left_Elbow_Pitch (lae3)
+                -1.0,  # 4: Left_Elbow_Pitch (lae3)
                 1.0,   # 5: Left_Elbow_Yaw (lae4)
                 -1.0,  # 6: Right_Shoulder_Pitch (rae1)
-                1.0,   # 7: Right_Shoulder_Roll (rae2)
+                -1.0,  # 7: Right_Shoulder_Roll (rae2)
                 1.0,   # 8: Right_Elbow_Pitch (rae3)
                 1.0,   # 9: Right_Elbow_Yaw (rae4)
                 1.0,   # 10: Waist (te1)
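train_sim_flip looks like a per-joint sign mask reconciling the training convention with the simulator's joint axes; this commit flips Left_Elbow_Pitch (lae3) and Right_Shoulder_Roll (rae2). A sketch of how such a mask is typically applied (the function name is illustrative, not from the source):

    import numpy as np

    def to_sim_convention(action, flip):
        # Element-wise sign flip: same magnitudes, mirrored axes wherever
        # the mask holds -1.0.
        return np.asarray(action, dtype=np.float32) * flip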
@@ -145,8 +145,10 @@ class WalkEnv(gym.Env):
         )
         self.scaling_factor = 0.5
+        # self.scaling_factor = 1
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Track previous position
         self.Player.server.connect()
         # sleep(2.0)  # Longer wait for connection to establish completely
@@ -320,6 +322,7 @@ class WalkEnv(gym.Env):
         self.waypoint_index = 0
         self.route_completed = False
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
+        self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Initialize for first step
         self.walk_cycle_step = 0
@@ -391,35 +394,60 @@ class WalkEnv(gym.Env):
         return

     def compute_reward(self, previous_pos, current_pos, action):
+        eps = 1e-6
+        dt = 0.05
         velocity = current_pos - previous_pos
-        velocity_magnitude = np.linalg.norm(velocity)
+        speed_step = float(np.linalg.norm(velocity))
+        speed = speed_step / dt
         direction_to_target = self.target_position - current_pos
         prev_direction_to_target = self.target_position - previous_pos
-        distance_to_target = np.linalg.norm(direction_to_target)
-        prev_distance_to_target = np.linalg.norm(prev_direction_to_target)
-        progress_reward = np.clip((prev_distance_to_target - distance_to_target) * 30.0, -2.0, 4.0)
-        velocity_in_m_per_sec = velocity_magnitude / 0.05
-        speed_reward = np.clip(velocity_in_m_per_sec * 1.5, 0.0, 1.5)
-        if velocity_magnitude > 1e-4 and distance_to_target > 1e-4:
-            directional_alignment = np.dot(velocity, direction_to_target) / (velocity_magnitude * distance_to_target)
-            directional_alignment = np.clip(directional_alignment, -1.0, 1.0)
-            direction_reward = max(0.0, directional_alignment)
+        distance_to_target = float(np.linalg.norm(direction_to_target))
+        prev_distance_to_target = float(np.linalg.norm(prev_direction_to_target))
+        # Progress toward waypoint (secondary signal)
+        progress = prev_distance_to_target - distance_to_target
+        progress_reward = np.clip(progress * 2.0, -1.5, 2.5)
+        # Forward speed and lateral drift
+        forward_dir = direction_to_target / max(distance_to_target, eps)
+        forward_speed = float(np.dot(velocity, forward_dir)) / dt
+        target_speed = 1.0
+        speed_error = forward_speed - target_speed
+        speed_reward = 3.0 * math.exp(-1.5 * (speed_error ** 2))
+        lateral_vec = velocity - forward_dir * np.dot(velocity, forward_dir)
+        lateral_speed = float(np.linalg.norm(lateral_vec)) / dt
+        lateral_penalty = -0.6 * np.clip(lateral_speed, 0.0, 2.0)
+        # Heading alignment (small shaping term)
+        if speed_step > 1e-4 and distance_to_target > 1e-4:
+            directional_alignment = np.dot(velocity, direction_to_target) / (speed_step * distance_to_target)
+            directional_alignment = float(np.clip(directional_alignment, -1.0, 1.0))
+            direction_reward = max(0.0, directional_alignment) * 0.3
         else:
             direction_reward = 0.0
         alive_bonus = 0.05
-        height = self.Player.world.global_position[2]
-        if 0.45 <= height <= 1.2:
-            height_reward = 1.5
+        # Height and posture
+        height = float(self.Player.world.global_position[2])
+        if 0.8 <= height <= 1.05:
+            height_reward = 1.0
+        elif 0.40 <= height <= 1.20:
+            height_reward = -1.0
         else:
             height_reward = -6.0
-        motionless_penalty = -1.5 if velocity_magnitude < 0.003 else 0.0
+        orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
+        projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
+        tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
+        posture_penalty = -2.2 * (tilt_mag ** 2)
+        motionless_penalty = -1.5 if speed < 0.1 else 0.0
+        # Waypoint bonus
         waypoint_bonus = 0.0
         if distance_to_target < 0.5:
             waypoint_bonus = 25.0
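The rewrite replaces the old unbounded go-fast bonus with a speed-tracking term, 3.0 * exp(-1.5 * (forward_speed - 1.0)^2), which peaks at the 1.0 m/s target and falls off symmetrically, so overshooting is no longer rewarded. Its shape, evaluated directly:

    import math

    for forward_speed in (0.0, 0.5, 1.0, 1.5, 2.0):
        err = forward_speed - 1.0
        print(forward_speed, round(3.0 * math.exp(-1.5 * err * err), 3))
    # 0.0 -> 0.669, 0.5 -> 2.062, 1.0 -> 3.0, 1.5 -> 2.062, 2.0 -> 0.669

The new posture term follows the usual legged-RL recipe: rotating the world gravity vector into the body frame gives projected_gravity, whose xy-norm is 0 when upright and 1 when horizontal, so -2.2 * tilt^2 grows quadratically with lean.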
@@ -430,26 +458,29 @@ class WalkEnv(gym.Env):
                 waypoint_bonus = 100.0
                 self.route_completed = True
-        action_magnitude = np.linalg.norm(action[11:])
-        action_penalty = -0.08 * action_magnitude
-        tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0
+        # Effort + smoothness
+        action_magnitude = float(np.linalg.norm(action[11:]))
+        action_penalty = -0.05 * action_magnitude
+        action_delta = action - self.last_action_for_reward
+        smoothness_penalty = -0.02 * float(np.linalg.norm(action_delta[11:]))
         return (
             progress_reward
             + speed_reward
+            + lateral_penalty
             + direction_reward
             + alive_bonus
             + height_reward
+            + posture_penalty
             + motionless_penalty
             + waypoint_bonus
             + action_penalty
-            + tilt_penalty
+            + smoothness_penalty
         )

     def step(self, action):
         r = self.Player.robot
         self.previous_action = action
         self.target_joint_positions = (
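The gyroscope-based tilt_penalty is replaced by an action-smoothness term that penalizes the first difference of the action vector from indices 11 onward (presumably the non-arm joints, given the 0-10 head/arm entries in the flip array). A self-contained sketch of the term:

    import numpy as np

    def smoothness_penalty(action, last_action, start=11, coef=0.02):
        # Penalize frame-to-frame change in the commanded action to
        # discourage jitter; only joints from `start` onward count.
        delta = np.asarray(action) - np.asarray(last_action)
        return -coef * float(np.linalg.norm(delta[start:]))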
@@ -476,6 +507,7 @@ class WalkEnv(gym.Env):
 
         # Update previous position
         self.previous_pos = current_pos.copy()
+        self.last_action_for_reward = action.copy()

         # Fall detection and penalty
         is_fallen = self.Player.world.global_position[2] < 0.3
@@ -494,13 +526,13 @@ class Train(Train_Base):
     def train(self, args):
         # --------------------------------------- Learning parameters
-        n_envs = 8  # Reduced from 8 to decrease CPU/network pressure during init
+        n_envs = 20  # Reduced from 8 to decrease CPU/network pressure during init
         if n_envs < 1:
             raise ValueError("GYM_CPU_N_ENVS must be >= 1")
         n_steps_per_env = 512  # RolloutBuffer is of size (n_steps_per_env * n_envs)
         minibatch_size = 128  # should be a factor of (n_steps_per_env * n_envs)
         total_steps = 30000000
-        learning_rate = 3e-4
+        learning_rate = 2e-4
         folder_name = f'Walk_R{self.robot_type}'
         model_path = f'./scripts/gyms/logs/{folder_name}/'
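With n_envs = 20 and n_steps_per_env = 512 the rollout buffer holds 10240 transitions, and minibatch_size = 128 divides it evenly into 80 minibatches, satisfying the factor requirement in the comment. (The inline "Reduced from 8" note is now stale, since 20 is an increase.) Quick check:

    n_envs, n_steps_per_env, minibatch_size = 20, 512, 128
    buffer = n_envs * n_steps_per_env      # 10240 transitions per rollout
    assert buffer % minibatch_size == 0    # 80 minibatches per epoch
    print(buffer, buffer // minibatch_size)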
@@ -529,10 +561,10 @@ class Train(Train_Base):
         # Custom policy network architecture
         policy_kwargs = dict(
             net_arch=dict(
-                pi=[256, 256, 128],  # Policy network: 3 layers
-                vf=[256, 256, 128]   # Value network: 3 layers
+                pi=[512, 256, 128],  # Policy network: 3 layers
+                vf=[512, 256, 128]   # Value network: 3 layers
             ),
-            activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU,
+            activation_fn=__import__('torch.nn', fromlist=['ELU']).ELU,
         )

         if "model_file" in args:  # retrain
@@ -548,10 +580,10 @@ class Train(Train_Base):
                 learning_rate=learning_rate,
                 device="cpu",
                 policy_kwargs=policy_kwargs,
-                ent_coef=0.01,    # Entropy coefficient for exploration
-                clip_range=0.2,   # PPO clipping parameter
-                gae_lambda=0.95,  # GAE lambda
-                gamma=0.99        # Discount factor
+                # ent_coef=0.01,    # Entropy coefficient for exploration
+                # clip_range=0.2,   # PPO clipping parameter
+                # gae_lambda=0.95,  # GAE lambda
+                # gamma=0.99        # Discount factor
             )

         model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
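Commenting out these kwargs falls back to stable-baselines3 defaults. In current SB3 releases clip_range=0.2, gae_lambda=0.95, and gamma=0.99 already are the defaults, so (version permitting) the only behavioral change is ent_coef dropping from 0.01 to its default of 0.0, i.e. no entropy bonus. An equivalent explicit form, shown with a stock env purely for illustration:

    import gymnasium as gym
    from stable_baselines3 import PPO

    # These values match SB3 defaults (version-dependent); only ent_coef
    # differs from the previously hard-coded settings.
    model = PPO("MlpPolicy", gym.make("Pendulum-v1"),
                ent_coef=0.0, clip_range=0.2, gae_lambda=0.95, gamma=0.99)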
@@ -572,7 +604,7 @@ class Train(Train_Base):
         # Uses different server and monitor ports
         server_log_dir = os.path.join(args["folder_dir"], "server_logs")
         os.makedirs(server_log_dir, exist_ok=True)
-        server = Train_Server(self.server_p - 1, self.monitor_p, 1, log_dir=server_log_dir)
+        server = Train_Server(self.server_p - 1, self.monitor_p, 1)
         env = WalkEnv(self.ip, self.server_p - 1)
         model = PPO.load(args["model_file"], env=env)
@@ -603,4 +635,6 @@ if __name__ == "__main__":
     )
     trainer = Train(script_args)
-    trainer.train({"model_file": "scripts/gyms/logs/Walk_R0_000/model_245760_steps.zip"})
+    trainer.train({})
+    # trainer.test({"model_file": "scripts/gyms/logs/Walk_R0_003/best_model.zip",
+    #               "folder_dir": "Walk_R0_003",})