|
|
|
@@ -57,7 +57,7 @@ class WalkEnv(gym.Env):
|
|
|
|
self._target_hz = 0.0
|
|
|
|
self._target_hz = 0.0
|
|
|
|
self._target_dt = 0.0
|
|
|
|
self._target_dt = 0.0
|
|
|
|
self._last_sync_time = None
|
|
|
|
self._last_sync_time = None
|
|
|
|
target_hz_env = 24
|
|
|
|
target_hz_env = 1000
|
|
|
|
if target_hz_env:
|
|
|
|
if target_hz_env:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
self._target_hz = float(target_hz_env)
|
|
|
|
self._target_hz = float(target_hz_env)
|
|
|
|
@@ -80,8 +80,8 @@ class WalkEnv(gym.Env):
|
|
|
|
action_dim = len(self.Player.robot.ROBOT_MOTORS)
|
|
|
|
action_dim = len(self.Player.robot.ROBOT_MOTORS)
|
|
|
|
self.no_of_actions = action_dim
|
|
|
|
self.no_of_actions = action_dim
|
|
|
|
self.action_space = spaces.Box(
|
|
|
|
self.action_space = spaces.Box(
|
|
|
|
low=-1.0,
|
|
|
|
low=-10.0,
|
|
|
|
high=1.0,
|
|
|
|
high=10.0,
|
|
|
|
shape=(action_dim,),
|
|
|
|
shape=(action_dim,),
|
|
|
|
dtype=np.float32
|
|
|
|
dtype=np.float32
|
|
|
|
)
|
|
|
|
)
|
|
|
|
@@ -115,17 +115,17 @@ class WalkEnv(gym.Env):
|
|
|
|
]
|
|
|
|
]
|
|
|
|
)
|
|
|
|
)
|
|
|
|
self.reference_joint_nominal_position = self.joint_nominal_position.copy()
|
|
|
|
self.reference_joint_nominal_position = self.joint_nominal_position.copy()
|
|
|
|
|
|
|
|
self.joint_nominal_position = np.zeros(self.no_of_actions)
|
|
|
|
self.train_sim_flip = np.array(
|
|
|
|
self.train_sim_flip = np.array(
|
|
|
|
[
|
|
|
|
[
|
|
|
|
1.0, # 0: Head_yaw (he1)
|
|
|
|
1.0, # 0: Head_yaw (he1)
|
|
|
|
-1.0, # 1: Head_pitch (he2)
|
|
|
|
-1.0, # 1: Head_pitch (he2)
|
|
|
|
1.0, # 2: Left_Shoulder_Pitch (lae1)
|
|
|
|
1.0, # 2: Left_Shoulder_Pitch (lae1)
|
|
|
|
-1.0, # 3: Left_Shoulder_Roll (lae2)
|
|
|
|
-1.0, # 3: Left_Shoulder_Roll (lae2)
|
|
|
|
1.0, # 4: Left_Elbow_Pitch (lae3)
|
|
|
|
-1.0, # 4: Left_Elbow_Pitch (lae3)
|
|
|
|
1.0, # 5: Left_Elbow_Yaw (lae4)
|
|
|
|
1.0, # 5: Left_Elbow_Yaw (lae4)
|
|
|
|
-1.0, # 6: Right_Shoulder_Pitch (rae1)
|
|
|
|
-1.0, # 6: Right_Shoulder_Pitch (rae1)
|
|
|
|
1.0, # 7: Right_Shoulder_Roll (rae2)
|
|
|
|
-1.0, # 7: Right_Shoulder_Roll (rae2)
|
|
|
|
1.0, # 8: Right_Elbow_Pitch (rae3)
|
|
|
|
1.0, # 8: Right_Elbow_Pitch (rae3)
|
|
|
|
1.0, # 9: Right_Elbow_Yaw (rae4)
|
|
|
|
1.0, # 9: Right_Elbow_Yaw (rae4)
|
|
|
|
1.0, # 10: Waist (te1)
|
|
|
|
1.0, # 10: Waist (te1)
|
|
|
|
@@ -145,8 +145,10 @@ class WalkEnv(gym.Env):
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
self.scaling_factor = 0.5
|
|
|
|
self.scaling_factor = 0.5
|
|
|
|
|
|
|
|
# self.scaling_factor = 1
|
|
|
|
|
|
|
|
|
|
|
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
|
|
|
|
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
self.previous_pos = np.array([0.0, 0.0]) # Track previous position
|
|
|
|
self.previous_pos = np.array([0.0, 0.0]) # Track previous position
|
|
|
|
self.Player.server.connect()
|
|
|
|
self.Player.server.connect()
|
|
|
|
# sleep(2.0) # Longer wait for connection to establish completely
|
|
|
|
# sleep(2.0) # Longer wait for connection to establish completely
|
|
|
|
@@ -320,6 +322,7 @@ class WalkEnv(gym.Env):
|
|
|
|
self.waypoint_index = 0
|
|
|
|
self.waypoint_index = 0
|
|
|
|
self.route_completed = False
|
|
|
|
self.route_completed = False
|
|
|
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
|
|
|
|
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
|
|
|
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
|
|
|
|
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
|
|
|
|
self.walk_cycle_step = 0
|
|
|
|
self.walk_cycle_step = 0
|
|
|
|
|
|
|
|
|
|
|
|
@@ -391,35 +394,60 @@ class WalkEnv(gym.Env):
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
def compute_reward(self, previous_pos, current_pos, action):
|
|
|
|
def compute_reward(self, previous_pos, current_pos, action):
|
|
|
|
|
|
|
|
eps = 1e-6
|
|
|
|
|
|
|
|
dt = 0.05
|
|
|
|
|
|
|
|
|
|
|
|
velocity = current_pos - previous_pos
|
|
|
|
velocity = current_pos - previous_pos
|
|
|
|
velocity_magnitude = np.linalg.norm(velocity)
|
|
|
|
speed_step = float(np.linalg.norm(velocity))
|
|
|
|
|
|
|
|
speed = speed_step / dt
|
|
|
|
|
|
|
|
|
|
|
|
direction_to_target = self.target_position - current_pos
|
|
|
|
direction_to_target = self.target_position - current_pos
|
|
|
|
prev_direction_to_target = self.target_position - previous_pos
|
|
|
|
prev_direction_to_target = self.target_position - previous_pos
|
|
|
|
distance_to_target = np.linalg.norm(direction_to_target)
|
|
|
|
distance_to_target = float(np.linalg.norm(direction_to_target))
|
|
|
|
prev_distance_to_target = np.linalg.norm(prev_direction_to_target)
|
|
|
|
prev_distance_to_target = float(np.linalg.norm(prev_direction_to_target))
|
|
|
|
|
|
|
|
|
|
|
|
progress_reward = np.clip((prev_distance_to_target - distance_to_target) * 30.0, -2.0, 4.0)
|
|
|
|
# Progress toward waypoint (secondary signal)
|
|
|
|
|
|
|
|
progress = prev_distance_to_target - distance_to_target
|
|
|
|
|
|
|
|
progress_reward = np.clip(progress * 2.0, -1.5, 2.5)
|
|
|
|
|
|
|
|
|
|
|
|
velocity_in_m_per_sec = velocity_magnitude / 0.05
|
|
|
|
# Forward speed and lateral drift
|
|
|
|
speed_reward = np.clip(velocity_in_m_per_sec * 1.5, 0.0, 1.5)
|
|
|
|
forward_dir = direction_to_target / max(distance_to_target, eps)
|
|
|
|
|
|
|
|
forward_speed = float(np.dot(velocity, forward_dir)) / dt
|
|
|
|
|
|
|
|
target_speed = 1.0
|
|
|
|
|
|
|
|
speed_error = forward_speed - target_speed
|
|
|
|
|
|
|
|
speed_reward = 3.0 * math.exp(-1.5 * (speed_error ** 2))
|
|
|
|
|
|
|
|
|
|
|
|
if velocity_magnitude > 1e-4 and distance_to_target > 1e-4:
|
|
|
|
lateral_vec = velocity - forward_dir * np.dot(velocity, forward_dir)
|
|
|
|
directional_alignment = np.dot(velocity, direction_to_target) / (velocity_magnitude * distance_to_target)
|
|
|
|
lateral_speed = float(np.linalg.norm(lateral_vec)) / dt
|
|
|
|
directional_alignment = np.clip(directional_alignment, -1.0, 1.0)
|
|
|
|
lateral_penalty = -0.6 * np.clip(lateral_speed, 0.0, 2.0)
|
|
|
|
direction_reward = max(0.0, directional_alignment)
|
|
|
|
|
|
|
|
|
|
|
|
# Heading alignment (small shaping term)
|
|
|
|
|
|
|
|
if speed_step > 1e-4 and distance_to_target > 1e-4:
|
|
|
|
|
|
|
|
directional_alignment = np.dot(velocity, direction_to_target) / (speed_step * distance_to_target)
|
|
|
|
|
|
|
|
directional_alignment = float(np.clip(directional_alignment, -1.0, 1.0))
|
|
|
|
|
|
|
|
direction_reward = max(0.0, directional_alignment) * 0.3
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
direction_reward = 0.0
|
|
|
|
direction_reward = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
alive_bonus = 0.05
|
|
|
|
alive_bonus = 0.05
|
|
|
|
|
|
|
|
|
|
|
|
height = self.Player.world.global_position[2]
|
|
|
|
# Height and posture
|
|
|
|
if 0.45 <= height <= 1.2:
|
|
|
|
height = float(self.Player.world.global_position[2])
|
|
|
|
height_reward = 1.5
|
|
|
|
if 0.8 <= height <= 1.05:
|
|
|
|
|
|
|
|
height_reward = 1.0
|
|
|
|
|
|
|
|
elif 0.40 <= height <= 1.20:
|
|
|
|
|
|
|
|
height_reward = -1.0
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
height_reward = -6.0
|
|
|
|
height_reward = -6.0
|
|
|
|
|
|
|
|
|
|
|
|
motionless_penalty = -1.5 if velocity_magnitude < 0.003 else 0.0
|
|
|
|
orientation_quat_inv = R.from_quat(self.Player.robot._global_cheat_orientation).inv()
|
|
|
|
|
|
|
|
projected_gravity = orientation_quat_inv.apply(np.array([0.0, 0.0, -1.0]))
|
|
|
|
|
|
|
|
tilt_mag = float(np.linalg.norm(projected_gravity[:2]))
|
|
|
|
|
|
|
|
posture_penalty = -2.2 * (tilt_mag ** 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
motionless_penalty = -1.5 if speed < 0.1 else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Waypoint bonus
|
|
|
|
waypoint_bonus = 0.0
|
|
|
|
waypoint_bonus = 0.0
|
|
|
|
if distance_to_target < 0.5:
|
|
|
|
if distance_to_target < 0.5:
|
|
|
|
waypoint_bonus = 25.0
|
|
|
|
waypoint_bonus = 25.0
|
|
|
|
@@ -430,26 +458,29 @@ class WalkEnv(gym.Env):
|
|
|
|
waypoint_bonus = 100.0
|
|
|
|
waypoint_bonus = 100.0
|
|
|
|
self.route_completed = True
|
|
|
|
self.route_completed = True
|
|
|
|
|
|
|
|
|
|
|
|
action_magnitude = np.linalg.norm(action[11:])
|
|
|
|
# Effort + smoothness
|
|
|
|
action_penalty = -0.08 * action_magnitude
|
|
|
|
action_magnitude = float(np.linalg.norm(action[11:]))
|
|
|
|
tilt_penalty = -0.2 * np.linalg.norm(self.Player.robot.gyroscope[:2]) / 100.0
|
|
|
|
action_penalty = -0.05 * action_magnitude
|
|
|
|
|
|
|
|
action_delta = action - self.last_action_for_reward
|
|
|
|
|
|
|
|
smoothness_penalty = -0.02 * float(np.linalg.norm(action_delta[11:]))
|
|
|
|
|
|
|
|
|
|
|
|
return (
|
|
|
|
return (
|
|
|
|
progress_reward
|
|
|
|
progress_reward
|
|
|
|
+ speed_reward
|
|
|
|
+ speed_reward
|
|
|
|
|
|
|
|
+ lateral_penalty
|
|
|
|
+ direction_reward
|
|
|
|
+ direction_reward
|
|
|
|
+ alive_bonus
|
|
|
|
+ alive_bonus
|
|
|
|
+ height_reward
|
|
|
|
+ height_reward
|
|
|
|
|
|
|
|
+ posture_penalty
|
|
|
|
+ motionless_penalty
|
|
|
|
+ motionless_penalty
|
|
|
|
+ waypoint_bonus
|
|
|
|
+ waypoint_bonus
|
|
|
|
+ action_penalty
|
|
|
|
+ action_penalty
|
|
|
|
+ tilt_penalty
|
|
|
|
+ smoothness_penalty
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def step(self, action):
|
|
|
|
def step(self, action):
|
|
|
|
|
|
|
|
|
|
|
|
r = self.Player.robot
|
|
|
|
r = self.Player.robot
|
|
|
|
|
|
|
|
|
|
|
|
self.previous_action = action
|
|
|
|
self.previous_action = action
|
|
|
|
|
|
|
|
|
|
|
|
self.target_joint_positions = (
|
|
|
|
self.target_joint_positions = (
|
|
|
|
@@ -476,6 +507,7 @@ class WalkEnv(gym.Env):
|
|
|
|
|
|
|
|
|
|
|
|
# Update previous position
|
|
|
|
# Update previous position
|
|
|
|
self.previous_pos = current_pos.copy()
|
|
|
|
self.previous_pos = current_pos.copy()
|
|
|
|
|
|
|
|
self.last_action_for_reward = action.copy()
|
|
|
|
|
|
|
|
|
|
|
|
# Fall detection and penalty
|
|
|
|
# Fall detection and penalty
|
|
|
|
is_fallen = self.Player.world.global_position[2] < 0.3
|
|
|
|
is_fallen = self.Player.world.global_position[2] < 0.3
|
|
|
|
@@ -494,13 +526,13 @@ class Train(Train_Base):
|
|
|
|
def train(self, args):
|
|
|
|
def train(self, args):
|
|
|
|
|
|
|
|
|
|
|
|
# --------------------------------------- Learning parameters
|
|
|
|
# --------------------------------------- Learning parameters
|
|
|
|
n_envs = 8 # Reduced from 8 to decrease CPU/network pressure during init
|
|
|
|
n_envs = 20 # Reduced from 8 to decrease CPU/network pressure during init
|
|
|
|
if n_envs < 1:
|
|
|
|
if n_envs < 1:
|
|
|
|
raise ValueError("GYM_CPU_N_ENVS must be >= 1")
|
|
|
|
raise ValueError("GYM_CPU_N_ENVS must be >= 1")
|
|
|
|
n_steps_per_env = 512 # RolloutBuffer is of size (n_steps_per_env * n_envs)
|
|
|
|
n_steps_per_env = 512 # RolloutBuffer is of size (n_steps_per_env * n_envs)
|
|
|
|
minibatch_size = 128 # should be a factor of (n_steps_per_env * n_envs)
|
|
|
|
minibatch_size = 128 # should be a factor of (n_steps_per_env * n_envs)
|
|
|
|
total_steps = 30000000
|
|
|
|
total_steps = 30000000
|
|
|
|
learning_rate = 3e-4
|
|
|
|
learning_rate = 2e-4
|
|
|
|
folder_name = f'Walk_R{self.robot_type}'
|
|
|
|
folder_name = f'Walk_R{self.robot_type}'
|
|
|
|
model_path = f'./scripts/gyms/logs/{folder_name}/'
|
|
|
|
model_path = f'./scripts/gyms/logs/{folder_name}/'
|
|
|
|
|
|
|
|
|
|
|
|
@@ -529,10 +561,10 @@ class Train(Train_Base):
|
|
|
|
# Custom policy network architecture
|
|
|
|
# Custom policy network architecture
|
|
|
|
policy_kwargs = dict(
|
|
|
|
policy_kwargs = dict(
|
|
|
|
net_arch=dict(
|
|
|
|
net_arch=dict(
|
|
|
|
pi=[256, 256, 128], # Policy network: 3 layers
|
|
|
|
pi=[512, 256, 128], # Policy network: 3 layers
|
|
|
|
vf=[256, 256, 128] # Value network: 3 layers
|
|
|
|
vf=[512, 256, 128] # Value network: 3 layers
|
|
|
|
),
|
|
|
|
),
|
|
|
|
activation_fn=__import__('torch.nn', fromlist=['ReLU']).ReLU,
|
|
|
|
activation_fn=__import__('torch.nn', fromlist=['ELU']).ELU,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if "model_file" in args: # retrain
|
|
|
|
if "model_file" in args: # retrain
|
|
|
|
@@ -548,10 +580,10 @@ class Train(Train_Base):
|
|
|
|
learning_rate=learning_rate,
|
|
|
|
learning_rate=learning_rate,
|
|
|
|
device="cpu",
|
|
|
|
device="cpu",
|
|
|
|
policy_kwargs=policy_kwargs,
|
|
|
|
policy_kwargs=policy_kwargs,
|
|
|
|
ent_coef=0.01, # Entropy coefficient for exploration
|
|
|
|
# ent_coef=0.01, # Entropy coefficient for exploration
|
|
|
|
clip_range=0.2, # PPO clipping parameter
|
|
|
|
# clip_range=0.2, # PPO clipping parameter
|
|
|
|
gae_lambda=0.95, # GAE lambda
|
|
|
|
# gae_lambda=0.95, # GAE lambda
|
|
|
|
gamma=0.99 # Discount factor
|
|
|
|
# gamma=0.99 # Discount factor
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
|
|
|
|
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
|
|
|
|
@@ -572,7 +604,7 @@ class Train(Train_Base):
|
|
|
|
# Uses different server and monitor ports
|
|
|
|
# Uses different server and monitor ports
|
|
|
|
server_log_dir = os.path.join(args["folder_dir"], "server_logs")
|
|
|
|
server_log_dir = os.path.join(args["folder_dir"], "server_logs")
|
|
|
|
os.makedirs(server_log_dir, exist_ok=True)
|
|
|
|
os.makedirs(server_log_dir, exist_ok=True)
|
|
|
|
server = Train_Server(self.server_p - 1, self.monitor_p, 1, log_dir=server_log_dir)
|
|
|
|
server = Train_Server(self.server_p - 1, self.monitor_p, 1)
|
|
|
|
env = WalkEnv(self.ip, self.server_p - 1)
|
|
|
|
env = WalkEnv(self.ip, self.server_p - 1)
|
|
|
|
model = PPO.load(args["model_file"], env=env)
|
|
|
|
model = PPO.load(args["model_file"], env=env)
|
|
|
|
|
|
|
|
|
|
|
|
@@ -603,4 +635,6 @@ if __name__ == "__main__":
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
trainer = Train(script_args)
|
|
|
|
trainer = Train(script_args)
|
|
|
|
trainer.train({"model_file": "scripts/gyms/logs/Walk_R0_000/model_245760_steps.zip"})
|
|
|
|
trainer.train({})
|
|
|
|
|
|
|
|
# trainer.test({"model_file": "scripts/gyms/logs/Walk_R0_003/best_model.zip",
|
|
|
|
|
|
|
|
# "folder_dir": "Walk_R0_003",})
|
|
|
|
|