I am using the IsaacGymEnvs library’s Factory template to create a basic peg insertion task.
This is the reward design:
def compute_reward(self, actions):
    """Compute shaped per-env rewards and reset flags for the peg-reach/insertion task.

    Args:
        actions: (num_envs, num_actions) tensor of the most recent policy actions,
            used only for the smoothness penalty.

    Returns:
        self.rew_buf: (num_envs,) reward tensor, updated in place.

    Side effects:
        Refreshes sim state tensors and writes self.rew_buf and self.reset_buf in place.
    """
    # Refresh tensors so positions reflect the latest physics step.
    self.gym.refresh_actor_root_state_tensor(self.sim)
    self.gym.refresh_rigid_body_state_tensor(self.sim)

    # Extract positions (state layout: [pos(3), quat(4), lin vel(3), ang vel(3)]).
    hand_pos = self.rigid_body_states[:, self.hand_body_id_env, 0:3]
    plug_pos = self.root_state_tensor[:, self.plug_actor_id_env, 0:3]

    hand_to_plug = hand_pos - plug_pos
    hand_to_plug_dist = torch.norm(hand_to_plug, dim=-1)

    # Multi-stage distance shaping with increasing slope near the plug.
    # NOTE: the stages are stitched so the total is CONTINUOUS at the 0.3 m
    # and 0.1 m boundaries. The previous piecewise form jumped by -0.25 at
    # d=0.3 and +0.3 at d=0.1; such reward discontinuities give the value
    # function a noisy target and commonly manifest as jittery, unstable
    # arm motion during training.
    far_reward = -0.5 * hand_to_plug_dist               # slope 0.5 for d > 0.3
    medium_reward = 0.75 - 3.0 * hand_to_plug_dist      # slope 3.0; equals far_reward at d = 0.3
    close_reward = 0.95 - 5.0 * hand_to_plug_dist       # slope 5.0; equals medium_reward at d = 0.1

    distance_reward = torch.where(
        hand_to_plug_dist > 0.3, far_reward,
        torch.where(hand_to_plug_dist > 0.1, medium_reward, close_reward))

    # Height shaping: encourage the gripper to hover ~5 cm above the plug
    # before descending.
    height_diff = hand_pos[:, 2] - plug_pos[:, 2]
    ideal_height = 0.05
    height_reward = -torch.abs(height_diff - ideal_height) * 2.0

    # XY shaping: center the gripper over the plug. Partially overlaps the
    # 3-D distance term by design — it biases the approach to come from above.
    xy_dist = torch.norm(hand_to_plug[:, :2], dim=-1)
    xy_reward = -xy_dist * 3.0

    # Quadratic action penalty discourages bang-bang control.
    action_penalty = torch.sum(actions ** 2, dim=-1) * 0.01

    # Weighted sum, written IN PLACE so external references to rew_buf
    # (e.g. the VecTask step buffers) stay valid.
    self.rew_buf[:] = (0.5 * distance_reward
                       + 0.2 * height_reward
                       + 0.3 * xy_reward
                       - action_penalty)

    # Sparse bonus once the hand is essentially at the plug.
    success_mask = hand_to_plug_dist < 0.05
    self.rew_buf[:] = torch.where(success_mask, self.rew_buf + 2.0, self.rew_buf)

    # Keep value-function targets bounded.
    self.rew_buf[:] = torch.clamp(self.rew_buf, -3.0, 3.0)

    # Reset on timeout or when the plug has fallen below the table surface.
    timed_out = self.progress_buf >= self.max_episode_length
    self.reset_buf[:] = torch.where(
        timed_out, torch.ones_like(self.reset_buf), torch.zeros_like(self.reset_buf))
    plug_pos_z = self.root_state_tensor[:, self.plug_actor_id_env, 2]
    below_table = plug_pos_z < self.cfg_base.env.table_height
    self.reset_buf[:] = torch.where(
        below_table, torch.ones_like(self.reset_buf), self.reset_buf)
    return self.rew_buf
And this is how the actions are implemented:
def pre_physics_step(self, actions):
    """Convert raw policy actions into clamped PD position targets and send them to the sim.

    Args:
        actions: (num_envs, num_actions) raw policy output, expected in [-1, 1].

    Side effects:
        Updates self.actions and self.franka_dof_targets, and pushes the
        targets to the simulator.
    """
    # Bound the raw actions, move them onto the sim device, and force fp32.
    clipped = torch.clamp(actions.clone(), -1.0, 1.0)
    self.actions = clipped.to(self.device).to(dtype=torch.float32)

    # NOTE(review): despite mentions elsewhere of splitting off two finger
    # DOFs, this slice covers ALL num_franka_dofs joints — confirm intent.
    n_dofs = self.num_franka_dofs
    dof_actions = self.actions[:, :n_dofs]

    # Scale actions into a per-step position delta, capped at +/-0.1 per DOF.
    step_delta = (self.franka_dof_speed_scales[:n_dofs]
                  * self.dt * dof_actions * self.action_scale)
    step_delta = torch.clamp(step_delta, -0.1, 0.1)

    # Integrate the delta into the running targets, respecting joint limits.
    proposed = self.franka_dof_targets[:, :n_dofs] + step_delta
    self.franka_dof_targets[:, :n_dofs] = tensor_clamp(
        proposed,
        self.franka_dof_lower_limits,
        self.franka_dof_upper_limits,
    )

    # Hand the full target tensor to the physics engine.
    self.gym.set_dof_position_target_tensor(
        self.sim, gymtorch.unwrap_tensor(self.franka_dof_targets))
def post_physics_step(self):
    """Per-step bookkeeping after the physics update.

    Advances episode counters, resets any env flagged on the previous step,
    then computes fresh observations and rewards (the reward call also
    updates the reset flags for the next step).
    """
    # Advance the episode counter for every env.
    self.progress_buf += 1

    # Reset envs flagged during the previous step, before producing new data.
    reset_env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1)
    if reset_env_ids.numel() > 0:
        self.reset_idx(reset_env_ids)

    # Observations first, then rewards.
    self.compute_observations()
    self.compute_reward(self.actions)
I am attaching the video of how the arm performs during training here. This is after a few hundred episodes of training:
The unstable, jittery motion of the arm isn't what I am expecting. What could be the cause of this?
This is the first time I am designing a task from scratch so I might have messed up a few things.