Erratic and unstable movements in Franka Arm

I am using the IsaacGymEnvs library’s Factory template to create a basic peg insertion task.

This is the reward design:

    def compute_reward(self, actions):
        # Refresh tensors
        self.gym.refresh_actor_root_state_tensor(self.sim)
        self.gym.refresh_rigid_body_state_tensor(self.sim)
        self.rew_buf[:] = 0

        # Extract positions
        hand_pos = self.rigid_body_states[:, self.hand_body_id_env, 0:3]
        plug_pos = self.root_state_tensor[:, self.plug_actor_id_env, 0:3]
        socket_pos = self.root_state_tensor[:, self.socket_actor_id_env, 0:3]

        # Get hand orientation for gripper alignment (quaternion; currently unused below)
        hand_rot = self.rigid_body_states[:, self.hand_body_id_env, 3:7]

        # Calculate distances
        hand_to_plug = hand_pos - plug_pos
        hand_to_plug_dist = torch.norm(hand_to_plug, dim=-1)

        # Multi-stage reward with different scales: a gentle gradient far away
        # to encourage movement, steeper gradients closer in
        far_mask = hand_to_plug_dist > 0.3
        medium_mask = (hand_to_plug_dist <= 0.3) & (hand_to_plug_dist > 0.1)
        close_mask = hand_to_plug_dist <= 0.1

        # Different reward scales based on distance
        far_reward = -hand_to_plug_dist * 0.5  # Linear negative distance
        medium_reward = 0.5 - hand_to_plug_dist * 3.0  # Steeper gradient
        close_reward = 1.0 - hand_to_plug_dist * 5.0  # Even steeper gradient

        # Combine rewards based on distance masks
        distance_reward = torch.zeros_like(hand_to_plug_dist)
        distance_reward = torch.where(far_mask, far_reward, distance_reward)
        distance_reward = torch.where(medium_mask, medium_reward, distance_reward)
        distance_reward = torch.where(close_mask, close_reward, distance_reward)

        # Height alignment reward - encourage gripper to be slightly above plug
        height_diff = hand_pos[:, 2] - plug_pos[:, 2]
        ideal_height = 0.05  # gripper should be slightly above plug
        height_reward = -torch.abs(height_diff - ideal_height) * 2.0

        # XY alignment reward - encourage gripper to be centered above plug
        xy_dist = torch.norm(hand_to_plug[:, :2], dim=-1)  # Only XY components
        xy_reward = -xy_dist * 3.0

        # Action smoothness penalty
        action_penalty = torch.sum(actions ** 2, dim=-1) * 0.01

        # Add up rewards with appropriate weights (write in-place so any
        # reference to the buffer held elsewhere stays valid)
        self.rew_buf[:] = 0.5 * distance_reward + 0.2 * height_reward + 0.3 * xy_reward - action_penalty

        # Success bonus for being very close
        success_mask = hand_to_plug_dist < 0.05
        self.rew_buf[:] = torch.where(success_mask, self.rew_buf + 2.0, self.rew_buf)

        # Ensure rewards are bounded and stable
        self.rew_buf[:] = torch.clamp(self.rew_buf, -3.0, 3.0)

        # Reset conditions: episode timeout, or plug fallen below the table surface
        self.reset_buf[:] = torch.where(self.progress_buf >= self.max_episode_length,
                                        torch.ones_like(self.reset_buf),
                                        torch.zeros_like(self.reset_buf))
        plug_pos_z = self.root_state_tensor[:, self.plug_actor_id_env, 2]
        self.reset_buf[:] = torch.where(plug_pos_z < self.cfg_base.env.table_height,
                                        torch.ones_like(self.reset_buf),
                                        self.reset_buf)

        return self.rew_buf
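
While writing this up I noticed that the piecewise distance shaping is discontinuous at the stage boundaries. A quick standalone check (plain PyTorch, with the constants copied from the code above) shows the reward jumping as the hand crosses 0.3 m and 0.1 m; I am not sure whether this alone explains the jitter:

    import torch

    def distance_reward(d):
        # Same piecewise shaping as in compute_reward above
        far = -d * 0.5
        medium = 0.5 - d * 3.0
        close = 1.0 - d * 5.0
        return torch.where(d > 0.3, far, torch.where(d > 0.1, medium, close))

    d = torch.tensor([0.301, 0.299, 0.101, 0.099])
    print(distance_reward(d))
    # tensor([-0.1505, -0.3970,  0.1970,  0.5050])
    # i.e. the reward drops by ~0.25 at d = 0.3 and jumps by ~0.31 at d = 0.1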

And this is how the actions are implemented:

    def pre_physics_step(self, actions):
        # First, clip actions to prevent extreme values
        self.actions = torch.clamp(actions.clone(), -1.0, 1.0).to(self.device)

        # Cast to float32 explicitly to keep dtypes consistent
        self.actions = self.actions.to(dtype=torch.float32)

        # Take the first num_franka_dofs action dims as DOF position deltas
        # (note: this slice includes the two finger joints, so the fingers are
        # driven the same way as the arm here rather than handled separately)
        arm_actions = self.actions[:, :self.num_franka_dofs]

        # Scale actions into per-step target deltas and bound them
        delta_arm_targets = (self.franka_dof_speed_scales[:self.num_franka_dofs]
                             * self.dt * arm_actions * self.action_scale)
        delta_arm_targets = torch.clamp(delta_arm_targets, -0.1, 0.1)

        # Update targets
        targets = self.franka_dof_targets[:, :self.num_franka_dofs] + delta_arm_targets
        self.franka_dof_targets[:, :self.num_franka_dofs] = tensor_clamp(
            targets,
            self.franka_dof_lower_limits,
            self.franka_dof_upper_limits
        )

        # Apply the targets
        self.gym.set_dof_position_target_tensor(self.sim, gymtorch.unwrap_tensor(self.franka_dof_targets))

    def post_physics_step(self):
        # Advance episode progress, handle any resets, then compute
        # observations and rewards
        self.progress_buf += 1

        env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1)
        if len(env_ids) > 0:
            self.reset_idx(env_ids)

        self.compute_observations()
        self.compute_reward(self.actions)
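
For a sense of the action magnitudes, here is a rough back-of-the-envelope for the per-step target delta. The dt, speed-scale, and action-scale numbers below are placeholders rather than my exact config values:

    import torch

    # Placeholder values -- substitute the actual config
    dt = 1.0 / 60.0                  # physics timestep (assumed)
    action_scale = 7.5               # self.action_scale (assumed)
    speed_scale = torch.tensor(1.0)  # a typical franka_dof_speed_scales entry (assumed)

    action = torch.tensor(1.0)       # worst case after the clamp to [-1, 1]
    delta = speed_scale * dt * action * action_scale
    print(delta)  # tensor(0.1250)

At these placeholder numbers a saturated action hits the 0.1 rad per-step clamp on every joint, which seems aggressive for a position controller, so the effective step size may also be worth checking.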

I am attaching a video here of how the arm performs during training, after a few hundred episodes:

These unstable arm movements aren't what I expected. What could be the cause?

This is the first time I am designing a task from scratch, so I might have messed up a few things.