Hello there,
I am trying to implement a Direct RL workflow for a Franka robot to pick up a cube and move it to a goal position. Unfortunately, I am not achieving good results with the RL training, so I simplified the goal to just moving the robot's end effector to the cube's position.
The reward is simply based on the Euclidean distance from the Franka's hand to the cube's position. I am running with the same parameters as the Franka Direct Cabinet task, for 24000 steps.
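In essence the shaping term is just the inverse squared distance (a simplified sketch, with hand_pos and cube_pos standing in for the world-frame positions that the full code below reads from the sim):

d = torch.norm(hand_pos - cube_pos, p=2, dim=-1)  # per-env Euclidean distance
reward = 1.0 / (1.0 + d**2)  # in (0, 1], grows as the hand gets closer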
Besides the poor results, there is also a problem with spawning the cube at a random position: the cube spawns and moves without colliding with the robot. Some input would be great.
Thanks in advance
# Copyright (c) 2022-2024, The Isaac Lab Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
from __future__ import annotations
import torch
from omni.isaac.core.utils.stage import get_current_stage
from omni.isaac.core.utils.torch.transformations import tf_combine, tf_inverse, tf_vector
from pxr import UsdGeom
import omni.isaac.lab.sim as sim_utils
from omni.isaac.lab.actuators.actuator_cfg import ImplicitActuatorCfg
from omni.isaac.lab.assets import Articulation, ArticulationCfg, RigidObject, RigidObjectCfg
from omni.isaac.lab.envs import DirectRLEnv, DirectRLEnvCfg
from omni.isaac.lab.scene import InteractiveSceneCfg
from omni.isaac.lab.sim import SimulationCfg
from omni.isaac.lab.terrains import TerrainImporterCfg
from omni.isaac.lab.utils import configclass
from omni.isaac.lab.utils.assets import ISAAC_NUCLEUS_DIR
from omni.isaac.lab.utils.math import sample_uniform, quat_mul, quat_from_euler_xyz
@configclass
class FrankaCuboidEnvCfg(DirectRLEnvCfg):
# env
episode_length_s = 8.3333 # 500 timesteps
decimation = 2
num_actions = 9
num_observations = 24
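# 24 = 9 scaled joint positions + 9 scaled joint velocities + 3 hand-to-cube vector + 3 cube position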
num_states = 0
# simulation
sim: SimulationCfg = SimulationCfg(
dt=1 / 120,
render_interval=decimation,
disable_contact_processing=True,
physics_material=sim_utils.RigidBodyMaterialCfg(
friction_combine_mode="multiply",
restitution_combine_mode="multiply",
static_friction=1.0,
dynamic_friction=1.0,
restitution=0.0,
),
)
# scene
scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=4096, env_spacing=3.0, replicate_physics=True)
# robot
robot = ArticulationCfg(
prim_path="/World/envs/env_.*/Robot",
spawn=sim_utils.UsdFileCfg(
usd_path=f"{ISAAC_NUCLEUS_DIR}/Robots/Franka/franka_instanceable.usd",
activate_contact_sensors=False,
rigid_props=sim_utils.RigidBodyPropertiesCfg(
disable_gravity=False,
max_depenetration_velocity=5.0,
),
articulation_props=sim_utils.ArticulationRootPropertiesCfg(
enabled_self_collisions=False, solver_position_iteration_count=12, solver_velocity_iteration_count=1
),
),
init_state=ArticulationCfg.InitialStateCfg(
joint_pos={
"panda_joint1": 1.157,
"panda_joint2": -1.066,
"panda_joint3": -0.155,
"panda_joint4": -2.239,
"panda_joint5": -1.841,
"panda_joint6": 1.003,
"panda_joint7": 0.469,
"panda_finger_joint.*": 0.035,
},
pos=(0.0, 0.0, 0.0),
rot=(0.0, 0.0, 0.0, 1.0),
),
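# note: (0.0, 0.0, 0.0, 1.0) in Isaac Lab's (w, x, y, z) quaternion convention is a
# 180-degree yaw, i.e. the base frame is rotated so its +x axis points along world -x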
actuators={
"panda_shoulder": ImplicitActuatorCfg(
joint_names_expr=["panda_joint[1-4]"],
effort_limit=87.0,
velocity_limit=2.175,
stiffness=80.0,
damping=4.0,
),
"panda_forearm": ImplicitActuatorCfg(
joint_names_expr=["panda_joint[5-7]"],
effort_limit=12.0,
velocity_limit=2.61,
stiffness=80.0,
damping=4.0,
),
"panda_hand": ImplicitActuatorCfg(
joint_names_expr=["panda_finger_joint.*"],
effort_limit=200.0,
velocity_limit=0.2,
stiffness=2e3,
damping=1e2,
),
},
)
# cuboid
cuboid = RigidObjectCfg(
prim_path="/World/envs/env_.*/cuboid",
spawn=sim_utils.UsdFileCfg(
usd_path=f"{ISAAC_NUCLEUS_DIR}/Props/Blocks/DexCube/dex_cube_instanceable.usd",
),
init_state=RigidObjectCfg.InitialStateCfg(
pos=(0.5, 0.0, 0.0),
rot=(0.0, 0.0, 0.0, 1.0),
lin_vel=(0.0, 0.0, 0.0),
ang_vel=(0.0, 0.0, 0.0),
),
)
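# the DexCube asset already carries rigid-body and collider properties, so no extra
# rigid_props are set here; with pos z = 0.0 the cube's origin starts at ground level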
# ground plane
terrain = TerrainImporterCfg(
prim_path="/World/ground",
terrain_type="plane",
collision_group=-1,
physics_material=sim_utils.RigidBodyMaterialCfg(
friction_combine_mode="multiply",
restitution_combine_mode="multiply",
static_friction=1.0,
dynamic_friction=1.0,
restitution=0.0,
),
)
action_scale = 7.5
dof_velocity_scale = 0.1
# reward scales
dist_reward_scale = 1.0
move_reward_scale = 16
lift_reward_scale = 15
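# note: _compute_rewards below currently uses only the distance shaping;
# these scales are not applied yet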
class FrankaCuboidEnv(DirectRLEnv):
# pre-physics step calls
# |-- _pre_physics_step(action)
# |-- _apply_action()
# post-physics step calls
# |-- _get_dones()
# |-- _get_rewards()
# |-- _reset_idx(env_ids)
# |-- _get_observations()
cfg: FrankaCuboidEnvCfg
def __init__(self, cfg: FrankaCuboidEnvCfg, render_mode: str | None = None, **kwargs):
super().__init__(cfg, render_mode, **kwargs)
def get_env_local_pose(env_pos: torch.Tensor, xformable: UsdGeom.Xformable, device: torch.device):
"""Compute pose in env-local coordinates"""
world_transform = xformable.ComputeLocalToWorldTransform(0)
world_pos = world_transform.ExtractTranslation()
world_quat = world_transform.ExtractRotationQuat()
px = world_pos[0] - env_pos[0]
py = world_pos[1] - env_pos[1]
pz = world_pos[2] - env_pos[2]
qx = world_quat.imaginary[0]
qy = world_quat.imaginary[1]
qz = world_quat.imaginary[2]
qw = world_quat.real
return torch.tensor([px, py, pz, qw, qx, qy, qz], device=device)
self.dt = self.cfg.sim.dt * self.cfg.decimation
# create auxiliary variables for computing applied action, observations and rewards
self.robot_dof_lower_limits = self._robot.data.soft_joint_pos_limits[0, :, 0].to(device=self.device)
self.robot_dof_upper_limits = self._robot.data.soft_joint_pos_limits[0, :, 1].to(device=self.device)
self.robot_dof_speed_scales = torch.ones_like(self.robot_dof_lower_limits)
self.robot_dof_speed_scales[self._robot.find_joints("panda_finger_joint1")[0]] = 0.1
self.robot_dof_speed_scales[self._robot.find_joints("panda_finger_joint2")[0]] = 0.1
self.robot_dof_targets = torch.zeros((self.num_envs, self._robot.num_joints), device=self.device)
stage = get_current_stage()
hand_pose = get_env_local_pose(
self.scene.env_origins[0],
UsdGeom.Xformable(stage.GetPrimAtPath("/World/envs/env_0/Robot/panda_link7")),
self.device,
)
lfinger_pose = get_env_local_pose(
self.scene.env_origins[0],
UsdGeom.Xformable(stage.GetPrimAtPath("/World/envs/env_0/Robot/panda_leftfinger")),
self.device,
)
rfinger_pose = get_env_local_pose(
self.scene.env_origins[0],
UsdGeom.Xformable(stage.GetPrimAtPath("/World/envs/env_0/Robot/panda_rightfinger")),
self.device,
)
finger_pose = torch.zeros(7, device=self.device)
finger_pose[0:3] = (lfinger_pose[0:3] + rfinger_pose[0:3]) / 2.0
finger_pose[3:7] = lfinger_pose[3:7]
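# express the fingertip midpoint in the hand-link (panda_link7) frame to obtain a
# local grasp offset, following the same construction as the cabinet task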
hand_pose_inv_rot, hand_pose_inv_pos = tf_inverse(hand_pose[3:7], hand_pose[0:3])
robot_local_grasp_pose_rot, robot_local_pose_pos = tf_combine(
hand_pose_inv_rot, hand_pose_inv_pos, finger_pose[3:7], finger_pose[0:3]
)
robot_local_pose_pos += torch.tensor([0, 0.04, 0], device=self.device)
self.robot_local_grasp_pos = robot_local_pose_pos.repeat((self.num_envs, 1))
self.robot_local_grasp_rot = robot_local_grasp_pose_rot.repeat((self.num_envs, 1))
self.gripper_forward_axis = torch.tensor([0, 0, 1], device=self.device, dtype=torch.float32).repeat(
(self.num_envs, 1)
)
self.gripper_up_axis = torch.tensor([0, 1, 0], device=self.device, dtype=torch.float32).repeat(
(self.num_envs, 1)
)
self.hand_link_idx = self._robot.find_bodies("panda_link7")[0][0]
self.left_finger_link_idx = self._robot.find_bodies("panda_leftfinger")[0][0]
self.right_finger_link_idx = self._robot.find_bodies("panda_rightfinger")[0][0]
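# buffers for grasp/object poses; they are not refreshed during stepping, so the
# reward and observation code below reads poses directly from the sim data instead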
self.robot_grasp_rot = torch.zeros((self.num_envs, 4), device=self.device)
self.robot_grasp_pos = torch.zeros((self.num_envs, 3), device=self.device)
self.object_pos = torch.zeros((self.num_envs, 3), device=self.device)
def _setup_scene(self):
self._robot = Articulation(self.cfg.robot)
self._cuboid = RigidObject(self.cfg.cuboid)
self.scene.articulations["robot"] = self._robot
# register the cube with the scene so its data buffers are updated every step
self.scene.rigid_objects["cuboid"] = self._cuboid
self.cfg.terrain.num_envs = self.scene.cfg.num_envs
self.cfg.terrain.env_spacing = self.scene.cfg.env_spacing
self._terrain = self.cfg.terrain.class_type(self.cfg.terrain)
# clone, filter, and replicate
self.scene.clone_environments(copy_from_source=False)
self.scene.filter_collisions(global_prim_paths=[self.cfg.terrain.prim_path])
# add lights
light_cfg = sim_utils.DomeLightCfg(intensity=2000.0, color=(0.75, 0.75, 0.75))
light_cfg.func("/World/Light", light_cfg)
# pre-physics step calls
def _pre_physics_step(self, actions: torch.Tensor):
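# interpret actions as scaled joint velocities and integrate them into clamped joint position targets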
self.actions = actions.clone().clamp(-1.0, 1.0)
targets = self.robot_dof_targets + self.robot_dof_speed_scales * self.dt * self.actions * self.cfg.action_scale
self.robot_dof_targets[:] = torch.clamp(targets, self.robot_dof_lower_limits, self.robot_dof_upper_limits)
def _apply_action(self):
self._robot.set_joint_position_target(self.robot_dof_targets)
# post-physics step calls
def _get_dones(self) -> tuple[torch.Tensor, torch.Tensor]:
# task success: the hand is within 20 cm of the cube
object_pos = self._cuboid.data.body_pos_w[:, 0]  # cube position (world frame)
hand_pos = self._robot.data.body_pos_w[:, self.hand_link_idx]  # hand position (world frame)
distance = torch.norm(object_pos - hand_pos, p=2, dim=-1)
terminated = distance < 0.2
# print(distance)  # debug
# time limit
truncated = self.episode_length_buf >= self.max_episode_length - 1
return terminated, truncated
def _get_rewards(self) -> torch.Tensor:
object_pos = self._cuboid.data.body_pos_w[:, 0] # Object position (x, y, z)
hand_pos = self._robot.data.body_pos_w[:, self.hand_link_idx] # Hand position (x, y, z)
return self._compute_rewards(
self.actions,
hand_pos,
object_pos,
self.num_envs,
)
def _reset_idx(self, env_ids: torch.Tensor | None):
super()._reset_idx(env_ids)
# 1. Reset robot state: randomize joint positions and set joint velocities to zero
joint_pos = self._robot.data.default_joint_pos[env_ids] + sample_uniform(
-0.125, 0.125, (len(env_ids), self._robot.num_joints), self.device
)
joint_pos = torch.clamp(joint_pos, self.robot_dof_lower_limits, self.robot_dof_upper_limits)
joint_vel = torch.zeros_like(joint_pos)
# keep the incremental targets in sync with the reset pose, otherwise
# _pre_physics_step keeps integrating from the previous episode's targets
self.robot_dof_targets[env_ids] = joint_pos
self._robot.set_joint_position_target(joint_pos, env_ids=env_ids)
self._robot.write_joint_state_to_sim(joint_pos, joint_vel, env_ids=env_ids)
root_states = self._cuboid.data.default_root_state[env_ids].clone()
pose_range = {
"x": (-0.2, 0.2), # Range for x position
"y": (-0.6, 0.6), # Range for y position
"z": (0.0, 0.0), # Fixed z position (can be adjusted as needed)
"roll": (0.0, 0.0), # Roll range
"pitch": (0.0, 0.0), # Pitch range
"yaw": (0.0, 0.0) # Yaw range
}
velocity_range = {
"x": (0.0, 0.0), # Example range for x velocity
"y": (0.0, 0.0), # Example range for y velocity
"z": (0.0, 0.0), # Example range for z velocity
"roll": (0.0, 0.0), # Example roll velocity range
"pitch": (0.0, 0.0),# Example pitch velocity range
"yaw": (0.0, 0.0) # Example yaw velocity range
}
# poses
range_list = [pose_range.get(key, (0.0, 0.0)) for key in ["x", "y", "z", "roll", "pitch", "yaw"]]
ranges = torch.tensor(range_list, device=self.device)
rand_samples = sample_uniform(ranges[:, 0], ranges[:, 1], (len(env_ids), 6), device=self.device)
positions = root_states[:, 0:3] + self.scene.env_origins[env_ids] + rand_samples[:, 0:3]
orientations_delta = quat_from_euler_xyz(rand_samples[:, 3], rand_samples[:, 4], rand_samples[:, 5])
orientations = quat_mul(root_states[:, 3:7], orientations_delta)
# velocities
range_list = [velocity_range.get(key, (0.0, 0.0)) for key in ["x", "y", "z", "roll", "pitch", "yaw"]]
ranges = torch.tensor(range_list, device=self.device)
rand_samples = sample_uniform(ranges[:, 0], ranges[:, 1], (len(env_ids), 6), device=self.device)
velocities = root_states[:, 7:13] + rand_samples
# set into the physics simulation
self._cuboid.write_root_pose_to_sim(torch.cat([positions, orientations], dim=-1), env_ids=env_ids)
self._cuboid.write_root_velocity_to_sim(velocities, env_ids=env_ids)
def _get_observations(self) -> dict:
dof_pos_scaled = (
2.0 * (self._robot.data.joint_pos - self.robot_dof_lower_limits)
/ (self.robot_dof_upper_limits - self.robot_dof_lower_limits)
- 1.0
)
# vector from the hand to the cube, read directly from the current sim state
hand_pos = self._robot.data.body_pos_w[:, self.hand_link_idx]
object_pos = self._cuboid.data.body_pos_w[:, 0]
to_object = object_pos - hand_pos
# express the cube position relative to the environment origin so the value is
# comparable across cloned environments and not clipped by the +-5 clamp below
object_pos = object_pos - self.scene.env_origins
obs = torch.cat(
(dof_pos_scaled, self._robot.data.joint_vel * self.cfg.dof_velocity_scale, to_object, object_pos),
dim=-1,
)
return {"policy": torch.clamp(obs, -5.0, 5.0)}
def _compute_rewards(
self,
actions,
franka_grasp_pos,
cuboid_pos,
num_envs,
):
# action penalty (currently not applied to the reward)
action_penalty = torch.sum(actions**2, dim=-1)
# distance from hand to object (reward for getting close to the object)
object_distance = torch.norm(franka_grasp_pos - cuboid_pos, p=2, dim=-1)
distance_reward = 1.0 / (1.0 + object_distance**2)
rewards = distance_reward  # * 1.5 - action_penalty * 0.05 (scaling and penalty disabled)
rewards = torch.where(object_distance < 0.3, rewards + 0.25, rewards)
rewards = torch.where(object_distance < 0.25, rewards + 0.5, rewards)
return rewards