Question about FoundationPose with ZED

I am currently trying to run FoundationPose with a ZED camera. Since a launch file for this setup is not available, I have modified the existing one to create my own:

@staticmethod
def get_composable_nodes(interface_specs: Dict[str, Any]) -> Dict[str, ComposableNode]:

    # Drop node parameters
    input_images_expect_freq = LaunchConfiguration('input_images_expect_freq')
    input_images_drop_freq = LaunchConfiguration('input_images_drop_freq')
    # FoundationPose parameters
    mesh_file_path = LaunchConfiguration('mesh_file_path')
    texture_path = LaunchConfiguration('texture_path')
    refine_model_file_path = LaunchConfiguration('refine_model_file_path')
    refine_engine_file_path = LaunchConfiguration('refine_engine_file_path')
    score_model_file_path = LaunchConfiguration('score_model_file_path')
    score_engine_file_path = LaunchConfiguration('score_engine_file_path')
    # RT-DETR parameters
    rt_detr_model_file_path = LaunchConfiguration('rt_detr_model_file_path')
    rt_detr_engine_file_path = LaunchConfiguration('rt_detr_engine_file_path')
    input_width = interface_specs['camera_resolution']['width']
    input_height = interface_specs['camera_resolution']['height']
    input_to_RT_DETR_ratio = input_width / RT_DETR_MODEL_INPUT_SIZE
    return {
        # Drops input_images_expect_freq out of input_images_drop_freq input messages
        'drop_node':  ComposableNode(
            name='drop_node',
            package='isaac_ros_nitros_topic_tools',
            plugin='nvidia::isaac_ros::nitros::NitrosCameraDropNode',
            parameters=[{
                'X': input_images_drop_freq,
                'Y': input_images_expect_freq,
                'mode': 'mono+depth',
                'depth_format_string': 'nitros_image_mono16'
            }],
            remappings=[
                ('image_1', '/zed_node/left/image_rect_color'),
                ('camera_info_1', '/camera_info_rect'),
                ('depth_1', 'depth'),
                ('image_1_drop', 'rgb/image_rect_color'),
                ('camera_info_1_drop', 'rgb/camera_info'),
                ('depth_1_drop', 'depth_image'),
            ]
        ),
        'image_format_converter_node': ComposableNode(
            name='image_format_converter_node',
            package='isaac_ros_image_proc',
            plugin='nvidia::isaac_ros::image_proc::ImageFormatConverterNode',
            parameters=[{
                'encoding_desired': 'rgb8'
            }],
            remappings=[
                ('image', 'rgb/image_rect_color'),  # Input: bgra8 image
                ('converted_image', 'rgb/image_rect_color_converted')  # Output: rgb8 image
            ]
        ),

        # Resize and pad input images to RT-DETR model input image size
        # Resize from IMAGE_WIDTH x IMAGE_HEIGHT to
        # IMAGE_WIDTH/input_to_RT_DETR_ratio x IMAGE_HEIGHT/input_to_RT_DETR_ratio
        # output height constraint is not used since keep_aspect_ratio is True
        'resize_left_rt_detr_node': ComposableNode(
            name='resize_left_rt_detr_node',
            package='isaac_ros_image_proc',
            plugin='nvidia::isaac_ros::image_proc::ResizeNode',
            parameters=[{
                'input_width': input_width,
                'input_height': input_height,
                'output_width': RT_DETR_MODEL_INPUT_SIZE,
                'output_height': RT_DETR_MODEL_INPUT_SIZE,
                'keep_aspect_ratio': True,
                'encoding_desired': 'rgb8',
                'disable_padding': True
            }],
            remappings=[
                ('image', 'left/image_rect'),
                ('camera_info', '/camera_info_rect'),
                ('resize/image', 'color_image_resized'),
                ('resize/camera_info', 'camera_info_resized')
            ]
        ),
        # Pad the image from IMAGE_WIDTH/input_to_RT_DETR_ratio x
        # IMAGE_HEIGHT/input_to_RT_DETR_ratio
        # to RT_DETR_MODEL_INPUT_WIDTH x RT_DETR_MODEL_INPUT_HEIGHT
        'pad_node': ComposableNode(
            name='pad_node',
            package='isaac_ros_image_proc',
            plugin='nvidia::isaac_ros::image_proc::PadNode',
            parameters=[{
                'output_image_width': RT_DETR_MODEL_INPUT_SIZE,
                'output_image_height': RT_DETR_MODEL_INPUT_SIZE,
                'padding_type': 'BOTTOM_RIGHT'
            }],
            remappings=[(
                'image', 'color_image_resized'
            )]
        ),

        # Convert image to tensor and reshape
        'image_to_tensor_node': ComposableNode(
            name='image_to_tensor_node',
            package='isaac_ros_tensor_proc',
            plugin='nvidia::isaac_ros::dnn_inference::ImageToTensorNode',
            parameters=[{
                'scale': False,
                'tensor_name': 'image',
            }],
            remappings=[
                ('image', 'padded_image'),
                ('tensor', 'normalized_tensor'),
            ]
        ),

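        # Convert the interleaved (HWC) tensor to planar (CHW) layout for inference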
        'interleave_to_planar_node': ComposableNode(
            name='interleaved_to_planar_node',
            package='isaac_ros_tensor_proc',
            plugin='nvidia::isaac_ros::dnn_inference::InterleavedToPlanarNode',
            parameters=[{
                'input_tensor_shape': [RT_DETR_MODEL_INPUT_SIZE,
                                       RT_DETR_MODEL_INPUT_SIZE,
                                       RT_DETR_MODEL_NUM_CHANNELS]
            }],
            remappings=[
                ('interleaved_tensor', 'normalized_tensor')
            ]
        ),

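        # Add a batch dimension: [C, H, W] -> [1, C, H, W]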
        'reshape_node': ComposableNode(
            name='reshape_node',
            package='isaac_ros_tensor_proc',
            plugin='nvidia::isaac_ros::dnn_inference::ReshapeNode',
            parameters=[{
                'output_tensor_name': 'input_tensor',
                'input_tensor_shape': [RT_DETR_MODEL_NUM_CHANNELS,
                                       RT_DETR_MODEL_INPUT_SIZE,
                                       RT_DETR_MODEL_INPUT_SIZE],
                'output_tensor_shape': [1, RT_DETR_MODEL_NUM_CHANNELS,
                                        RT_DETR_MODEL_INPUT_SIZE,
                                        RT_DETR_MODEL_INPUT_SIZE]
            }],
            remappings=[
                ('tensor', 'planar_tensor')
            ],
        ),

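        # Prepare the input tensors expected by the RT-DETR TensorRT node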
        'rtdetr_preprocessor_node': ComposableNode(
            name='rtdetr_preprocessor',
            package='isaac_ros_rtdetr',
            plugin='nvidia::isaac_ros::rtdetr::RtDetrPreprocessorNode',
            remappings=[
                ('encoded_tensor', 'reshaped_tensor')
            ]
        ),

        # RT-DETR objection detection pipeline
        'tensor_rt_node': ComposableNode(
            name='tensor_rt',
            package='isaac_ros_tensor_rt',
            plugin='nvidia::isaac_ros::dnn_inference::TensorRTNode',
            parameters=[{
                'model_file_path': rt_detr_model_file_path,
                'engine_file_path': rt_detr_engine_file_path,
                'output_binding_names': ['labels', 'boxes', 'scores'],
                'output_tensor_names': ['labels', 'boxes', 'scores'],
                'input_tensor_names': ['images', 'orig_target_sizes'],
                'input_binding_names': ['images', 'orig_target_sizes'],
                'force_engine_update': False
            }]
        ),
        'rtdetr_decoder_node': ComposableNode(
            name='rtdetr_decoder',
            package='isaac_ros_rtdetr',
            plugin='nvidia::isaac_ros::rtdetr::RtDetrDecoderNode',
        ),

        # Create a binary segmentation mask from a Detection2DArray published by RT-DETR.
        # The segmentation mask is of size
        # int(IMAGE_WIDTH/input_to_RT_DETR_ratio) x int(IMAGE_HEIGHT/input_to_RT_DETR_ratio)
        'detection2_d_to_mask_node': ComposableNode(
            name='detection2_d_to_mask',
            package='isaac_ros_foundationpose',
            plugin='nvidia::isaac_ros::foundationpose::Detection2DToMask',
            parameters=[{
                'mask_width': int(input_width/input_to_RT_DETR_ratio),
                'mask_height': int(input_height/input_to_RT_DETR_ratio)}],
            remappings=[('detection2_d_array', 'detections_output'),
                        ('segmentation', 'rt_detr_segmentation')]),
        
        # Resize segmentation mask to ESS model image size so it can be used by FoundationPose
        # FoundationPose requires depth, rgb image and segmentation mask to be of the same size
        # Resize from int(IMAGE_WIDTH/input_to_RT_DETR_ratio) x
        # int(IMAGE_HEIGHT/input_to_RT_DETR_ratio)
        # to ESS_MODEL_IMAGE_WIDTH x ESS_MODEL_IMAGE_HEIGHT
        # output height constraint is used since keep_aspect_ratio is False
        # and the image is padded
        'resize_mask_node': ComposableNode(
            name='resize_mask_node',
            package='isaac_ros_image_proc',
            plugin='nvidia::isaac_ros::image_proc::ResizeNode',
            parameters=[{
                'input_width': int(input_width/input_to_RT_DETR_ratio),
                'input_height': int(input_height/input_to_RT_DETR_ratio),
                'output_width': input_width,
                'output_height': input_height,
                'keep_aspect_ratio': False,
                'disable_padding': False
            }],
            remappings=[
                ('image', 'rt_detr_segmentation'),
                ('camera_info', 'camera_info_resized'),
                ('resize/image', 'segmentation'),
                ('resize/camera_info', 'camera_info_segmentation')
            ]
        ),

        'resize_left_viz': ComposableNode(
            name='resize_left_viz',
            package='isaac_ros_image_proc',
            plugin='nvidia::isaac_ros::image_proc::ResizeNode',
            parameters=[{
                'input_width': input_width,
                'input_height': input_height,
                'output_width': 480,
                'output_height': 288,
                'keep_aspect_ratio': False,
                'encoding_desired': 'rgb8',
                'disable_padding': False
            }],
            remappings=[
                ('image', 'left/image_rect'),
                ('camera_info', '/camera_info_rect'),
                ('resize/image', 'rgb/image_rect_color_viz'),
                ('resize/camera_info', 'rgb/camera_info_viz')
            ]
        ),

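        # FoundationPose: estimate the 6D object pose using the refine and score models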
        'foundationpose_node': ComposableNode(
            name='foundationpose_node',
            package='isaac_ros_foundationpose',
            plugin='nvidia::isaac_ros::foundationpose::FoundationPoseNode',
            parameters=[{
                'mesh_file_path': mesh_file_path,
                'texture_path': texture_path,

                'refine_model_file_path': refine_model_file_path,
                'refine_engine_file_path': refine_engine_file_path,
                'refine_input_tensor_names': ['input_tensor1', 'input_tensor2'],
                'refine_input_binding_names': ['input1', 'input2'],
                'refine_output_tensor_names': ['output_tensor1', 'output_tensor2'],
                'refine_output_binding_names': ['output1', 'output2'],

                'score_model_file_path': score_model_file_path,
                'score_engine_file_path': score_engine_file_path,
                'score_input_tensor_names': ['input_tensor1', 'input_tensor2'],
                'score_input_binding_names': ['input1', 'input2'],
                'score_output_tensor_names': ['output_tensor'],
                'score_output_binding_names': ['output1'],
            }],
            remappings=[
                ('pose_estimation/depth_image', 'depth_image'),
                ('pose_estimation/image', 'rgb/image_rect_color'),
                ('pose_estimation/camera_info', 'rgb/camera_info'),
                ('pose_estimation/segmentation', 'segmentation'),
                ('pose_estimation/output', 'output')]
        ),
    }

I can see in rqt_image_view that I have correct segmentation and inputs going into the FoundationPose node. However, on my /output topic I get something like this:

I have confirmed that the size of the box is correct, but I am getting NaN position and orientation.

Could anyone help me with this?

For further information, I have verified that the depth image and the mask seem to be correct, as shown in the image below.
As for the device, I am running on a Jetson Orin.

This is my main error:

[zed_wrapper-2] Error: TF_NAN_INPUT: Ignoring transform for child_frame_id "fp_object" from authority "Authority undetectable" because of a nan value in the transform (nan -nan nan) (-nan nan nan -nan)
[zed_wrapper-2] at line 237 in ./src/buffer_core.cpp
[zed_wrapper-2] Error: TF_DENORMALIZED_QUATERNION: Ignoring transform for child_frame_id "fp_object" from authority "Authority undetectable" because of an invalid quaternion in the transform (-nan nan nan -nan)
[zed_wrapper-2] at line 256 in ./src/buffer_core.cpp
[zed_wrapper-2] Error: TF_NAN_INPUT: Ignoring transform for child_frame_id "fp_object" from authority "Authority undetectable" because of a nan value in the transform (nan -nan nan) (-nan nan nan -nan)
[zed_wrapper-2] at line 237 in ./src/buffer_core.cpp
[zed_wrapper-2] Error: TF_DENORMALIZED_QUATERNION: Ignoring transform for child_frame_id "fp_object" from authority "Authority undetectable" because of an invalid quaternion in the transform (-nan nan nan -nan)
[zed_wrapper-2] at line 256 in ./src/buffer_core.cpp
[zed_wrapper-2] Error: TF_NAN_INPUT: Ignoring transform for child_frame_id "fp_object" from authority "Authority undetectable" because of a nan value in the transform (nan -nan nan) (-nan nan nan -nan)
[zed_wrapper-2] at line 237 in ./src/buffer_core.cpp

After doing some research, I was able to get a reasonable value by modifying the /depth image I get from the ZED: I convert the float32 values to uint8, convert back to float32, divide by 1000.0, and publish the result as /formatted_depth, which the FoundationPose node then takes as its input.
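Roughly, the conversion I am doing looks like this (a simplified sketch, not my exact code; the node name and the topic names /depth and /formatted_depth are placeholders, and I use cv_bridge for the conversion):

import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from cv_bridge import CvBridge
import numpy as np

class DepthFormatterNode(Node):
    def __init__(self):
        super().__init__('depth_formatter_node')
        self.bridge = CvBridge()
        # Topic names are placeholders; adjust to match your setup.
        self.subscription = self.create_subscription(Image, '/depth', self.depth_callback, 10)
        self.publisher = self.create_publisher(Image, '/formatted_depth', 10)

    def depth_callback(self, msg):
        # ZED depth arrives as float32 metres (32FC1)
        depth = self.bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough')
        # Cast to uint8 (this also discards the NaN values), back to float32, then scale by 1/1000
        formatted = depth.astype(np.uint8).astype(np.float32) / 1000.0
        out_msg = self.bridge.cv2_to_imgmsg(formatted, encoding='32FC1')
        out_msg.header = msg.header
        self.publisher.publish(out_msg)

def main(args=None):
    rclpy.init(args=args)
    node = DepthFormatterNode()
    rclpy.spin(node)
    node.destroy_node()
    rclpy.shutdown()

if __name__ == '__main__':
    main()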

But this doesn't seem to work well when the object is far away; I'm not sure if it is due to an RT-DETR segmentation issue or a depth issue.

Is this the right way to do it? If not, could you provide a reasonable explanation?

Hi @andychoi35

The reason you get NaN values on the output is that the ZED publishes NaN where there is no valid depth (such as in the stereo occlusion shadow behind objects). You probably got rid of the NaN values during the float → int conversion, and that's why your workaround appeared to work.

Here’s my solution to fill the NaN values with zeros.

import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from cv_bridge import CvBridge
import numpy as np

# Set these to your ZED depth topic and the cleaned output topic.
DEPTH_TOPIC = '/depth'
DEPTH_TOPIC_CLEAN = '/depth_cleaned'

class DepthCleanerNode(Node):
    def __init__(self):
        super().__init__("depth_cleaner_node")
        self.bridge = CvBridge()
        self.subscription = self.create_subscription(Image, DEPTH_TOPIC, self.depth_callback, 10)
        self.publisher = self.create_publisher(Image, DEPTH_TOPIC_CLEAN, 10)

    def depth_callback(self, msg):
        depth_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="passthrough")
        depth_image_cleaned = np.nan_to_num(depth_image, nan=0)
        cleaned_msg = self.bridge.cv2_to_imgmsg(depth_image_cleaned, encoding="passthrough")
        cleaned_msg.header = msg.header  # Preserve original message header
        self.publisher.publish(cleaned_msg)


def main(args=None):
    rclpy.init(args=args)
    node = DepthCleanerNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        node.get_logger().info("Shutting down DepthCleanerNode.")
    finally:
        node.destroy_node()
        rclpy.shutdown()

if __name__ == "__main__":
    main()

You will have to set the topic names `DEPTH_TOPIC` and `DEPTH_TOPIC_CLEAN`.
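Then point the pipeline's depth input at the cleaned topic instead of the raw ZED depth. For example, the drop node's remappings in the launch file above could look like this (assuming `DEPTH_TOPIC_CLEAN` is '/depth_cleaned'; adjust to whatever name you choose):

            remappings=[
                ('image_1', '/zed_node/left/image_rect_color'),
                ('camera_info_1', '/camera_info_rect'),
                # Subscribe to the NaN-free depth published by DepthCleanerNode
                ('depth_1', '/depth_cleaned'),
                ('image_1_drop', 'rgb/image_rect_color'),
                ('camera_info_1_drop', 'rgb/camera_info'),
                ('depth_1_drop', 'depth_image'),
            ]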

(I looked at sample data from the NVlabs/FoundationPose GitHub repository (FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects, CVPR 2024 Highlight), and it also has zeros in occlusion shadows, so I'm 99% sure that's what the paper authors used for training, and that's what the neural net expects.)


Hi @tomasz_lewicki

Thank you for your support, it's been a while since I last read your posts :-)

@andychoi35, please follow Tomasz’s suggestions and let us know.

Best,
Raffaello