Depth Map to Pointcloud Issues

I am trying to use the linear depth data from the camera and the semantic segmentation to create a segmented point cloud. Combining the two to create a segmented depth map is pretty easy. As for the depth map to point cloud, I following a few guides so I am pretty sure that the math is correct, but when I get the point cloud back, and display it on the screen the point cloud is offset from the real object. This offset becomes greater the further the object moves towards the bottom right-hand corner of the viewport and shrinks as the object is closer to the top-left hand corner. Because of this, I believe it is an issue with the intrinsic matrix returned by the simulation or is due to some mistake in my method to go from depth data to point cloud.

You can see the issue in this image:

Could someone help me to get a properly aligned point cloud? Here is a dummy script to show off the issue and each part of the process: Some of the code is from the useful snippets on the documentation and others are some utility functions I created.

#!/usr/bin/env python

import copy
import os
import omni
import random
import numpy as np
from omni.isaac.python_app import OmniKitHelper
import matplotlib.pyplot as plt
import carb
from omni.isaac.utils.scripts.nucleus_utils import find_nucleus_server

    "experience": f'{os.environ["EXP_PATH"]}/omni.isaac.sim.python.kit',
    "width": 1280,
    "height": 720,
    "window_width": 1920,
    "window_height": 1080,
    "headless": True,
    "renderer": "RayTracedLighting"

def get_meters_per_unit():
    from pxr import UsdGeom
    stage = omni.usd.get_context().get_stage()
    return UsdGeom.GetStageMetersPerUnit(stage)

def gf_as_numpy(gf_matrix)->np.array:
    """Take in a pxr.Gf matrix and returns it as a numpy array.
    Specifically it transposes the matrix so that it follows numpy
    matrix rules.

        gf_matrix (Gf.Matrix_d): Gf matrix to convert

    # Convert matrix/vector to numpy array
    return np.array(list(gf_matrix)).T

def get_intrinsic_matrix(viewport):
    # Get camera params from usd
    stage = omni.usd.get_context().get_stage()
    prim = stage.GetPrimAtPath(viewport.get_active_camera())
    focal_length = prim.GetAttribute("focalLength").Get()
    horiz_aperture = prim.GetAttribute("horizontalAperture").Get()
    x_min, y_min, x_max, y_max = viewport.get_viewport_rect()
    width, height = x_max - x_min, y_max - y_min

    # Pixels are square so we can do:
    vert_aperture = height / width * horiz_aperture

    # Compute focal point and center
    focal_x = width * focal_length / horiz_aperture
    focal_y = height * focal_length / vert_aperture
    center_x = width * 0.5
    center_y = height * 0.5
    # Turn into matrix
    intrinsic_matrix = np.array([[focal_x, 0, center_x],
                                 [0, focal_y, center_y],
                                 [0, 0, 1]])
    return intrinsic_matrix

def get_extrinsic_matrix(viewport, meters=False):
    from pxr import UsdGeom
    # Get camera pose
    stage = omni.usd.get_context().get_stage()
    camera_prim = stage.GetPrimAtPath(viewport.get_active_camera())
    camera_pose = gf_as_numpy(UsdGeom.Camera(camera_prim).GetLocalTransformation())
    if meters:
        camera_pose[:,3] = camera_pose[:,3]*get_meters_per_unit()
    view_matrix = np.linalg.inv(camera_pose)
    return view_matrix

def freq_count(v:np.array)->np.array:
    """Return the number of times each element in an array occur

        v (np.array): 1D array to count

        np.array: Frequency list [[num, count], [num, count],...]
    unique, counts = np.unique(v, return_counts=True)
    return np.asarray((unique, counts)).T

def pointcloud_from_mask_and_depth(depth:np.array, mask:np.array, mask_val:int, intrinsic_matrix:np.array, extrinsic_matrix:np.array=None):
    depth = np.array(depth).squeeze()
    mask = np.array(mask).squeeze()
    # Mask the depth array
    masked_depth =!=mask_val, depth)
    masked_depth =, 8000)
    # Create idx array
    idxs = np.indices(masked_depth.shape)
    u_idxs = idxs[1]
    v_idxs = idxs[0]
    # Get only non-masked depth and idxs
    z = masked_depth[~masked_depth.mask]
    compressed_u_idxs = u_idxs[~masked_depth.mask]
    compressed_v_idxs = v_idxs[~masked_depth.mask]
    # Calculate local position of each point
    # Apply vectorized math to depth using compressed arrays
    cx = intrinsic_matrix[0,2]
    fx = intrinsic_matrix[0,0]
    x = (compressed_u_idxs - cx) * z / fx
    cy = intrinsic_matrix[1,2]
    fy = intrinsic_matrix[1,1]
    # Flip y as we want +y pointing up not down
    y = -((compressed_v_idxs - cy) * z / fy)

    # Apply camera_matrix to pointcloud as to get the pointcloud in world coords
    if extrinsic_matrix is not None:
        # Calculate camera pose from extrinsic matrix
        camera_matrix = np.linalg.inv(extrinsic_matrix)
        # Create homogenous array of vectors by adding 4th entry of 1
        # At the same time flip z as for eye space the camera is looking down the -z axis
        w = np.ones(z.shape)
        x_y_z_eye_hom = np.vstack((x, y, -z, w))
        # Transform the points from eye space to world space
        x_y_z_world =, x_y_z_eye_hom)[:3]
        return x_y_z_world.T
        x_y_z_local = np.vstack((x, y, z))
        return x_y_z_local.T

def main():
    kit = OmniKitHelper(config=CONFIG)

    from pxr import UsdGeom, Semantics
    from omni.isaac.synthetic_utils import SyntheticDataHelper
    import carb
    from pxr import UsdGeom, Gf, UsdPhysics, Sdf, UsdLux
    import omni.ext
    import omni.usd
    import omni.kit.settings

    sd_helper = SyntheticDataHelper()

    from omni.syntheticdata import visualize, helpers
    from omni.isaac.utils.scripts.scene_utils import set_up_z_axis
    import omni.isaac.utils.scripts.scene_utils as scene_utils
    from omni.isaac.utils import _isaac_utils
    from omni.isaac.utils.scripts.scene_utils import set_translate

    # Get the current stage
    stage = kit.get_stage()

    # Add a distant light
    stage.DefinePrim("/World/Light", "DistantLight")

    # Add object
    asset = stage.DefinePrim(f"/World/Cube", "Cube")  # create an empty Xform at the given path
    set_translate(asset, Gf.Vec3d(0,0,0))
    # Add semantic label based on asset type
    sem = Semantics.SemanticsAPI.Apply(asset, "Semantics")
    # Update to register objecs

    # Add camera  
    # needs to happen here becuase otherwise the depth and semantic segmentation
    # don't work, not entirely sure why but I don't like it
    camera_path = "/World/Camera"
    camera_prim = stage.DefinePrim(camera_path, "Camera")
    # Set as current camera
    viewport_interface = omni.kit.viewport.get_viewport_interface()
    viewport = viewport_interface.get_viewport_window()
    viewport.set_camera_position(camera_path, 0.0, 0.0, 20.0, True)
    viewport.set_camera_target(camera_path, 0.0, 0.0, 0.0, True)

    # Get ground truths
    # Calling it once returned zeros for some reason, but calling it twice, now its happy
    gt = sd_helper.get_groundtruth(
    gt = sd_helper.get_groundtruth(

    depth_linear = gt["depthLinear"]
    semantic_seg = gt["semanticSegmentation"]
    intrinsic_matrix = get_intrinsic_matrix(viewport)
    extrinsic_matrix = get_extrinsic_matrix(viewport, meters=True)
    points = pointcloud_from_mask_and_depth(depth_linear, semantic_seg, 1, intrinsic_matrix, extrinsic_matrix)

    # Add pointcloud to scene
    points = (1.0/get_meters_per_unit()) * points
    draw = _isaac_utils.debug_draw.acquire_debug_draw_interface()
    # Draw lines
    draw.draw_points(points, [(1, 0, 0, 1)] * len(points), [1]*len(points))


    # Add pointcloud to scene
    points = (1.0/get_meters_per_unit()) * points
    draw = _isaac_utils.debug_draw.acquire_debug_draw_interface()
    # Draw lines
    draw.draw_points(points, [(1, 0, 0, 1)] * len(points), [1]*len(points))

    # Get ground truths
    gt = sd_helper.get_groundtruth(
    # Setup a figure
    fig, axes = plt.subplots(2, 2, figsize=(20, 7))
    axes = axes.flat
    for ax in axes:

    # RGB

    # DEPTH
    depth_data = np.clip(gt["depth"], 0, 255)

    axes[2].set_title("Semantic Segmentation")
    semantic_seg = gt["semanticSegmentation"]
    semantic_rgb = visualize.colorize_segmentation(semantic_seg)

    # Save figure
    target_fig_path = os.path.join(os.path.dirname(__file__), '../data/result_images/rgbd_pointcloud_0.png')
    print(f"Saved figure to {target_fig_path}")
    # cleanup

if __name__ == "__main__":

Before version 2021.1.1, there were greater issues with the depth map due to some issues with the camera model’s vertical aperture: Incorrect depth image generated by camera in IsaacSim. Evidently this issue was fixed, however, I am using 2021.1.1.

I think I see the issue, there is a bug in the way the


function works.
in the next release (mid october hopefully) we have a simpler api

width, height = viewport.get_texture_resolution()

for now can you set width and height manually and see if that resolves the issue?

    width = 1280
    height = 720

Yup that was the issue, thank you. Works perfectly now! Is there a different way to programmatically get the viewport’s height and width or will I just have to wait until 2021.1.2?

You can actually get it from the RGB/depth image size, as that directly corresponds to the render resolution.

1 Like