I am trying to use the linear depth data from the camera together with the semantic segmentation to create a segmented point cloud. Combining the two into a segmented depth map is straightforward. As for converting the depth map to a point cloud, I followed a few guides, so I am fairly sure the math is correct; but when I get the point cloud back and display it on screen, it is offset from the real object. The offset grows as the object moves toward the bottom-right corner of the viewport and shrinks as the object approaches the top-left corner. Because of this, I believe the issue lies either with the intrinsic matrix returned by the simulation or with some mistake in my method of going from depth data to a point cloud.

You can see the issue in this image:

Could someone help me get a properly aligned point cloud? Here is a dummy script that demonstrates the issue and each part of the process. Some of the code comes from the useful snippets in the documentation, and the rest are utility functions I created.

```
#!/usr/bin/env python
import copy
import os
import omni
import random
import numpy as np
from omni.isaac.python_app import OmniKitHelper
import matplotlib.pyplot as plt
import carb
from omni.isaac.utils.scripts.nucleus_utils import find_nucleus_server
# Kit startup configuration: experience file, render resolution, window size,
# headless mode, and renderer used by OmniKitHelper.
CONFIG = {
    "experience": f'{os.environ["EXP_PATH"]}/omni.isaac.sim.python.kit',
    "width": 1280,       # render (and ground-truth) resolution
    "height": 720,
    "window_width": 1920,
    "window_height": 1080,
    "headless": True,    # no UI window; rendering still happens off-screen
    "renderer": "RayTracedLighting"
}
def get_meters_per_unit():
    """Return the meters-per-unit scale factor of the current USD stage."""
    from pxr import UsdGeom
    usd_stage = omni.usd.get_context().get_stage()
    return UsdGeom.GetStageMetersPerUnit(usd_stage)
def gf_as_numpy(gf_matrix)->np.array:
    """Convert a pxr.Gf matrix to a numpy array.

    The matrix is transposed on conversion so the result follows numpy
    matrix conventions.

    Args:
        gf_matrix (Gf.Matrix_d): Gf matrix to convert

    Returns:
        np.array: Transposed numpy copy of the matrix.
    """
    # Gf matrices iterate row by row; collect the rows, then transpose.
    rows = [row for row in gf_matrix]
    return np.array(rows).T
def get_intrinsic_matrix(viewport):
    """Build the 3x3 pinhole intrinsic matrix for the viewport's active camera.

    Args:
        viewport: Viewport window whose active camera is queried.

    Returns:
        np.array: 3x3 intrinsic matrix [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
    """
    # Read the camera parameters from the USD stage.
    stage = omni.usd.get_context().get_stage()
    cam_prim = stage.GetPrimAtPath(viewport.get_active_camera())
    focal_length = cam_prim.GetAttribute("focalLength").Get()
    horiz_aperture = cam_prim.GetAttribute("horizontalAperture").Get()
    x_min, y_min, x_max, y_max = viewport.get_viewport_rect()
    width = x_max - x_min
    height = y_max - y_min
    # Pixels are square, so the vertical aperture follows from the aspect ratio.
    vert_aperture = height / width * horiz_aperture
    # Focal lengths in pixels; principal point at the image center.
    fx = width * focal_length / horiz_aperture
    fy = height * focal_length / vert_aperture
    cx = width * 0.5
    cy = height * 0.5
    return np.array([
        [fx, 0, cx],
        [0, fy, cy],
        [0, 0, 1],
    ])
def get_extrinsic_matrix(viewport, meters=False):
    """Return the view matrix (world -> eye) of the viewport's active camera.

    Args:
        viewport: Viewport window whose active camera is queried.
        meters (bool): When True, scale the translation column from stage
            units into meters.

    Returns:
        np.array: 4x4 view matrix, the inverse of the camera pose.
    """
    from pxr import UsdGeom
    # Query the camera pose from the USD stage.
    stage = omni.usd.get_context().get_stage()
    cam_prim = stage.GetPrimAtPath(viewport.get_active_camera())
    pose = gf_as_numpy(UsdGeom.Camera(cam_prim).GetLocalTransformation())
    if meters:
        pose[:, 3] = pose[:, 3] * get_meters_per_unit()
    # The extrinsic (view) matrix is the inverse of the camera pose.
    return np.linalg.inv(pose)
def freq_count(v:np.array)->np.array:
    """Count how many times each element occurs in an array.

    Args:
        v (np.array): 1D array to count.

    Returns:
        np.array: Frequency list [[num, count], [num, count], ...]
    """
    values, occurrences = np.unique(v, return_counts=True)
    # Pair each unique value with its count, one row per value.
    return np.column_stack((values, occurrences))
def pointcloud_from_mask_and_depth(depth:np.array, mask:np.array, mask_val:int, intrinsic_matrix:np.array, extrinsic_matrix:np.array=None):
    """Back-project the depth pixels selected by a segmentation mask to 3D points.

    Args:
        depth (np.array): (H, W) linear depth image.
        mask (np.array): (H, W) segmentation image.
        mask_val (int): Mask value selecting which pixels to back-project.
        intrinsic_matrix (np.array): 3x3 pinhole camera intrinsics.
        extrinsic_matrix (np.array, optional): 4x4 view matrix (world -> eye).
            When given, points are returned in world coordinates; otherwise
            they are returned in the local camera frame.

    Returns:
        np.array: (N, 3) array of 3D points, one row per selected pixel.
    """
    depth = np.array(depth).squeeze()
    mask = np.array(mask).squeeze()
    # Mask out pixels that are not the requested class or are implausibly far.
    masked_depth = np.ma.masked_where(mask != mask_val, depth)
    masked_depth = np.ma.masked_greater(masked_depth, 8000)
    # BUGFIX: use getmaskarray(), which always yields a full boolean array.
    # Plain `.mask` is the scalar ma.nomask when no element is masked, and
    # indexing with `~nomask` adds a spurious axis, corrupting the output shape.
    invalid = np.ma.getmaskarray(masked_depth)
    # Pixel coordinate grids (v = row index, u = column index).
    idxs = np.indices(masked_depth.shape)
    u_idxs = idxs[1]
    v_idxs = idxs[0]
    # Keep only valid (unmasked) depths and their pixel coordinates.
    z = masked_depth.data[~invalid]
    compressed_u_idxs = u_idxs[~invalid]
    compressed_v_idxs = v_idxs[~invalid]
    # Back-project with the pinhole model: X = (u - cx) * Z / fx, etc.
    cx = intrinsic_matrix[0,2]
    fx = intrinsic_matrix[0,0]
    x = (compressed_u_idxs - cx) * z / fx
    cy = intrinsic_matrix[1,2]
    fy = intrinsic_matrix[1,1]
    # Flip y as we want +y pointing up not down (image rows grow downward).
    y = -((compressed_v_idxs - cy) * z / fy)
    # Apply camera_matrix to pointcloud as to get the pointcloud in world coords
    if extrinsic_matrix is not None:
        # Calculate camera pose from extrinsic matrix
        camera_matrix = np.linalg.inv(extrinsic_matrix)
        # Create homogenous array of vectors by adding 4th entry of 1.
        # At the same time flip z as for eye space the camera is looking down the -z axis.
        w = np.ones(z.shape)
        x_y_z_eye_hom = np.vstack((x, y, -z, w))
        # Transform the points from eye space to world space
        x_y_z_world = np.dot(camera_matrix, x_y_z_eye_hom)[:3]
        return x_y_z_world.T
    else:
        x_y_z_local = np.vstack((x, y, z))
        return x_y_z_local.T
def main():
    """Build a minimal scene, render ground truth, and draw a segmented point cloud."""
    kit = OmniKitHelper(config=CONFIG)
    # Omniverse modules can only be imported after the kit app has started.
    from pxr import UsdGeom, Semantics
    from omni.isaac.synthetic_utils import SyntheticDataHelper
    import carb
    from pxr import UsdGeom, Gf, UsdPhysics, Sdf, UsdLux
    import omni.ext
    import omni.usd
    import omni.kit.settings
    sd_helper = SyntheticDataHelper()
    from omni.syntheticdata import visualize, helpers
    from omni.isaac.utils.scripts.scene_utils import set_up_z_axis
    import omni.isaac.utils.scripts.scene_utils as scene_utils
    from omni.isaac.utils import _isaac_utils
    from omni.isaac.utils.scripts.scene_utils import set_translate
    # SCENE SETUP
    # Get the current stage
    stage = kit.get_stage()
    set_up_z_axis(stage)
    # Add a distant light
    stage.DefinePrim("/World/Light", "DistantLight")
    # Add object
    asset = stage.DefinePrim(f"/World/Cube", "Cube")  # create an empty Xform at the given path
    set_translate(asset, Gf.Vec3d(0,0,0))
    # Add semantic label based on asset type
    sem = Semantics.SemanticsAPI.Apply(asset, "Semantics")
    sem.CreateSemanticTypeAttr()
    sem.CreateSemanticDataAttr()
    sem.GetSemanticTypeAttr().Set("class")
    sem.GetSemanticDataAttr().Set("Cube")
    # Update to register objects
    kit.update()
    # Add camera
    # needs to happen here because otherwise the depth and semantic segmentation
    # don't work, not entirely sure why but I don't like it
    camera_path = "/World/Camera"
    camera_prim = stage.DefinePrim(camera_path, "Camera")
    # Set as current camera
    viewport_interface = omni.kit.viewport.get_viewport_interface()
    viewport = viewport_interface.get_viewport_window()
    viewport.set_active_camera(camera_path)
    viewport.set_camera_position(camera_path, 0.0, 0.0, 20.0, True)
    viewport.set_camera_target(camera_path, 0.0, 0.0, 0.0, True)
    kit.update()
    # Get ground truths
    # Calling it once returned zeros for some reason, but calling it twice, now its happy
    gt = sd_helper.get_groundtruth(
        [
            "rgb",
            "depth",
            "semanticSegmentation",
            "depthLinear"
        ],
        viewport,
    )
    gt = sd_helper.get_groundtruth(
        [
            "rgb",
            "depth",
            "semanticSegmentation",
            "depthLinear"
        ],
        viewport,
    )
    # POINTCLOUD
    depth_linear = gt["depthLinear"]
    semantic_seg = gt["semanticSegmentation"]
    intrinsic_matrix = get_intrinsic_matrix(viewport)
    extrinsic_matrix = get_extrinsic_matrix(viewport, meters=True)
    # mask_val=1 assumes the cube is segmentation class 1 — TODO confirm against
    # the semanticSegmentation label mapping.
    points = pointcloud_from_mask_and_depth(depth_linear, semantic_seg, 1, intrinsic_matrix, extrinsic_matrix)
    # Add pointcloud to scene
    # Convert the point cloud (in meters) back into stage units for drawing.
    points = (1.0/get_meters_per_unit()) * points
    draw = _isaac_utils.debug_draw.acquire_debug_draw_interface()
    # Draw lines
    draw.draw_points(points, [(1, 0, 0, 1)] * len(points), [1]*len(points))
    kit.update()
    kit.update()
    # Add pointcloud to scene
    # NOTE(review): this block duplicates the drawing above AND rescales
    # `points` a second time, so this draw uses differently scaled points —
    # likely an unintended copy-paste; verify before relying on the output.
    points = (1.0/get_meters_per_unit()) * points
    draw = _isaac_utils.debug_draw.acquire_debug_draw_interface()
    # Draw lines
    draw.draw_points(points, [(1, 0, 0, 1)] * len(points), [1]*len(points))
    # Get ground truths
    gt = sd_helper.get_groundtruth(
        [
            "rgb",
            "depth",
            "semanticSegmentation",
            "depthLinear"
        ],
        viewport,
    )
    # GROUNDTRUTH VISUALIZATION
    # Setup a figure
    fig, axes = plt.subplots(2, 2, figsize=(20, 7))
    axes = axes.flat
    for ax in axes:
        ax.axis("off")
    # RGB
    axes[0].set_title("RGB")
    axes[0].imshow(gt["rgb"])
    # DEPTH
    axes[1].set_title("Depth")
    depth_data = np.clip(gt["depth"], 0, 255)
    axes[1].imshow(visualize.colorize_depth(depth_data.squeeze()))
    # SEMANTIC SEGMENTATION
    axes[2].set_title("Semantic Segmentation")
    semantic_seg = gt["semanticSegmentation"]
    semantic_rgb = visualize.colorize_segmentation(semantic_seg)
    axes[2].imshow(semantic_rgb)
    # Save figure
    target_fig_path = os.path.join(os.path.dirname(__file__), '../data/result_images/rgbd_pointcloud_0.png')
    plt.savefig(target_fig_path)
    print(f"Saved figure to {target_fig_path}")
    # cleanup
    kit.stop()
    kit.shutdown()
if __name__ == "__main__":
    main()
```

Before version 2021.1.1, there were greater issues with the depth map due to problems with the camera model's vertical aperture: Incorrect depth image generated by camera in IsaacSim. Evidently that issue was fixed; however, I am using 2021.1.1.