Different output of Mask R-CNN using script vs 'tao deploy mask_rcnn inference'

Dear @Morganh

I am trying to test a Mask R-CNN model on some images. I have observed that when I test the engine using tao deploy mask_rcnn inference I get the correct output, but when I run the same engine through my own script I get a different result.

The below command, run inside the notebook, gives the correct output:

# Running inference for detection on a dir of images
!tao deploy mask_rcnn inference -i $DATA_DOWNLOAD_DIR/raw-data/test \
                                -r $USER_EXPERIMENT_DIR/maskrcnn_annotated_images \
                                -e $SPECS_DIR/maskrcnn_train_resnet50.txt \
                                -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/model.epoch-9.uff.engine \
                                -c $SPECS_DIR/coco_labels.txt \
                                -t 0.5

[Output image attached]

But when I test the same image with the following script, I get a different output.

Script:

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2

# Constants
BATCH_SIZE = 1
INPUT_SHAPE = (BATCH_SIZE, 3, 640, 640)  # Batch size 1, 3 channels, 640x640 resolution
THRESHOLD = 0.5  # Confidence threshold for detections
NMS_SIZE = 100  # Max number of detections after NMS
MASK_SIZE = 28  # Size of masks
N_CLASSES = 2  # Number of classes

def load_engine(engine_file_path):
    """Load the TensorRT engine from file."""
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="")
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def allocate_buffers(engine):
    """Allocate input and output buffers for TensorRT engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    for binding in engine:
        binding_shape = engine.get_tensor_shape(binding)
        size = trt.volume(binding_shape)
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append({"host": host_mem, "device": device_mem})
        else:
            outputs.append({"host": host_mem, "device": device_mem})
    return inputs, outputs, bindings, stream

def preprocess_image(image_path, input_shape):
    """Preprocess input image to match the engine's input size."""
    image = cv2.imread(image_path)
    original_h, original_w = image.shape[:2]
    input_h, input_w = input_shape[2:]
    
    # Calculate scaling factors
    scale_x = input_w / original_w
    scale_y = input_h / original_h
    
    resized_image = cv2.resize(image, (input_w, input_h))
    normalized_image = resized_image.astype(np.float32) / 255.0  # Normalize to [0, 1]
    transposed_image = np.transpose(normalized_image, (2, 0, 1))  # HWC to CHW
    batch_image = np.expand_dims(transposed_image, axis=0)  # Add batch dimension
    return batch_image.astype(np.float32), image, scale_x, scale_y

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1, execute_v2=False):
    """Generalized function for multiple inputs/outputs."""
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp["device"], inp["host"], stream)
    # Run inference: execute_async is the legacy implicit-batch API used by
    # UFF-era engines; execute_async_v2 is required for explicit-batch engines.
    if execute_v2:
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    else:
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out["host"], out["device"], stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out["host"] for out in outputs]

def trt_output_process_fn(y_pred, nms_size, mask_size, n_classes):
    """Process raw output from TRT engine."""
    y_detection = y_pred[0].reshape((-1, nms_size, 6))
    y_mask = y_pred[1].reshape((-1, nms_size, n_classes, mask_size, mask_size))
    y_mask[y_mask < 0] = 0
    return [y_detection, y_mask]
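# NOTE (assumption): the reshapes above take the engine's output bindings in the
# order [detections, masks]. Binding order depends on how the engine was built,
# so print the binding names/shapes in allocate_buffers and confirm y_pred[0]
# really is the detection tensor before trusting this indexing.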

def postprocess_output(outputs, image, scale_x, scale_y, threshold=THRESHOLD):
    """Post-process the output to overlay detections on the original image."""
    y_pred = [i.reshape(BATCH_SIZE, -1)[:BATCH_SIZE] for i in outputs]
    processed_outputs = trt_output_process_fn(y_pred, NMS_SIZE, MASK_SIZE, N_CLASSES)
    print("processed_outputs: ",processed_outputs)

    bs, nd, _, _, _ = processed_outputs[1].shape
    # Per-detection masks; entries stay None when no valid class is predicted
    masks = [[None] * nd for _ in range(bs)]

    for b in range(bs):
        for n in range(nd):
            class_idx = processed_outputs[0][..., -2][b, n]
            masks[b][n] = processed_outputs[1][b, n, int(class_idx), ...] if class_idx >= 0 else None
    masks = np.array(masks, dtype=object)  # object dtype: entries are 28x28 arrays or None

    bboxes = processed_outputs[0][..., :4]
    scores = processed_outputs[0][..., -1]
    classes = processed_outputs[0][..., -2]

    for i in range(len(scores[0])):
        if scores[0][i] > threshold:
            # Reverse scaling for bounding boxes
            # NOTE (assumption): this treats each box as (x1, y1, x2, y2); verify
            # the ordering against the tao_deploy postprocessing, since MaskRCNN
            # engines may emit (y1, x1, y2, x2) instead
            x1, y1, x2, y2 = bboxes[0][i]
            x1, x2 = int(x1 / scale_x), int(x2 / scale_x)
            y1, y2 = int(y1 / scale_y), int(y2 / scale_y)
            # Clamp to image bounds so the overlay slicing cannot go out of range
            x1, x2 = max(x1, 0), min(x2, image.shape[1])
            y1, y2 = max(y1, 0), min(y2, image.shape[0])
            
            # Draw the bounding box
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            
            # Process mask (binarized at the same 0.5 threshold)
            mask = masks[0][i]
            if mask is not None and x2 > x1 and y2 > y1:
                mask = (mask > threshold).astype(np.uint8)
                mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_LINEAR)
                
                # Handle exact mask dimensions to prevent broadcast issues
                mask_resized = mask_resized[:y2 - y1, :x2 - x1]  # Crop to bbox dimensions
                
                colored_mask = np.zeros_like(image, dtype=np.uint8)
                colored_mask[y1:y1 + mask_resized.shape[0], x1:x1 + mask_resized.shape[1], 1] = mask_resized * 255
                image = cv2.addWeighted(image, 1, colored_mask, 0.5, 0)
    return image

if __name__ == "__main__":
    engine_file = "model.epoch-9.uff.engine"  # Path to your TensorRT engine
    image_path = "./test/CARTON3.jpg"  # Path to the input image

    # Load TensorRT engine
    engine = load_engine(engine_file)
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    context = engine.create_execution_context()

    # Preprocess input image
    input_image, original_image, scale_x, scale_y = preprocess_image(image_path, INPUT_SHAPE)
    np.copyto(inputs[0]["host"], input_image.ravel())

    # Run inference
    raw_outputs = do_inference(context, bindings, inputs, outputs, stream, batch_size=BATCH_SIZE)

    # Post-process and visualize the result
    result_image = postprocess_output(raw_outputs, original_image, scale_x, scale_y)
    cv2.imwrite("processed_img.jpg", result_image)

I have referred to code from:

Output: [output image attached]

Can you please suggest where the gaps are?
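
One gap I already suspect is preprocessing: my script resizes without preserving the aspect ratio and only divides by 255.0, while tao deploy may apply a different pipeline (for example an aspect-ratio-preserving resize with padding, a different normalization, or RGB instead of BGR channel order). As a hypothetical A/B test, with the exact steps still to be confirmed against the tao_deploy source, a padded resize could look like this:

def preprocess_image_padded(image_path, input_shape):
    """Hypothetical alternative preprocessing: aspect-ratio-preserving resize
    plus bottom/right zero padding. NOTE: channel order, normalization, and
    padding value are assumptions to verify against the tao_deploy source."""
    image = cv2.imread(image_path)  # BGR; the engine may expect RGB
    original_h, original_w = image.shape[:2]
    input_h, input_w = input_shape[2:]

    # Single scale factor so the image fits inside the network input
    scale = min(input_w / original_w, input_h / original_h)
    new_w, new_h = int(original_w * scale), int(original_h * scale)
    resized = cv2.resize(image, (new_w, new_h))

    # Pad bottom/right to the full input size
    padded = np.zeros((input_h, input_w, 3), dtype=np.uint8)
    padded[:new_h, :new_w] = resized

    normalized = padded.astype(np.float32) / 255.0  # placeholder normalization
    chw = np.transpose(normalized, (2, 0, 1))       # HWC to CHW
    batch = np.expand_dims(chw, axis=0)
    # Same return signature as preprocess_image; with a single scale factor,
    # boxes map back as coordinate / scale
    return batch.astype(np.float32), image, scale, scale

Since it keeps the return signature of preprocess_image, it can be swapped in directly to check whether the detections move closer to the tao deploy output.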

Thanks

Could you narrow this down by leveraging the tao_deploy code again?
You can try it this way:

  1. Inside the tao_deploy docker, git clone the code and make sure the mask_rcnn inference can work.
  2. Then find the actual Python code of the mask_rcnn inference (it should be under /usr/…); see the sketch after this list for one way to locate it.
  3. Generate standalone code that works.
  4. Delete the code you are not interested in.
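
For step 2, one quick way to locate the installed sources from inside the container is to ask Python where the package lives. A minimal sketch, assuming the package is named nvidia_tao_deploy (verify the actual name in your container):

import importlib.util
import pathlib

# Locate the installed tao_deploy package root
# (the package name nvidia_tao_deploy is an assumption; verify it)
spec = importlib.util.find_spec("nvidia_tao_deploy")
pkg_root = pathlib.Path(spec.origin).parent
print("package root:", pkg_root)

# Print every mask_rcnn-related source file, including the inference entrypoint
for py in sorted(pkg_root.rglob("*.py")):
    if "mask_rcnn" in str(py):
        print(py)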