Dear @Morganh
I am trying to test a Mask R-CNN model on some images. I have observed that when I test the engine using tao deploy mask_rcnn inference I get the correct output, but when I test the same engine via my own script I get a different result.
The below command inside the notebook gives the correct output.
Command:
# Running inference for detection on a dir of images
!tao deploy mask_rcnn inference -i $DATA_DOWNLOAD_DIR/raw-data/test \
-r $USER_EXPERIMENT_DIR/maskrcnn_annotated_images \
-e $SPECS_DIR/maskrcnn_train_resnet50.txt \
-m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/model.epoch-9.uff.engine \
-c $SPECS_DIR/coco_labels.txt \
-t 0.5
[annotated output image]
But when I test the same image with the script below, I get a different output.
Script:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
# Constants
BATCH_SIZE = 1
INPUT_SHAPE = (BATCH_SIZE, 3, 640, 640) # Batch size 1, 3 channels, 640x640 resolution
THRESHOLD = 0.5 # Confidence threshold for detections
NMS_SIZE = 100 # Max number of detections after NMS
MASK_SIZE = 28 # Size of masks
N_CLASSES = 2 # Number of classes
def load_engine(engine_file_path):
    """Load the TensorRT engine from file."""
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="")
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
def allocate_buffers(engine):
    """Allocate input and output buffers for the TensorRT engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        binding_shape = engine.get_tensor_shape(binding)
        size = trt.volume(binding_shape)
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append({"host": host_mem, "device": device_mem})
        else:
            outputs.append({"host": host_mem, "device": device_mem})
    return inputs, outputs, bindings, stream
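# NOTE: inputs/outputs are collected in engine binding order, so my
# postprocessing assumes outputs[0] is the detection tensor and
# outputs[1] is the mask tensor; I have not verified this ordering.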
def preprocess_image(image_path, input_shape):
    """Preprocess the input image to match the engine's input size."""
    image = cv2.imread(image_path)
    original_h, original_w = image.shape[:2]
    input_h, input_w = input_shape[2:]
    # Calculate scaling factors
    scale_x = input_w / original_w
    scale_y = input_h / original_h
    resized_image = cv2.resize(image, (input_w, input_h))
    normalized_image = resized_image.astype(np.float32) / 255.0  # Normalize to [0, 1]
    transposed_image = np.transpose(normalized_image, (2, 0, 1))  # HWC to CHW
    batch_image = np.expand_dims(transposed_image, axis=0)  # Add batch dimension
    return batch_image.astype(np.float32), image, scale_x, scale_y
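# NOTE: this preprocessing (BGR channel order from cv2.imread, plain /255.0
# scaling, non-aspect-preserving resize) is my own guess; I have not
# confirmed it matches what tao deploy does before feeding the engine.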
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1, execute_v2=False):
    """Generalized inference function for multiple inputs/outputs."""
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp["device"], inp["host"], stream)
    # Run inference.
    if execute_v2:
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    else:
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out["host"], out["device"], stream)
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out["host"] for out in outputs]
def trt_output_process_fn(y_pred, nms_size, mask_size, n_classes):
    """Process the raw output from the TRT engine."""
    y_detection = y_pred[0].reshape((-1, nms_size, 6))
    y_mask = y_pred[1].reshape((-1, nms_size, n_classes, mask_size, mask_size))
    y_mask[y_mask < 0] = 0
    return [y_detection, y_mask]
def postprocess_output(outputs, image, scale_x, scale_y, threshold=THRESHOLD):
    """Post-process the output to overlay detections on the original image."""
    y_pred = [i.reshape(BATCH_SIZE, -1)[:BATCH_SIZE] for i in outputs]
    processed_outputs = trt_output_process_fn(y_pred, NMS_SIZE, MASK_SIZE, N_CLASSES)
    print("processed_outputs: ", processed_outputs)
    detections = {}
    bs, nd, _, _, _ = processed_outputs[1].shape
    masks = np.zeros((bs, nd)).tolist()
    for b in range(bs):
        for n in range(nd):
            class_idx = processed_outputs[0][..., -2][b, n]
            masks[b][n] = processed_outputs[1][b, n, int(class_idx), ...] if class_idx >= 0 else None
    masks = np.array(masks)
    bboxes = processed_outputs[0][..., :4]
    scores = processed_outputs[0][..., -1]
    classes = processed_outputs[0][..., -2]
    for i in range(len(scores[0])):
        if scores[0][i] > threshold:
            # Reverse scaling for bounding boxes
            x1, y1, x2, y2 = bboxes[0][i]
            x1, x2 = int(x1 / scale_x), int(x2 / scale_x)
            y1, y2 = int(y1 / scale_y), int(y2 / scale_y)
            # Draw the bounding box
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Process the mask
            mask = masks[0][i]
            if mask is not None:
                mask = (mask > threshold).astype(np.uint8)
                mask_resized = cv2.resize(mask, (x2 - x1, y2 - y1), interpolation=cv2.INTER_LINEAR)
                # Crop to exact bbox dimensions to prevent broadcast issues
                mask_resized = mask_resized[:y2 - y1, :x2 - x1]
                colored_mask = np.zeros_like(image, dtype=np.uint8)
                colored_mask[y1:y1 + mask_resized.shape[0], x1:x1 + mask_resized.shape[1], 1] = mask_resized * 255
                image = cv2.addWeighted(image, 1, colored_mask, 0.5, 0)
    return image
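# NOTE: I unpack each detection row as (x1, y1, x2, y2, class, score),
# i.e. class at index -2 and score at index -1; if the engine actually
# emits boxes as (y1, x1, y2, x2) the drawn boxes would be transposed.
# I copied this layout by eye and have not verified it.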
if __name__ == "__main__":
    engine_file = "model.epoch-9.uff.engine"  # Path to your TensorRT engine
    image_path = "./test/CARTON3.jpg"  # Path to the input image
    # Load the TensorRT engine
    engine = load_engine(engine_file)
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    context = engine.create_execution_context()
    # Preprocess the input image
    input_image, original_image, scale_x, scale_y = preprocess_image(image_path, INPUT_SHAPE)
    np.copyto(inputs[0]["host"], input_image.ravel())
    # Run inference
    raw_outputs = do_inference(context, bindings, inputs, outputs, stream, batch_size=BATCH_SIZE)
    # Post-process and visualize the result
    result_image = postprocess_output(raw_outputs, original_image, scale_x, scale_y)
    cv2.imwrite("processed_img.jpg", result_image)
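For debugging, I also dumped the engine's I/O bindings with the small sketch below, to check which output lands in outputs[0] versus outputs[1] (the names and shapes are whatever my engine reports; I have not hard-coded any):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="")
with open("model.epoch-9.uff.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Print name, mode (input/output), shape, and dtype for every binding.
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name,
          engine.get_tensor_mode(name),
          engine.get_tensor_shape(name),
          engine.get_tensor_dtype(name))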
I have referred to the code from
Output:
[script output image]
Can you please suggest where the gaps are?
Thanks