Getting wrong results from ONNX file using custom script (TLT to ONNX)

Hi @Morganh ,

I have converted a model from TLT to ONNX. But when I run inference, it gives me multiple bounding boxes. Please suggest where I am making a mistake. Below is the code.

```python
import onnxruntime as ort
import onnx
import cv2
import numpy as np

def validate_onnx_model(model_onnx_path):
    try:
        onnx_model = onnx.load(model_onnx_path)
        onnx.checker.check_model(onnx_model)
        print("ONNX model is valid.")
        return True
    except Exception as e:
        print(f"ONNX model validation failed: {e}")
        return False

def model_detection_init(model_onnx_path):
    try:
        session = ort.InferenceSession(model_onnx_path)
        input_name = session.get_inputs()[0].name
        output_names = [output.name for output in session.get_outputs()]
        
        input_shape = session.get_inputs()[0].shape
        detection_height, detection_width = input_shape[2], input_shape[3]
        
        return session, input_name, output_names, detection_height, detection_width
    except Exception as e:
        print("Exception in Detection model load:", e)
        return None

model_h = 544
model_w = 960
stride = 16
box_norm = 35.0

grid_h = int(model_h / stride)
grid_w = int(model_w / stride)
grid_size = grid_h * grid_w

grid_centers_w = [(i * stride + 0.5) / box_norm for i in range(grid_w)]
grid_centers_h = [(i * stride + 0.5) / box_norm for i in range(grid_h)]

def applyBoxNorm(o1, o2, o3, o4, x, y):
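    # Convert the normalized grid-cell offsets to absolute pixel coordinates:
    # subtract/add the cell centre (in box_norm units) and scale back by box_norm.
    # (x, y) is the grid cell column/row.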
    o1 = (o1 - grid_centers_w[x]) * -box_norm
    o2 = (o2 - grid_centers_h[y]) * -box_norm
    o3 = (o3 + grid_centers_w[x]) * box_norm
    o4 = (o4 + grid_centers_h[y]) * box_norm
    return o1, o2, o3, o4

def postprocess(outputs, min_confidence, analysis_classes, wh_format=True):
    bbs = []
    class_ids = []
    scores = []
    for c in analysis_classes:
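        # Assumed flattened layout of the bbox output: for each class c there are
        # four contiguous blocks of grid_size values (x1, y1, x2, y2), each block
        # indexed per grid cell as w + h * grid_w.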
        x1_idx = c * 4 * grid_size
        y1_idx = x1_idx + grid_size
        x2_idx = y1_idx + grid_size
        y2_idx = x2_idx + grid_size

        boxes = outputs[0]
        for h in range(grid_h):
            for w in range(grid_w):
                i = w + h * grid_w
                score = outputs[1][c * grid_size + i]
                if score >= min_confidence:
                    o1 = boxes[x1_idx + w + h * grid_w]
                    o2 = boxes[y1_idx + w + h * grid_w]
                    o3 = boxes[x2_idx + w + h * grid_w]
                    o4 = boxes[y2_idx + w + h * grid_w]

                    o1, o2, o3, o4 = applyBoxNorm(o1, o2, o3, o4, w, h)

                    xmin = int(o1)
                    ymin = int(o2)
                    xmax = int(o3)
                    ymax = int(o4)
                    if wh_format:
                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                    else:
                        bbs.append([xmin, ymin, xmax, ymax])
                    class_ids.append(c)
                    scores.append(float(score))

    return bbs, class_ids, scores

NUM_CLASSES = 3
threshold = 0.01

def vehicle_detection(frames, session, input_name, output_names, detection_height, detection_width):
    try:
        input_images = np.stack([cv2.resize(image, (detection_width, detection_height)) for image in frames])
        input_images = input_images.transpose((0, 3, 1, 2)).astype(np.float32)

        results = session.run(output_names, {input_name: input_images})

        # 6120 = NUM_CLASSES * grid_h * grid_w (3 * 34 * 60) coverage scores
        score_info = results[1].reshape(6120)
        output_tensor = np.squeeze(results[0])

        # 24480 = NUM_CLASSES * 4 * grid_h * grid_w (3 * 4 * 34 * 60) bbox offsets
        outputs = output_tensor.reshape(24480)

        bboxes, class_ids, scores = postprocess([outputs, score_info], threshold, list(range(NUM_CLASSES)))
        print("bboxes : ",class_ids)

        # Filter out boxes with low confidence scores
        filtered_bboxes = []
        filtered_class_ids = []
        filtered_scores = []
        for i, score in enumerate(scores):
            print("score : ",score)
            if score >= threshold:
                filtered_bboxes.append(bboxes[i])
                filtered_class_ids.append(class_ids[i])
                filtered_scores.append(score)
                
        # Apply NMS
        if len(filtered_bboxes) > 0:
            # Adjust these parameters based on your requirements
            nms_threshold = 0.4  # overlapThreshold
            score_threshold = threshold  # scoreThreshold

            # Convert bounding boxes to the format required by NMSBoxes
            bboxes = np.array(filtered_bboxes)
            scores = np.array(filtered_scores)

            # Apply NMS
            indices = cv2.dnn.NMSBoxes(bboxes.tolist(), scores.tolist(), score_threshold, nms_threshold)

            print("indices : ",indices)

            # Draw remaining boxes after NMS
            for idx in indices:
                idx = int(idx)
                xmin, ymin, w, h = filtered_bboxes[idx]
                print("xmin, ymin, w, h ",xmin, ymin, w, h)
                if xmin > 0 and ymin > 0:
                    class_id = filtered_class_ids[idx]
                    color = [255, 0, 0] if class_id else [0, 0, 255]
                    cv2.rectangle(frames[0], (xmin, ymin), (xmin + w, ymin + h), color, 2)
                    cv2.imwrite("image1.jpg", frames[0])
        else:
            print("No valid detections after score filtering.")

    except Exception as e:
        print("Exception in vehicle detection:", e)

# Path to the ONNX model file
detection_model_onnx_path = '/home/smarg/Documents/openvino_container/MODEL/VehicleDetection_MobileNetV1_ReTrained_V1.6.onnx'

if validate_onnx_model(detection_model_onnx_path):
    session, input_name, output_names, detection_height, detection_width = model_detection_init(detection_model_onnx_path)
    if session:
        img = './input_image.jpg'
        frames = [cv2.imread(img)]
        vehicle_detection(frames, session, input_name, output_names, detection_height, detection_width)
    else:
        print("Failed to initialize the detection model.")
else:
    print("ONNX model validation failed.")
```

Below is the output image.

Please suggest.

Thanks.

Please suggest what I can do. The same .etlt file of this TLT model is working fine with the TensorRT script.

Below is the custom TensorRT script, which is working fine; both scripts have the same post-processing for the output layer.

```python
import os
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
import glob,json
# from NUMBER_PLATE_DETECTION import npd

trt_engine_path = "/root/data/Pritam/ANPR_SCRIPT/MODELS/VEHICLE_DETECTION/VehicleDetection_MobileNetV1_ReTrained_V1.6.etlt_fp16_b1.engine"

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine


def allocate_buffers(engine, batch_size=1):

    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    # The current NMS implementation in TRT only supports DataType.FLOAT, but
    # that may change in the future, which could break this sample
    # when using lower precision [e.g. the NMS output would no longer be
    # np.float32, even though binding_to_type assumes it is]

    binding_to_type = {
        "input_1": np.float32,
        "output_bbox/BiasAdd": np.float32,
        "output_cov/Sigmoid": np.float32,
    }

    for binding in engine:
        # print("bindings : ",bindings)
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = binding_to_type[str(binding)]
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# TensorRT logger singleton
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
# trt_engine_path = os.path.join("YOUR .TRT FILE HERE")

trt_runtime = trt.Runtime(TRT_LOGGER)
trt_engine = load_engine(trt_runtime, trt_engine_path)

# This allocates memory for network inputs/outputs on both CPU and GPU
# inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
inputs, outputs, bindings, stream=None,None,None,None 
# Execution context is needed for inference
# context = trt_engine.create_execution_context()

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    #print("context",context)
    #print("bindings",bindings)
    #print("inputs",inputs)
    #print("outputs",outputs)
    #print("stream",stream)

    # print("do_inference step 1")
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    print("do_inference step 2")

    context.execute_async(
            batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
        )
    
    print("do_inference step 3")
    print("outputs : ",outputs)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # print("do_inference step 4")
    # Synchronize the stream
    stream.synchronize()
    # print("do_inference step 5")
    # Return only the host outputs.
    print("outputs : ",outputs)
    return [out.host for out in outputs]


def process_image(arr, w, h):
    image = Image.fromarray(np.uint8(arr))

    image_resized = image.resize(size=(w, h), resample=Image.BILINEAR)
    img_np = np.array(image_resized)
    # HWC -> CHW
    img_np = img_np.transpose((2, 0, 1))
    # Normalize to [0.0, 1.0] interval (expected by model)
    img_np = (1.0 / 255.0) * img_np
    # #print(img_np.shape)
    img_np = img_np.ravel()
    return img_np


def predict(context,image, model_w, model_h):

    # print("predict step 1")
    img = process_image(image, model_w, model_h)
    # print("predict step 2")
    # #print(img.shape)
    # Copy it into appropriate place into memory
    # (self.inputs was returned earlier by allocate_buffers())
    np.copyto(inputs[0].host, img.ravel())

    # When inferring on a single image, we measure the inference
    # time and report it to the user
    inference_start_time = time.time()

    # Fetch output from the model
    # print("predict step 3")

    [detection_out, keepCount_out] = do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
    )
    # print("predict step 4")

    # Output inference time
    # #print(
    #     "TensorRT inference time: {} ms".format(
    #         int(round((time.time() - inference_start_time) * 1000))
    #     )
    # )

    # And return results
    return detection_out, keepCount_out


# -------------- MODEL PARAMETERS FOR DETECTNET_V2 --------------------------------
model_h = 544
model_w = 960
stride = 16
box_norm = 35.0

grid_h = int(model_h / stride)
grid_w = int(model_w / stride)
grid_size = grid_h * grid_w

grid_centers_w = []
grid_centers_h = []

for i in range(grid_h):
    value = (i * stride + 0.5) / box_norm
    grid_centers_h.append(value)

for i in range(grid_w):
    value = (i * stride + 0.5) / box_norm
    grid_centers_w.append(value)


def applyBoxNorm(o1, o2, o3, o4, x, y):

    o1 = (o1 - grid_centers_w[x]) * -box_norm
    o2 = (o2 - grid_centers_h[y]) * -box_norm
    o3 = (o3 + grid_centers_w[x]) * box_norm
    o4 = (o4 + grid_centers_h[y]) * box_norm
    return o1, o2, o3, o4


def postprocess(outputs, min_confidence, analysis_classes, wh_format=True):

    bbs = []
    class_ids = []
    scores = []
    
    for c in analysis_classes:
        
        print("analysis_classes : ",analysis_classes)
        print("grid_size:",grid_size)
        print("grid_h:",grid_h)
        print("grid_w:",grid_w)

        x1_idx = c * 4 * grid_size
        y1_idx = x1_idx + grid_size
        x2_idx = y1_idx + grid_size
        y2_idx = x2_idx + grid_size

        boxes = outputs[0]
        print("outputs : ",outputs[1].shape)
        for h in range(grid_h):
            for w in range(grid_w):
                i = w + h * grid_w
                score = outputs[1][c * grid_size + i]
                if score >= min_confidence:
                    o1 = boxes[x1_idx + w + h * grid_w]
                    o2 = boxes[y1_idx + w + h * grid_w]
                    o3 = boxes[x2_idx + w + h * grid_w]
                    o4 = boxes[y2_idx + w + h * grid_w]

                    o1, o2, o3, o4 = applyBoxNorm(o1, o2, o3, o4, w, h)

                    xmin = int(o1)
                    ymin = int(o2)
                    xmax = int(o3)
                    ymax = int(o4)
                    if wh_format:
                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                    else:
                        bbs.append([xmin, ymin, xmax, ymax])
                    class_ids.append(c)
                    scores.append(float(score))

    return bbs, class_ids, scores


def vehicleDetection(image_name):
    # count=0 
    # for image_path in glob.glob("./images/input_images/*"):
    # #     count+=1
    #     print(image_path)

    
    #     image_name = image_path.split("/")[-1]

    output_image_name = "./images/output_images_vehicle/"+image_name    
        # image = cv2.imread(image_path)[..., ::-1]
    # print("VD _step -1",image_name)
    file_path="./images/input_images/"+image_name
    # file_path=image_name

    image = cv2.imread(image_name)[..., ::-1]
    
    global inputs
    global outputs
    global bindings
    global stream 
    global trt_engine
    inputs, outputs, bindings, stream = allocate_buffers(trt_engine)

    context = trt_engine.create_execution_context()

    detection_out, keepCount_out = predict(context,image, model_w, model_h)

    # print(detection_out)
    # print("VD _step -2")
    
    NUM_CLASSES = 1
    threshold = 0.1
    # print("VD _step -3")
    bboxes, class_ids, scores = postprocess(
        [detection_out, keepCount_out], threshold, list(range(NUM_CLASSES))
    )
    # print("VD _step -4")

    image_cpy = image.copy()
    image_cpy = cv2.resize(image_cpy, (model_w, model_h))
    # print("VD _step -5")

    # Final bboxes are only taken after NMS
    # print("VD _step -6")
    indexes = cv2.dnn.NMSBoxes(bboxes, scores, threshold, 0.5)
    # print("VD _step -7")
    v_image = None
    
    for idx in indexes:
        idx = int(idx)
        xmin, ymin, w, h = bboxes[idx]
        if xmin >0 and ymin > 0:
            # print("b-boxes : ",xmin,ymin,w,h)
            class_id = class_ids[idx]
            color = [255, 0, 0] if class_id else [0, 0, 255]
            v_image = image_cpy[ymin:ymin + h,xmin:xmin + w]
            # npd(v_image) 
            cv2.rectangle(image_cpy, (xmin, ymin), (xmin + w, ymin + h), color, 2)
            cv2.imwrite(output_image_name,image_cpy)
    # image_cpy2 = cv2.cvtColor(image_cpy, cv2.COLOR_BGR2RGB) 
    return v_image

image_path = "/root/data/Pritam/ANPR_SCRIPT/images/input_images/_17534.jpg"
vehicleDetection(image_path)
```

Please suggest what the problem is with the ONNX model or its output post-processing logic.

Thanks.

So, the models have no issues. Can you refer to the tao-deploy code (tao_deploy/nvidia_tao_deploy/cv at main · NVIDIA/tao_deploy · GitHub) for preprocessing and postprocessing?

Yes, the model has no issue. Detections from this model are working fine with the custom TensorRT script and on DeepStream as well.
I am facing the problem only with the converted ONNX model.

I am not getting much clarity from the shared link.

Can you please suggest some updates to the script, i.e. what the post-processing should be?

Thanks.

For the detectnet_v2 network, please try to narrow down the issue on your side by leveraging tao_deploy/nvidia_tao_deploy/cv/detectnet_v2/postprocessor.py at main · NVIDIA/tao_deploy · GitHub or tao-toolkit-triton-apps/tao_triton/python/postprocessing/detectnet_processor.py at main · NVIDIA-AI-IOT/tao-toolkit-triton-apps · GitHub.
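For reference, here is a minimal sketch of what that reference-style grid decode looks like for DetectNet_v2, assuming the raw ONNX outputs have shapes (num_classes * 4, grid_h, grid_w) for the bbox head and (num_classes, grid_h, grid_w) for the coverage head; the exact output names and shapes depend on your export, so verify them with session.get_outputs() first. The math is the same as applyBoxNorm above, just vectorized over the grid:

```python
import numpy as np

def decode_detectnet_v2(bbox_out, cov_out, stride=16, box_norm=35.0, min_confidence=0.1):
    """Sketch of a DetectNet_v2 grid decode (assumed output layout)."""
    num_classes, grid_h, grid_w = cov_out.shape
    bbox_out = bbox_out.reshape(num_classes, 4, grid_h, grid_w)

    # Grid-cell centres in box_norm units, matching grid_centers_w / grid_centers_h above.
    cx = ((np.arange(grid_w) * stride + 0.5) / box_norm)[np.newaxis, np.newaxis, :]
    cy = ((np.arange(grid_h) * stride + 0.5) / box_norm)[np.newaxis, :, np.newaxis]

    xmin = (bbox_out[:, 0] - cx) * -box_norm
    ymin = (bbox_out[:, 1] - cy) * -box_norm
    xmax = (bbox_out[:, 2] + cx) * box_norm
    ymax = (bbox_out[:, 3] + cy) * box_norm

    boxes, class_ids, scores = [], [], []
    for c in range(num_classes):
        # Keep only grid cells whose coverage score clears the threshold.
        for y, x in zip(*np.nonzero(cov_out[c] >= min_confidence)):
            boxes.append([int(xmin[c, y, x]), int(ymin[c, y, x]),
                          int(xmax[c, y, x] - xmin[c, y, x]),
                          int(ymax[c, y, x] - ymin[c, y, x])])
            class_ids.append(c)
            scores.append(float(cov_out[c, y, x]))
    return boxes, class_ids, scores
```

The resulting boxes can then go through cv2.dnn.NMSBoxes exactly as in the scripts above. Also note that the two scripts are not strictly identical in post-processing: the TensorRT script uses NUM_CLASSES = 1 with threshold 0.1, while the ONNX scripts use NUM_CLASSES = 3 (and the first one uses threshold 0.01), which alone can produce many extra boxes.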

Sure.
Do you think there is a problem with preprocessing as well?

Please have a look at the code below.

```python
import onnxruntime as ort
import onnx
import cv2
import numpy as np

def validate_onnx_model(model_onnx_path):
    try:
        onnx_model = onnx.load(model_onnx_path)
        onnx.checker.check_model(onnx_model)
        print("ONNX model is valid.")
        return True
    except Exception as e:
        print(f"ONNX model validation failed: {e}")
        return False

def model_detection_init(model_onnx_path):
    try:
        session = ort.InferenceSession(model_onnx_path)
        input_name = session.get_inputs()[0].name
        output_names = [output.name for output in session.get_outputs()]
        
        input_shape = session.get_inputs()[0].shape
        detection_height, detection_width = input_shape[2], input_shape[3]
        
        return session, input_name, output_names, detection_height, detection_width
    except Exception as e:
        print("Exception in Detection model load:", e)
        return None

model_h = 544
model_w = 960
stride = 16
box_norm = 35.0

grid_h = int(model_h / stride)
grid_w = int(model_w / stride)
grid_size = grid_h * grid_w

grid_centers_w = [(i * stride + 0.5) / box_norm for i in range(grid_w)]
grid_centers_h = [(i * stride + 0.5) / box_norm for i in range(grid_h)]

def applyBoxNorm(o1, o2, o3, o4, x, y):
    o1 = (o1 + grid_centers_w[x]) * box_norm
    o2 = (o2 + grid_centers_h[y]) * box_norm
    o3 = (o3 + grid_centers_w[x]) * box_norm
    o4 = (o4 + grid_centers_h[y]) * box_norm

    # Ensure coordinates are within valid ranges
    o1 = max(0, min(o1, model_w))
    o2 = max(0, min(o2, model_h))
    o3 = max(0, min(o3, model_w))
    o4 = max(0, min(o4, model_h))
    
    print(f"applyBoxNorm: ({o1}, {o2}, {o3}, {o4}) at grid ({x}, {y})")
    return o1, o2, o3, o4



def postprocess(outputs, min_confidence, analysis_classes, wh_format=True):
    bbs = []
    class_ids = []
    scores = []
    for c in analysis_classes:
        x1_idx = c * 4 * grid_size
        y1_idx = x1_idx + grid_size
        x2_idx = y1_idx + grid_size
        y2_idx = x2_idx + grid_size

        boxes = outputs[0]
        for h in range(grid_h):
            for w in range(grid_w):
                i = w + h * grid_w
                score = outputs[1][c * grid_size + i]
                if score >= min_confidence:
                    o1 = boxes[x1_idx + w + h * grid_w]
                    o2 = boxes[y1_idx + w + h * grid_w]
                    o3 = boxes[x2_idx + w + h * grid_w]
                    o4 = boxes[y2_idx + w + h * grid_w]

                    print(f"Before applyBoxNorm: ({o1}, {o2}, {o3}, {o4}) at grid ({w}, {h})")
                    o1, o2, o3, o4 = applyBoxNorm(o1, o2, o3, o4, w, h)
                    print(f"After applyBoxNorm: ({o1}, {o2}, {o3}, {o4}) at grid ({w}, {h})")

                    xmin = int(o1)
                    ymin = int(o2)
                    xmax = int(o3)
                    ymax = int(o4)
                    if wh_format:
                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                    else:
                        bbs.append([xmin, ymin, xmax, ymax])
                    class_ids.append(c)
                    scores.append(float(score))


    return bbs, class_ids, scores

NUM_CLASSES = 3
threshold = 0.1

def vehicle_detection(frames, session, input_name, output_names, detection_height, detection_width):
    try:
        input_images = np.stack([cv2.resize(image, (detection_width, detection_height)) for image in frames])
        input_images = input_images.transpose((0, 3, 1, 2)).astype(np.float32)

        results = session.run(output_names, {input_name: input_images})

        # score_info = results[1].reshape(6120)
        print("used flatten method")
        score_info = results[1].ravel()

        output_tensor = np.squeeze(results[0])
        
        # outputs = output_tensor.reshape(24480)
        outputs = output_tensor.ravel()


        bboxes, class_ids, scores = postprocess([outputs, score_info], threshold, list(range(NUM_CLASSES)))
        print("bboxes : ",class_ids)

        # Filter out boxes with low confidence scores
        filtered_bboxes = []
        filtered_class_ids = []
        filtered_scores = []
        for i, score in enumerate(scores):
            # print("score : ",score)
            if score >= threshold:
                filtered_bboxes.append(bboxes[i])
                filtered_class_ids.append(class_ids[i])
                filtered_scores.append(score)

        # Apply NMS
        if len(filtered_bboxes) > 0:
            # Adjust these parameters based on your requirements
            nms_threshold = 0.4  # overlapThreshold
            score_threshold = threshold  # scoreThreshold

            # Convert bounding boxes to the format required by NMSBoxes
            bboxes = np.array(filtered_bboxes)
            scores = np.array(filtered_scores)

            # Apply NMS
            indices = cv2.dnn.NMSBoxes(bboxes.tolist(), scores.tolist(), score_threshold, nms_threshold)

            print("indices : ",indices)

            # Draw remaining boxes after NMS
            for idx in indices:
                idx = int(idx)
                xmin, ymin, w, h = filtered_bboxes[idx]
                print("xmin, ymin, w, h ",xmin, ymin, w, h)
                if xmin > 0 and ymin > 0:
                    class_id = filtered_class_ids[idx]
                    color = [255, 0, 0] if class_id else [0, 0, 255]
                    cv2.rectangle(frames[0], (xmin, ymin), (xmin + w, ymin + h), color, 2)
                    cv2.imwrite("image1.jpg", frames[0])
        else:
            print("No valid detections after score filtering.")

    except Exception as e:
        print("Exception in vehicle detection:", e)

# Path to the ONNX model file
detection_model_onnx_path = '/home/smarg/Documents/openvino_container/MODEL/VehicleDetection_MobileNetV1_ReTrained_V1.6.onnx'

if validate_onnx_model(detection_model_onnx_path):
    session, input_name, output_names, detection_height, detection_width = model_detection_init(detection_model_onnx_path)
    if session:
        img = './_17534.jpg'
        frames = [cv2.imread(img)]
        vehicle_detection(frames, session, input_name, output_names, detection_height, detection_width)
    else:
        print("Failed to initialize the detection model.")
else:
    print("ONNX model validation failed.")

Yes, please check the preprocessing as well. You can check whether the input array is the same between ONNX inference and TensorRT engine inference.
You already have working code for TensorRT engine inference, so you can leverage it to verify that the preprocessing is correct before ONNX inference.
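As a rough sketch of that check: the working TensorRT script converts BGR to RGB, resizes with PIL bilinear interpolation, scales to [0, 1], and transposes HWC to CHW, while the ONNX scripts above only resize with cv2.resize and transpose, with no 1/255 scaling and no channel swap. Reproducing the TensorRT-style preprocessing for the ONNX session and comparing the two input arrays would confirm whether this is the mismatch (what the exported ONNX actually expects depends on how it was exported, so treat this as a check rather than a definitive fix; the helper name below is just illustrative).

```python
import cv2
import numpy as np
from PIL import Image

def preprocess_like_trt(frame_bgr, model_w=960, model_h=544):
    """Replicate the TensorRT script's preprocessing (sketch)."""
    image = Image.fromarray(np.uint8(frame_bgr[..., ::-1]))            # BGR -> RGB
    image = image.resize((model_w, model_h), resample=Image.BILINEAR)
    img_np = np.array(image).transpose((2, 0, 1)).astype(np.float32)   # HWC -> CHW
    img_np *= 1.0 / 255.0                                              # normalize to [0, 1]
    return img_np[np.newaxis, ...]                                     # add batch dim: (1, 3, H, W)

frame = cv2.imread("./_17534.jpg")  # same test image as above
# What the current ONNX script feeds the session (resize + transpose only)
onnx_input = cv2.resize(frame, (960, 544)).transpose((2, 0, 1)).astype(np.float32)[np.newaxis, ...]
trt_like_input = preprocess_like_trt(frame)
print("max abs diff:", np.abs(onnx_input - trt_like_input).max())
```

If the arrays differ significantly, aligning the ONNX preprocessing with the TensorRT one is the first thing to try before revisiting the post-processing.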

Okay, I will try.
Meanwhile, if you have any inference script for a detectnet_v2 ONNX model, please share it.

Thanks.

There has been no update from you for a while, so we are assuming this is no longer an issue and are closing this topic. If you need further support, please open a new one. Thanks.