How to run a yolov3-tiny.engine generated by deepstream-app with the TensorRT Python API


Hello all
I trained a yolov3-tiny model with custom classes. When I run it with the deepstream-app objectDetector_Yolo sample, it works very well.

The predictions are also good with cv2.dnn:

But when I use the engine built by deepstream-app with the TensorRT Python API, the results are not good.
Please help me.

import ctypes
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
import cv2

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

class HostDeviceMem(object):
    """Pair a host-side buffer with its matching device-side buffer.

    Attributes are plain references; the class does no allocation or
    copying of its own.
    """

    def __init__(self, host_mem, device_mem):
        # Buffer living in host memory.
        self.host = host_mem
        # Buffer (or allocation handle) living in device memory.
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    """Allocate a host and a device buffer for every binding of ``engine``.

    Args:
        engine: The TensorRT engine whose bindings are iterated.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` pairs, ``bindings`` is the
        list of device addresses (as ints) in engine binding order, and
        ``stream`` is a newly created CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings; without this the
        # execution context would be handed an empty bindings list.
        bindings.append(int(device_mem))
        # Append to the appropriate list: each binding is either an input
        # or an output, never both.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass and return the host-side output buffers.

    Args:
        context: Execution context used to launch the inference.
        bindings: Device buffer addresses passed to ``execute_async``.
        inputs: Objects exposing ``host`` and ``device`` buffers to upload.
        outputs: Objects exposing ``host`` and ``device`` buffers to download.
        stream: CUDA stream on which copies and execution are enqueued.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream: all of the work above was only enqueued, so
    # the host buffers must not be read before it has completed.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def extract_boxes_confidences_classids(outputs, confidence, width, height):
    """Collect the detections whose best class score exceeds ``confidence``.

    Args:
        outputs: Iterable of 2-D arrays, one row per detection. Columns 0-3
            are read as (center_x, center_y, w, h) and scaled by the image
            size; columns 5 onward are read as per-class scores. Column 4 is
            not used here.
        confidence: Threshold the best class score must exceed.
        width: Image width used to scale the x coordinate and box width.
        height: Image height used to scale the y coordinate and box height.

    Returns:
        A tuple ``(boxes, confidences, classIDs)`` of parallel lists, where
        each box is ``[x, y, w, h]`` with (x, y) the top-left corner.
    """
    boxes = []
    confidences = []
    classIDs = []

    # Loop-invariant scale factors for (center_x, center_y, w, h).
    scale = np.array([width, height, width, height])

    for output in outputs:
        for detection in output:
            # Extract the scores, classid, and the confidence of the prediction
            scores = detection[5:]
            classID = np.argmax(scores)
            conf = scores[classID]
            # Consider only the predictions that are above the confidence threshold
            if conf <= confidence:
                continue

            # Scale the bounding box back to the size of the image
            box = detection[0:4] * scale
            centerX, centerY, w, h = box.astype('int')

            # Use the center coordinates, width and height to get the coordinates of the top left corner
            x = int(centerX - (w / 2))
            y = int(centerY - (h / 2))

            boxes.append([x, y, int(w), int(h)])
            # Keep the three result lists in lockstep: one entry per kept box.
            confidences.append(float(conf))
            classIDs.append(int(classID))

    return boxes, confidences, classIDs

with open("model_b1_gpu0_fp32.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    # Rebuild the engine from the serialized bytes stored in the engine file.
    engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    inputs, outputs, bindings, stream = allocate_buffers(engine)

    imgRaw = cv2.imread('testimg/test.jpg')
    if imgRaw is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError("could not read image 'testimg/test.jpg'")

    # Preprocess: BGR -> RGB, resize to 416x416, scale to [0, 1], HWC -> NCHW.
    imgResize = cv2.cvtColor(imgRaw, cv2.COLOR_BGR2RGB)
    imgResize = cv2.resize(imgResize, (416, 416), interpolation=cv2.INTER_LINEAR)

    imgResize = imgResize.astype(np.float32)
    imgResize *= (1/255.0)
    imgResize = np.transpose(imgResize, [2, 0, 1])
    imgResize = np.expand_dims(imgResize, axis=0)
    imgResize = np.array(imgResize, dtype=np.float32, order='C')

    # An image array's shape is (rows, cols, channels) = (height, width, channels).
    height, width, channels = imgRaw.shape
    print(width, height, channels)

    # Copy into the buffer allocated by allocate_buffers instead of rebinding
    # the attribute, so the upload still reads from that allocation.
    np.copyto(inputs[0].host, imgResize.ravel())
    outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

    # NOTE(review): this assumes every output is a flat array of consecutive
    # 10-value detection rows -- verify against the layout that the engine's
    # output layers actually produce.
    values_per_detection = 10
    output_reshape = [
        np.reshape(output, (output.size // values_per_detection, values_per_detection))
        for output in outputs
    ]

    # Extract bounding boxes, confidences and classIDs
    confidence = 0.5
    boxes, confidences, classIDs = extract_boxes_confidences_classids(output_reshape, confidence, width, height)

    print(boxes, confidences, classIDs)


TensorRT: 7.1.3
GPU Type: Jetson NX
Nvidia Driver Version:
CUDA Version: 10.2.89
CUDNN Version:
Operating System + Version: Jetpack 4.4.1
Deepstream: 5.0
Python Version (if applicable): 3.6.9

The link below might help with your query. Kindly check it for all the supported 3D layers:


Thanks for the response,

I think TensorRT supports all the layers of Yolov3-tiny, because when I use this engine in DeepStream the results are very good, but in my Python script using the TensorRT Python API they are very bad.

Can you check my script?


We recommend that you check the official yolov3 sample to make sure your script is correct, or you can use that sample directly.

Thank you.