YOLO v4 inference with TensorRT after training with TLT 3.0

Here is the full script (it’s quite basic):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
DTYPE_TRT = trt.float32

path_img = "image.jpg"
offsets = (103.939, 116.779, 123.68)  # per-channel means, BGR order
yolo_reso = (3, 768, 1024)            # network input shape, CHW

# Simple helper data class that's a little nicer to use than a 2-tuple
# (from the TensorRT Python sample code)
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        print(binding, dtype)  # log each binding's name and dtype
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings

def load_input(img_path, host_buffer):
    # Convert to BGR and CHW, subtract the channel offsets,
    # then copy the result into the page-locked host buffer
    with Image.open(img_path) as img:
        # RGB to BGR
        r, g, b = img.split()              
        img = Image.merge('RGB', (b, g, r))

        c, h, w = yolo_reso
        dtype = trt.nptype(DTYPE_TRT) 
        img_res = img.resize((w, h), Image.BICUBIC)
        img_res = np.array(img_res, dtype=dtype, order='C')

        # HWC to CHW format:
        img_chw = np.transpose(img_res, [2, 0, 1])
       
        # Applying offsets to BGR channels
        img_chw[0] = img_chw[0] - offsets[0]
        img_chw[1] = img_chw[1] - offsets[1]
        img_chw[2] = img_chw[2] - offsets[2]

        img_array = img_chw.ravel()
        np.copyto(host_buffer, img_array)

# Inference
with open("model_fp32.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    
    with engine.create_execution_context() as context:

        # allocate buffers
        inputs, outputs, bindings = allocate_buffers(engine)
        stream = cuda.Stream()

        # load image and pre-processing
        load_input(path_img, inputs[0].host)

        # transfer input data to the GPU.
        cuda.memcpy_htod_async(inputs[0].device, inputs[0].host, stream)
        
        # Run inference (execution is enqueued on the stream)
        context.execute_async(batch_size=1, bindings=bindings, stream_handle=stream.handle)
        
        # Transfer predictions back from the GPU (every output binding)
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)

        # Synchronize the stream
        stream.synchronize()

        # Print the host outputs:
        print("OUTPUT")
        print(outputs)
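For what it's worth, the BatchedNMS plugin produces four output buffers: the number of detections, then boxes, scores and class IDs. This is how I read them back from the outputs list above (a sketch that assumes the output bindings appear in that order and that the box coordinates are normalized; I haven't verified either against the engine):

# Sketch: unpack the four BatchedNMS outputs for batch size 1.
# Assumes binding order num_detections, boxes, scores, classes,
# and boxes as normalized [x1, y1, x2, y2].
num_dets = int(outputs[0].host[0])
boxes = outputs[1].host.reshape(-1, 4)[:num_dets]
scores = outputs[2].host[:num_dets]
classes = outputs[3].host[:num_dets].astype(np.int32)
for box, score, cls in zip(boxes, scores, classes):
    print("class %d, score %.3f, box %s" % (cls, score, box))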

You can download the .etlt model file here: Dropbox - File Deleted. The model is trained to detect 15 classes.

I converted it with the tlt-converter utility (built for CUDA 10.2, cuDNN 8 and TensorRT 7.1) using this command:

tlt-converter -k nvidia_tlt \
              -d 3,768,1024 \
              -o BatchedNMS \
              -e model_fp32.engine \
              -m 1 \
              -t fp32 \
              -i nchw \
              yolov4.etlt
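For debugging, this is how I list the engine's bindings after conversion to check their names, shapes and dtypes (nothing model-specific assumed, it only walks the deserialized engine):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')

with open("model_fp32.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    for i in range(engine.num_bindings):
        print(engine.get_binding_name(i),
              engine.get_binding_shape(i),
              trt.nptype(engine.get_binding_dtype(i)),
              "input" if engine.binding_is_input(i) else "output")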

If I use the .etlt file or the TensorRT engine in DeepStream, it works without any issue, but unfortunately I cannot use DeepStream for this work.

Any idea why I only get 0? I tried different types of pre-processing (with and without the offsets, BGR and RGB formats, dividing the pixel values by 255…) and I always get the same kind of output (only the number of detections varies).
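For completeness, the pre-processing variants I tried all reduce to a scale factor plus optional per-channel offsets; a small helper like this covers them (preprocess, scale and offsets are just illustrative names, not anything from TLT):

import numpy as np

def preprocess(img_chw, offsets=None, scale=1.0):
    # img_chw: float32 array in CHW, BGR order
    out = img_chw * scale  # scale=1/255.0 for the "divide by 255" variant
    if offsets is not None:  # offsets=(103.939, 116.779, 123.68) for mean subtraction
        out = out - np.asarray(offsets, dtype=out.dtype).reshape(3, 1, 1)
    return out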