Here is the full script (it’s quite basic):
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER,'')
DTYPE_TRT = trt.float32
import pycuda.driver as cuda
import pycuda.autoinit
from PIL import Image
import numpy as np
path_img = "image.jpg"
offsets = ( 103.939, 116.779, 123.68 )
yolo_reso = (3, 768, 1024)
# Simple helper data class that's a little nicer to use than a 2-tuple
# from TRT Python sample code
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        #dtype = DTYPE_TRT
        print(dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def load_input(img_path, host_buffer):
    # Convert to BGR and CHW format
    with Image.open(img_path) as img:
        # RGB to BGR
        r, g, b = img.split()
        img = Image.merge('RGB', (b, g, r))
        c, h, w = yolo_reso
        dtype = trt.nptype(DTYPE_TRT)
        img_res = img.resize((w, h), Image.BICUBIC)
        img_res = np.array(img_res, dtype=dtype, order='C')
        # HWC to CHW format:
        img_chw = np.transpose(img_res, [2, 0, 1])
        # Apply offsets to the BGR channels
        img_chw[0] = img_chw[0] - offsets[0]
        img_chw[1] = img_chw[1] - offsets[1]
        img_chw[2] = img_chw[2] - offsets[2]
        img_array = img_chw.ravel()
        np.copyto(host_buffer, img_array)
# Inference
with open("model_fp32.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
with engine.create_execution_context() as context:
# allocate buffers
inputs, outputs, bindings = allocate_buffers(engine)
stream = cuda.Stream()
# load image and pre-processing
load_input(path_img, inputs[0].host)
# transfer input data to the GPU.
cuda.memcpy_htod_async(inputs[0].device, inputs[0].host, stream)
# inference
inference = context.execute_async(batch_size=1, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(outputs[0].host, outputs[0].device, stream)
# Synchronize the stream
stream.synchronize()
# Print the host output:
print("OUTPUT")
print(outputs)
You can download the .etlt model file here: Dropbox - File Deleted. The model is trained to detect 15 classes.
I converted it with the tlt-converter utility (for CUDA 10.2, cuDNN 8 and TensorRT 7.1) using this command:
tlt-converter -k nvidia_tlt \
-d 3,768,1024 \
-o BatchedNMS \
-e model_fp32.engine \
-m 1 \
-t fp32 \
-i nchw \
yolov4.etlt
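
To double-check what the converter actually produced, here is a small sanity-check sketch (reusing the same engine file and logger as in the script above) that just lists every binding with its direction, shape and data type; the binding names it prints are whatever ended up in the engine, I am not assuming any particular names:

with open("model_fp32.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    # Print index, name, direction, shape and dtype of every binding
    for i in range(engine.num_bindings):
        print(i,
              engine.get_binding_name(i),
              "input" if engine.binding_is_input(i) else "output",
              engine.get_binding_shape(i),
              engine.get_binding_dtype(i))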
If I use the etlt or TensorRT engine file in DeepStream, it works without any issue. But unfortunately I cannot use DeepStream for this work.
Any idea why I only get 0? I tried different types of pre-processing (with and without the offsets, BGR and RGB format, dividing the pixel values by 255…) and I always get the same kind of output (only the number of detections varies).
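
For completeness, here is a rough sketch of how I would unpack the detections once all outputs are copied back, reusing the outputs and stream objects from the script above. I am assuming the usual BatchedNMS plugin layout of four output bindings in the order keep count, boxes, scores, classes; I have not verified that order against this particular engine:

# Copy every output buffer back from the GPU
for out in outputs:
    cuda.memcpy_dtoh_async(out.host, out.device, stream)
stream.synchronize()

# Assumed BatchedNMS output order: num detections, boxes, scores, classes
num_det = int(outputs[0].host[0])
boxes = outputs[1].host.reshape(-1, 4)[:num_det]   # x1, y1, x2, y2 per detection
scores = outputs[2].host[:num_det]
classes = outputs[3].host[:num_det]
print(num_det, boxes, scores, classes)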