Hi,
I need help building a Python TensorRT inference engine for a model trained with the TAO Toolkit. The inference code is pasted below; the output results are always 0 or -1.
With the help of the available documentation, I was able to train a model on a custom dataset using the yolov4_tiny architecture.
Attached is the ONNX graph after converting the model from .hdf5.
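For completeness, the .hdf5 to ONNX conversion was done with the TAO export subcommand. It looked roughly like this (the exact subcommand prefix, the input weights path, and any extra flags depend on the TAO version, so treat them as illustrative):
!tao model yolo_v4_tiny export -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov4_cspdarknet_tiny_epoch_$EPOCH.hdf5 \
    -e $SPECS_DIR/yolo_v4_tiny_train_kitti.txt \
    -o $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.onnx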
Here is the command I used to convert it to a TRT engine:
!tao deploy yolo_v4_tiny gen_trt_engine -m $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.onnx \
    -e $SPECS_DIR/yolo_v4_tiny_train_kitti.txt \
    --batch_size 1 \
    --min_batch_size 1 \
    --opt_batch_size 1 \
    --max_batch_size 1 \
    --data_type fp32 \
    --results_dir $USER_EXPERIMENT_DIR/export \
    --engine_file $USER_EXPERIMENT_DIR/export/trt_2.engine
I also verified the results with the following command, and they were perfect:
!tao deploy yolo_v4_tiny inference -m $USER_EXPERIMENT_DIR/export/trt_2.engine \
    -e $SPECS_DIR/yolo_v4_tiny_train_kitti.txt \
    -i $DATA_DOWNLOAD_DIR/frames-v1/test \
    -r $USER_EXPERIMENT_DIR/yolo_infer_images_21 \
    -t 0.6
This is the code that I'm using:
import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
import cv2
# Loading the TRT Engine
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER,'')
DTYPE_TRT = trt.float32
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem, binding_name, shape=None):
        self.host = host_mem
        self.device = device_mem
        self.binding_name = binding_name
        self.shape = shape

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + "\nShape: " + str(self.shape)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        binding_id = engine.get_binding_index(str(binding))
        # Size the buffer from the shape currently set on the execution context.
        size = trt.volume(context.get_binding_shape(binding_id)) * engine.max_batch_size
        print("{}:{}".format(binding, size))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem, binding))
        else:
            # Prepend the batch dimension to the output shape for later reshaping.
            output_shape = engine.get_binding_shape(binding)
            if len(output_shape) == 3:
                dims = trt.Dims3(engine.get_binding_shape(binding))
                output_shape = (engine.max_batch_size, dims[0], dims[1], dims[2])
            elif len(output_shape) == 2:
                dims = trt.Dims2(output_shape)
                output_shape = (engine.max_batch_size, dims[0], dims[1])
            outputs.append(HostDeviceMem(host_mem, device_mem, binding, output_shape))
    return inputs, outputs, bindings, stream
def preprocess_ds_nchw(batch_img):
    # Stack the images, scale pixel values to [0, 1], and convert NHWC -> NCHW.
    batch_img_array = np.array([np.array(img) for img in batch_img], dtype=np.float32)
    batch_img_array = batch_img_array / 255.0
    batch_transpose = np.transpose(batch_img_array, (0, 3, 1, 2))
    return batch_transpose
def load_images_cv(img_path, new_shape):
    orig_img = cv2.imread(img_path)
    img = cv2.resize(orig_img.copy(), new_shape)
    img = img[..., [2, 1, 0]]  # BGR -> RGB
    images = preprocess_ds_nchw([img])
    print(images.shape)
    print(orig_img.shape)
    return images, orig_img
def do_inference(batch, context, bindings, inputs, outputs, stream):
    batch_size = batch.shape[0]
    assert len(inputs) == 1
    inputs[0].host = np.ascontiguousarray(batch, dtype=np.float32)
    # Copy the input to the device, run inference, copy the outputs back.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    outputs_dict = {}
    outputs_shape = {}
    for out in outputs:
        print(out)
        outputs_dict[out.binding_name] = np.reshape(out.host, out.shape)
        outputs_shape[out.binding_name] = out.shape
    return outputs_shape, outputs_dict
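# Side note (an assumption on my part): since this engine was generated from an
# ONNX model, it should be an explicit-batch engine, and the matching TensorRT
# call would be
#     context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# rather than the implicit-batch execute_async(batch_size=...) used above.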
with open("trt_2.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

with engine.create_execution_context() as context:
    context.set_binding_shape(0, (1, 3, 640, 640))
    new_shape = (640, 640)
    inputs, outputs, bindings, stream = allocate_buffers(engine, context)
    images, orig_img = load_images_cv("test_1.jpg", new_shape)
    batch_images = images
    # batch_images = batch_images[np.newaxis, :, :, :]
    outputs_shape, outputs_data = do_inference(batch=batch_images, context=context,
                                               bindings=bindings, inputs=inputs,
                                               outputs=outputs, stream=stream)
    print(outputs_data)
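For what it's worth, this is how I have been trying to interpret the raw outputs. It is a minimal decoding sketch, assuming the engine ends in TAO's BatchedNMS plugin with its usual four bindings (BatchedNMS for the valid detection count, BatchedNMS_1 for boxes, BatchedNMS_2 for scores, BatchedNMS_3 for class indices) and assuming the boxes are normalized [x1, y1, x2, y2]; both the binding names and the coordinate convention are assumptions on my part:

# Rough sanity check on the raw outputs (binding names and the normalized
# [x1, y1, x2, y2] box convention are assumptions about my engine):
num_dets = int(outputs_data["BatchedNMS"].reshape(-1)[0])
boxes = outputs_data["BatchedNMS_1"].reshape(-1, 4)[:num_dets]
scores = outputs_data["BatchedNMS_2"].reshape(-1)[:num_dets]
classes = outputs_data["BatchedNMS_3"].reshape(-1)[:num_dets]
h, w = orig_img.shape[:2]
for (x1, y1, x2, y2), score, cls in zip(boxes, scores, classes):
    # Scale normalized coordinates back to the original image size.
    print("class {} score {:.2f} box ({:.0f}, {:.0f}, {:.0f}, {:.0f})".format(
        int(cls), score, x1 * w, y1 * h, x2 * w, y2 * h))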
Here is the log:
logs.txt (22.0 KB)
I don't know what I'm doing wrong.