Hi.
I exported a .tlt model and generated trt.engine on a Jetson Nano. I get the following error during inference:
[TensorRT] ERROR: 2: [pluginV2DynamicExtRunner.cpp::execute::115] Error Code 2: Internal Error (Assertion status == kSTATUS_SUCCESS failed.)
I use the YOLOv4 model, and I generate the trt.engine with this command:
./tao-converter -k $KEY -p Input,1x3x416x416,8x3x416x416,16x3x416x416 -d 3,416,416 -o BatchedNMS -i nchw -e /home/jetsonuser/jp4.6/trt.engine -b 2 -t fp16 -w 1073741824 /home/jetsonuser/jp4.6/yolov4_export/final_model.etlt
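For reference, here is a small diagnostic sketch (not part of my pipeline) that prints the engine's bindings, shapes, and dtypes; the engine path is the one from the command above, and a dynamic batch dimension shows up as -1:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")

# Engine path taken from the tao-converter command above.
engine_path = "/home/jetsonuser/jp4.6/trt.engine"

with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# One line per binding: name, direction, shape, dtype.
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i),
          "input" if engine.binding_is_input(i) else "output",
          engine.get_binding_shape(i),
          engine.get_binding_dtype(i))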
Inference code:
import tensorrt as trt
import numpy as np
from PIL import Image
import os
import cv2
import pycuda.driver as cuda
import pycuda.autoinit
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class TrtEngine:
    # Initializes TensorRT objects needed for model inference.
    def __init__(self, engine_path, input_height, input_width, input_channels, max_batch_size, dtype):
        self.engine_path = engine_path
        self.input_height = input_height
        self.input_width = input_width
        self.input_channels = input_channels
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.VERBOSE)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
        # Allocate memory for multiple usage [e.g. multiple batch inference]
        # self.context.set_binding_shape(0, (self.max_batch_size, 3, self.input_height, self.input_width))
        input_volume = trt.volume((self.input_channels, self.input_width, self.input_height))
        self.numpy_array = np.zeros((self.engine.max_batch_size, input_volume))

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            # The dynamic batch dimension is reported as -1, so the volume is
            # negative; multiplying by -1 sizes the buffers for a batch of 1.
            size = trt.volume(self.engine.get_binding_shape(binding)) * -1
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    def infer_batch(self, image_paths):
        """Infers model on a batch of same-sized images resized to fit the model.

        Args:
            image_paths (list of str): paths to images that will be packed into
                a batch and fed into the model
        """
        # Verify that the supplied batch size is not too big
        max_batch_size = self.engine.max_batch_size
        actual_batch_size = len(image_paths)
        if actual_batch_size > max_batch_size:
            raise ValueError(
                "image_paths list bigger ({}) than engine max batch size ({})".format(
                    actual_batch_size, max_batch_size))
        # Load all images to CPU...
        imgs = self._load_imgs(image_paths)
        # ...copy them into the appropriate place in memory...
        # (self.inputs was returned earlier by allocate_buffers())
        np.copyto(self.inputs[0].host, imgs.ravel().astype(self.dtype))
        # ...set the input shape for the dynamic batch dimension...
        input_shape = (1, 3, self.input_height, self.input_width)
        self.context.set_binding_shape(0, input_shape)
        # ...fetch model outputs...
        outputs = do_inference(
            context=self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream)
        # ...and return results.
        return outputs
    def _load_image_into_numpy_array(self, image):
        return np.array(image, dtype=self.dtype, order='C')
    def _load_imgs(self, image_paths):
        for idx, image_path in enumerate(image_paths):
            img_np = self._load_img(image_path)
            self.numpy_array[idx] = img_np
        return self.numpy_array
    def _load_img(self, image_path):
        image = Image.open(image_path)
        # RGB -> BGR
        r, g, b = image.split()
        image = Image.merge('RGB', (b, g, r))
        model_input_width = self.input_width
        model_input_height = self.input_height
        image_resized = image.resize(
            size=(model_input_width, model_input_height),
            resample=Image.BICUBIC
        )
        img_np = self._load_image_into_numpy_array(image_resized)
        # HWC -> CHW
        img_np = img_np.transpose((2, 0, 1))
        # Normalize to the [0.0, 1.0] interval (expected by the model)
        img_np = img_np / 255.0
        img_np = img_np.ravel()
        return img_np
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
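
This is roughly how I drive the class; the engine path matches the command above, while the image path and dtype below are placeholders rather than my exact values:

# Minimal usage sketch (placeholder image path, assuming FP32 host buffers).
engine = TrtEngine(
    engine_path="/home/jetsonuser/jp4.6/trt.engine",
    input_height=416, input_width=416, input_channels=3,
    max_batch_size=1, dtype=np.float32)

outputs = engine.infer_batch(["test_image.jpg"])
for out in outputs:
    print(out.shape, out.dtype)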
Also, this is the link to download the model file:
Environment
TensorRT Version: 8.2.0
GPU Type: Jetson Nano (GPU ARCHS 53)
Nvidia Driver Version: 32.6.1
CUDA Version: 11.3.1
CUDNN Version: 8.2
Operating System + Version: 4.9.253-tegra
Python Version (if applicable): Python 3.6.9