I try to extend https://github.com/AastaNV/TRT_object_detection to be performed with multiple threads. but when I run the code it fails with:
[TensorRT] ERROR: cuda/cudaConvolutionLayer.cpp (238) - Cudnn Error in execute: 7 (CUDNN_STATUS_MAPPING_ERROR)
[TensorRT] ERROR: engine.cpp (370) - Cuda Error in ~ExecutionContext: 4 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
what(): std::exception
Aborted (core dumped)
I run it on a TX2 with Jetpack 4.2.1 and TensorRT 5.1.6-1+cuda10.0 and CUDA 10 and python3. The TRT binary was created as described in the above repository. The repo also contains the .so required to run the code below.
I think there is something weird going on with creating an engine first and then the context later, but I have not too much understanding of the matter.
Here the code:
import ctypes
import cv2
from threading import Thread
import pycuda.driver as cuda
import tensorrt as trt
import sys
import numpy as np
ctypes.CDLL("lib/libflattenconcat.so")
def _inference_thread(image, engine, device):
ctx = device.make_context()
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
host_mem = cuda.pagelocked_empty(size, np.float32)
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
bindings.append(int(cuda_mem))
if engine.binding_is_input(binding):
host_inputs.append(host_mem)
cuda_inputs.append(cuda_mem)
else:
host_outputs.append(host_mem)
cuda_outputs.append(cuda_mem)
stream = cuda.Stream()
with engine.create_execution_context() as context:
np.copyto(host_inputs[0], image.ravel())
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
context.execute_async(bindings=bindings, stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
stream.synchronize()
ctx.pop()
ori = cv2.imread(sys.argv[1])
image = cv2.cvtColor(ori, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (300, 300))
image = (2.0/255.0) * image - 1.0
image = image.transpose((2, 0, 1))
cuda.init()
device = cuda.Device(0)
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
fname = "TRT_ssd_mobilenet_v1_coco_2018_01_28.bin"
with open(fname, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
buf = f.read()
engine = runtime.deserialize_cuda_engine(buf)
threads=[]
for idx in range(4):
threads.append(Thread(target=_inference_thread, args=(image, engine, device)))
threads[idx].start()
for thread in threads:
thread.join()