I am trying to extend https://github.com/AastaNV/TRT_object_detection so that inference runs in multiple threads, but when I run the code it fails with:
[TensorRT] ERROR: cuda/cudaConvolutionLayer.cpp (238) - Cudnn Error in execute: 7 (CUDNN_STATUS_MAPPING_ERROR)
[TensorRT] ERROR: engine.cpp (370) - Cuda Error in ~ExecutionContext: 4 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
  what():  std::exception
Aborted (core dumped)
I am running this on a Jetson TX2 with JetPack 4.2.1, TensorRT 5.1.6-1+cuda10.0, CUDA 10, and Python 3. The serialized TRT engine (.bin) was created as described in the repository above, which also contains the .so plugin required to run the code below.
I suspect the problem has something to do with the engine being created first and the CUDA contexts only afterwards, inside the threads, but I do not understand the matter well (see my guess at the expected ordering after the code below).
Here is the code:
import ctypes
import cv2
from threading import Thread
import pycuda.driver as cuda
import tensorrt as trt
import sys
import numpy as np

# Load the FlattenConcat plugin shipped with the repository
ctypes.CDLL("lib/libflattenconcat.so")


def _inference_thread(image, engine, device):
    # Create and push a new CUDA context for this thread
    ctx = device.make_context()

    # Allocate host/device buffers for every binding of the engine
    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    stream = cuda.Stream()
    with engine.create_execution_context() as context:
        np.copyto(host_inputs[0], image.ravel())
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()

    ctx.pop()


# Preprocess the input image for SSD MobileNet (300x300, normalized, CHW)
ori = cv2.imread(sys.argv[1])
image = cv2.cvtColor(ori, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (300, 300))
image = (2.0 / 255.0) * image - 1.0
image = image.transpose((2, 0, 1))

cuda.init()
device = cuda.Device(0)

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')

# Deserialize the engine built as described in the repository
fname = "TRT_ssd_mobilenet_v1_coco_2018_01_28.bin"
with open(fname, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    buf = f.read()
    engine = runtime.deserialize_cuda_engine(buf)

# Start four threads that all share the same engine
threads = []
for idx in range(4):
    threads.append(Thread(target=_inference_thread, args=(image, engine, device)))
    threads[idx].start()

for thread in threads:
    thread.join()
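For reference, this is the ordering I would have guessed is required: make a CUDA context current before deserializing the engine, and then push that same context in every worker thread instead of creating a new one per thread. This is only a sketch of my assumption, not something I have confirmed; the names shared_ctx and worker are mine, and plugin loading / preprocessing are omitted:

# Sketch of my assumption only: deserialize the engine while a shared
# context is current, then push()/pop() that same context in each thread.
import pycuda.driver as cuda
import tensorrt as trt

cuda.init()
shared_ctx = cuda.Device(0).make_context()   # creates AND pushes the context

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
with open("TRT_ssd_mobilenet_v1_coco_2018_01_28.bin", 'rb') as f, \
        trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
shared_ctx.pop()                             # leave it inactive on the main thread

def worker(image, engine, shared_ctx):
    shared_ctx.push()                        # make the same context current in this thread
    try:
        with engine.create_execution_context() as context:
            pass                             # buffer allocation and execute_async as in my code above
    finally:
        shared_ctx.pop()

Is this the correct pattern, and is it the reason my original code crashes with CUDNN_STATUS_MAPPING_ERROR?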