Multi-threaded Object detection fails with Cudnn Error in execute: 7

I am trying to extend https://github.com/AastaNV/TRT_object_detection so that inference runs in multiple threads, but when I run the code it fails with:

[TensorRT] ERROR: cuda/cudaConvolutionLayer.cpp (238) - Cudnn Error in execute: 7 (CUDNN_STATUS_MAPPING_ERROR)
[TensorRT] ERROR: engine.cpp (370) - Cuda Error in ~ExecutionContext: 4 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
  what():  std::exception
Aborted (core dumped)

I am running it on a TX2 with JetPack 4.2.1, TensorRT 5.1.6-1+cuda10.0, CUDA 10 and Python 3. The TRT binary was created as described in the above repository. The repo also contains the .so required to run the code below.

I think there is something weird going on with creating the engine first and the execution context later, but I do not understand the matter well enough to be sure.

Here is the code:

import ctypes
import cv2
from threading import Thread
import pycuda.driver as cuda
import tensorrt as trt
import sys
import numpy as np

# Load the FlattenConcat shared library into the process before any TensorRT
# call is made; presumably the serialized engine relies on the plugin it
# provides when it is deserialized -- TODO confirm.
ctypes.CDLL("lib/libflattenconcat.so")

def _inference_thread(image, engine, device):
    """Run a single inference pass on ``image`` in a thread-private CUDA context.

    A new CUDA context is created on ``device``; one page-locked host buffer
    and one device buffer (both sized as float32) are allocated per engine
    binding; ``image`` is flattened into the first input buffer and uploaded;
    the engine is executed asynchronously on a fresh stream; every output
    binding is copied back to its host buffer; and the stream is synchronized.
    The results are not returned to the caller.

    Args:
        image: NumPy array whose flattened contents are copied into the first
            input host buffer.
        engine: Deserialized TensorRT engine. It is iterated for its bindings
            and used to create the execution context.
        device: ``pycuda.driver.Device`` on which the context is created.

    NOTE(review): ``engine`` is handed in already deserialized, i.e. it was
    built before this function calls ``make_context()``. Presumably the
    engine's device memory therefore belongs to a different CUDA context than
    the one that executes it here, which would be consistent with
    CUDNN_STATUS_MAPPING_ERROR -- confirm, and consider deserializing the
    engine while the executing context is current.
    """
    ctx = device.make_context()
    try:
        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # execute_async takes raw device addresses, in binding order.
            bindings.append(int(cuda_mem))
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        stream = cuda.Stream()
        with engine.create_execution_context() as context:
            np.copyto(host_inputs[0], image.ravel())
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Copy back every output binding instead of assuming exactly two;
            # all copies are queued on the same stream, so their relative
            # order does not affect the result.
            for host_out, cuda_out in zip(host_outputs, cuda_outputs):
                cuda.memcpy_dtoh_async(host_out, cuda_out, stream)
            stream.synchronize()
    finally:
        # Always take the context off this thread's stack, even when
        # allocation or execution raised; otherwise it would stay current.
        ctx.pop()

# Read the image named on the command line and prepare it for the network:
# BGR -> RGB, resize to 300x300, scale by 2/255 and shift by -1, then move the
# last (channel) axis to the front.
ori = cv2.imread(sys.argv[1])
image = cv2.resize(cv2.cvtColor(ori, cv2.COLOR_BGR2RGB), (300, 300))
image = (image * (2.0 / 255.0) - 1.0).transpose((2, 0, 1))

# Initialise the CUDA driver API and select the first device; each worker
# thread creates its own context on it.
cuda.init()
device = cuda.Device(0)

# Register the TensorRT plugins before the engine is deserialized.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')

# Deserialize the prebuilt engine from disk.
fname = "TRT_ssd_mobilenet_v1_coco_2018_01_28.bin"
with open(fname, 'rb') as f:
    with trt.Runtime(TRT_LOGGER) as runtime:
        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

# Launch four workers that share the same image, engine and device, then wait
# for all of them to finish.
threads = []
for _ in range(4):
    worker = Thread(target=_inference_thread, args=(image, engine, device))
    threads.append(worker)
    worker.start()
for worker in threads:
    worker.join()

I’m having the same issue. My trt inference code works in a single threaded python script, but gives this error in a multiprocessed framework.