Can TensorRT run inference in a child thread?

Hello,
I am using TensorRT to run inference on my network,

as follows:
# Deserialize a prebuilt TensorRT engine from disk and prepare to run inference.
# NOTE(review): indentation was lost in the original paste; the nesting below is
# reconstructed — confirm that everything is meant to run while `runtime` is open.
with open("my.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    # Build a TensorRT engine.
    #with build_engine_onnx(onnx_model_file) as engine:
    # Allocate buffers and create a CUDA stream.
    h_input, d_input, h_output, d_output, stream = self.allocate_buffers(engine)
    # Contexts are used to perform inference.
    with engine.create_execution_context() as context:
        # do inference
        pass  # the inference call itself was elided in the original snippet

When I do this in the main thread, it works well.
But when I start a child thread to do this, it returns the following error:

Exception in thread Thread-1:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "resnet50_trt.py", line 149, in prediction
    h_input, d_input, h_output, d_output, stream = self.allocate_buffers(engine)
  File "resnet50_trt.py", line 56, in allocate_buffers
    h_input = cuda.pagelocked_empty(trt.volume((INPUT_SIZE,3,224,224)), dtype=trt.nptype(ModelData.DTYPE))
pycuda._driver.LogicError: explicit_context_dependent failed: invalid device context - no currently active context?

Some of the code:
import pycuda.driver as cuda

# Allocate host and device buffers, and create a stream.
def allocate_buffers(self, engine):
    """Allocate host/device buffers for one input and one output, plus a stream.

    The buffer sizes are hard-coded: the input holds INPUT_SIZE * 3 * 224 * 224
    elements and the output INPUT_SIZE * 8 elements, both of the dtype given by
    ModelData.DTYPE. The ``engine`` argument is accepted but not consulted.

    Returns:
        The tuple ``(h_input, d_input, h_output, d_output, stream)``.

    Raises:
        pycuda._driver.LogicError: reported from the first allocation
            ("no currently active context") when the calling thread has no
            active CUDA context.
    """
    host_dtype = trt.nptype(ModelData.DTYPE)
    input_elems = trt.volume((INPUT_SIZE, 3, 224, 224))
    output_elems = trt.volume((INPUT_SIZE, 8))

    # Host side: page-locked memory, i.e. it won't be swapped to disk.
    h_input = cuda.pagelocked_empty(input_elems, dtype=host_dtype)
    h_output = cuda.pagelocked_empty(output_elems, dtype=host_dtype)

    # Device side: one allocation per host buffer, matching its byte size.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Stream used to copy inputs/outputs and to run inference.
    stream = cuda.Stream()

    return h_input, d_input, h_output, d_output, stream

The error seems to happen while allocating the host and device buffers.
So, can TensorRT not be used in a child thread?

My code is based on the Python sample onnx_resnet50.py in introductory_parser_samples; I just changed the code to start a child thread and do the inference in that child thread.

def main(name, times):
    """Classify one randomly chosen sample image with a TensorRT ResNet50 engine.

    Locates the sample data, builds an engine from the ONNX model, runs a
    single inference and prints whether the predicted label matches the
    image's file name.

    Args:
        name: Not used inside this function; accepted so the function can be
            started with ``threading.Thread(target=main, args=(...))``.
        times: Not used inside this function.
    """
    # Set the data path to the directory that contains the trained models and test images for inference.
    data_path, data_files = common.find_sample_data(
        description="Runs a ResNet50 network with a TensorRT inference engine.",
        subfolder=os.path.join("samples", "resnet50"),
        find_files=["binoculars.jpeg", "reflex_camera.jpeg", "tabby_tiger_cat.jpg", ModelData.MODEL_PATH, "class_labels.txt"],
    )
    # Get test images, models and labels.
    test_images = data_files[0:3]
    onnx_model_file, labels_file = data_files[3:]
    # Read the labels through a context manager so the file handle is closed.
    with open(labels_file, 'r') as labels_fh:
        labels = labels_fh.read().split('\n')

    # NOTE(review): the post reports "no currently active context" from pycuda
    # when this function runs on a non-main thread. Presumably a CUDA context
    # has to be created or pushed on the worker thread before the allocations
    # below (e.g. cuda.Device(0).make_context(), popped again when done) —
    # confirm against the PyCUDA documentation.

    # Build a TensorRT engine.
    with build_engine_onnx(onnx_model_file) as engine:
        # Inference is the same regardless of which parser is used to build the engine, since the model architecture is the same.
        # Allocate buffers and create a CUDA stream.
        h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)
        # Contexts are used to perform inference.
        with engine.create_execution_context() as context:
            # Load a normalized test case into the host input page-locked buffer.
            test_image = random.choice(test_images)
            test_case = load_normalized_test_case(test_image, h_input)
            # Run the engine. The output will be a 1D tensor of length 1000, where each value represents the
            # probability that the image corresponds to that label
            do_inference(context, h_input, d_input, h_output, d_output, stream)
            # We use the highest probability as our prediction. Its index corresponds to the predicted label.
            pred = labels[np.argmax(h_output)]
            if "_".join(pred.split()) in os.path.splitext(os.path.basename(test_case))[0]:
                print("Correctly recognized " + test_case + " as " + pred)
            else:
                print("Incorrectly recognized " + test_case + " as " + pred)

if __name__ == '__main__':
    import time

    # Run main() on a daemon worker thread so the main thread stays free for other work.
    action_classify = threading.Thread(target=main,args=('get-result',2),daemon=True)
    action_classify.start()
    #main()
    print("main do same things")
    # The program exits once only daemon threads remain, so the main thread
    # must stay alive; sleep between iterations instead of spinning the CPU.
    while True:
        time.sleep(0.1)

I got the same error.

I need to do other things in the main thread, so I need to start a child thread that uses TensorRT to do the inference.

Who can help me?

Has anyone else encountered the same problem as me?

If TensorRT can only run inference in the main thread, how can I do UI work at the same time?

I used cuda.Context.attach(), and then the inference could run.

Hi, longzhu_71. Did you solve this problem? I am in the same situation.