Description
Can multiple CUDA contexts share an inference engine?
I am trying to run TensorRT inference in a multi-threaded Python environment.
For each sub-thread I create a CUDA context (cuda.Device(0).make_context()) and an execution context (engine.create_execution_context()).
The engine used to create the execution contexts is deserialized once in the main thread and shared with the sub-threads.
However, this setup causes an error.
It works fine if the engine is deserialized separately for each execution context.
Do I need to deserialize a separate inference engine for each CUDA context?
I used the code below, with some modifications, as a reference for testing.
Please point out if my understanding is incorrect; pointers to the relevant documentation would also be appreciated.
Your support is sincerely appreciated.
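For concreteness, the variant that works for me ("deserialized for each execution context") looks roughly like the following sketch; buffer setup and the actual inference are elided, and worker/engine_path are just illustrative names:

import threading
import tensorrt as trt
import pycuda.driver as cuda

cuda.init()

def worker(engine_path):
    cfx = cuda.Device(0).make_context()  # per-thread CUDA context
    try:
        logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(logger)
        # deserialize a separate engine inside each thread -- this variant works
        with open(engine_path, 'rb') as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        # ... allocate buffers and run inference here ...
    finally:
        cfx.pop()

threads = [threading.Thread(target=worker, args=('mnist.trt',)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

The full test code I am using follows.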
import threading
import time
from my_tensorrt_code import TRTInference, trt

exitFlag = 0

class myThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        print("Starting " + self.args[0])
        self.func(*self.args)
        print("Exiting " + self.args[0])
if __name__ == '__main__':
    # Create new threads
    '''
    thread arguments:
    - func: the function the thread should run
    - args: the arguments passed to func
    '''
    trt_engine_path = 'mnist.trt'

    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    runtime = trt.Runtime(TRT_LOGGER)

    # deserialize the engine once, in the main thread
    with open(trt_engine_path, 'rb') as f:
        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

    max_batch_size = 1
    trt_inference_wrapper = TRTInference(engine,
                                         trt_engine_datatype=trt.DataType.FLOAT,
                                         batch_size=max_batch_size)

    # get the TensorRT MNIST model output
    input_img_path = '/mnt/ogasawara/teams/rd/inoue67/tmp/TensorRT-10.6.0.26/data/mnist/3.pgm'
    thread1 = myThread(trt_inference_wrapper.infer, [input_img_path])

    # start the new thread
    thread1.start()
    thread1.join()
    trt_inference_wrapper.destroy()

    print("Exiting Main Thread")
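My reading of the thread-safety notes in the TensorRT developer guide is that a single deserialized engine may be shared across threads as long as each thread creates and uses its own IExecutionContext; what I am unsure about is whether each thread also needs its own CUDA context. Below is a sketch of the single-CUDA-context pattern I would have expected to work (my assumption, not something I have confirmed; pycuda.autoinit supplies the one shared context here):

import threading
import tensorrt as trt
import pycuda.autoinit  # one CUDA context for the whole process
import pycuda.driver as cuda

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
with open('mnist.trt', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())  # shared engine

def worker():
    context = engine.create_execution_context()  # one execution context per thread
    stream = cuda.Stream()
    # ... set tensor addresses and call context.execute_async_v3(...) here ...

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

The wrapper implementation (my_tensorrt_code.py) follows.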
# my_tensorrt_code.py
from PIL import Image
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time
import math
class TRTInference:
    def __init__(self, engine, trt_engine_datatype, batch_size):
        # per-instance CUDA context; pushed/popped around each inference call
        self.cfx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        context = engine.create_execution_context()

        # prepare host/device buffers for every I/O tensor
        host_inputs = []
        cuda_inputs = []
        name_inputs = []
        host_outputs = []
        cuda_outputs = []
        name_outputs = []
        bindings = []

        for binding in engine:
            size = trt.volume(engine.get_tensor_shape(binding))
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
                name_inputs.append(binding)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
                name_outputs.append(binding)

        # store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.name_inputs = name_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.name_outputs = name_outputs
        self.bindings = bindings
    def infer(self, input_img_path):
        # make this instance's CUDA context current on the calling thread
        self.cfx.push()

        # restore stored state
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        name_inputs = self.name_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        name_outputs = self.name_outputs
        bindings = self.bindings

        # read image
        image = 1 - (np.asarray(Image.open(input_img_path), dtype=np.float32) / 255)
        np.copyto(host_inputs[0], image.ravel())

        # inference
        start_time = time.time()
        for i, name in enumerate(name_inputs):
            context.set_tensor_address(name, int(cuda_inputs[i]))
        for i, name in enumerate(name_outputs):
            context.set_tensor_address(name, int(cuda_outputs[i]))
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        context.execute_async_v3(stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        print("execute time " + str(time.time() - start_time))

        # parse output: softmax over the logits
        output = np.array([math.exp(o) for o in host_outputs[0]])
        output /= sum(output)
        for i in range(len(output)):
            print("%d: %.2f" % (i, output[i]))

        self.cfx.pop()
    def destroy(self):
        # pop the context created in __init__ from the creating thread's stack
        self.cfx.pop()
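As a side note (my own addition, not part of the reference code I started from): because infer() pushes the context before doing any work, an exception between push() and pop() would leave the context stuck on the thread's stack. Wrapping the body in try/finally would keep the stack balanced:

self.cfx.push()
try:
    # ... inference body as above ...
finally:
    self.cfx.pop()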
Environment
TensorRT Version: 10.6
GPU Type: RTX 4070 Ti
Nvidia Driver Version: 535.154.05
CUDA Version: 12.2
Python Version (if applicable): 3.9