How can I access the same TensorRT engine model from different threads?

Description

I wrote a Python program that uses TensorRT to run inference on images. The program loads an engine model file and initializes several pieces of local context data, as shown below:

def __init__(self, modelFileName):
        """Deserialize a TensorRT engine file and set up the inference resources.

        Allocates one host/device buffer pair for binding 0 (input) and
        binding 1 (output), a CUDA stream, and an execution context, and
        stores them on ``self``.

        Args:
            modelFileName: Path of the serialized engine file.
        """
        # Create the runtime with a logger at WARNING severity.
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        # Initialize the TensorRT plugin library, then deserialize the model.
        trt.init_libnvinfer_plugins(None, "")
        # Read the file through a context manager so the handle is closed
        # as soon as the bytes are in memory (the original left it open).
        with open(modelFileName, "rb") as model_file:
            engine = runtime.deserialize_cuda_engine(model_file.read())
        print(engine)
        print("输入",engine.get_binding_shape(0))
        print("输出",engine.get_binding_shape(1))
        # 1. Allocate some host and device buffers for inputs and outputs:
        self.__h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        self.__h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
        # Allocate device memory for inputs and outputs.
        self.__d_input = cuda.mem_alloc(self.__h_input.nbytes)
        self.__d_output = cuda.mem_alloc(self.__h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        self.__stream = cuda.Stream()
        # Execution context used for inference.
        self.__context = engine.create_execution_context()

Then I wrote the inference code shown below, and it works.

def PredictImageFile(self, imgFileName):
        """Load an image file as 8-bit grayscale and run inference on it.

        Args:
            imgFileName: Path of the image file to load.

        Returns:
            Whatever ``self.PredictImage`` returns for the loaded image.

        Raises:
            FileNotFoundError: If ``imgFileName`` does not exist. This is a
                subclass of ``Exception``, so existing handlers still match.
        """
        if not os.path.exists(imgFileName):
            raise FileNotFoundError("image not found: " + imgFileName)

        # 'L' converts to a single 8-bit luminance channel.
        image = Image.open(imgFileName).convert('L')
        image = np.asarray(image)

        return self.PredictImage(image)


def PredictImage(self, image):
        """Run inference on one image and return the result as an 8-bit image.

        A three-channel input is converted to grayscale first. The input and
        the result are each shown in a window that waits for a key press.

        Args:
            image: Image array; either 2-D or H x W x 3.

        Returns:
            The rows of the 1440-row output that correspond to the original
            image height, min-max normalized to the 0..255 range as CV_8U.
        """
        has_three_channels = image.ndim == 3 and image.shape[2] == 3
        if has_three_channels:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Preview of the input; waitKey(0) blocks until a key is pressed.
        cv2.imshow('image',image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

        # Fill the host input buffer (this also records the image height).
        self.__img2input(image)

        # Host -> device copy, inference, device -> host copy: all queued on
        # the same stream, then waited for once.
        stream = self.__stream
        device_bindings = [int(self.__d_input), int(self.__d_output)]
        cuda.memcpy_htod_async(self.__d_input, self.__h_input, stream)
        self.__context.execute_async(bindings=device_bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(self.__h_output, self.__d_output, stream)
        stream.synchronize()

        # View the flat output as 1440 rows and keep only the rows that
        # belong to the un-padded image.
        output_rows = self.__h_output.reshape(1440, -1)
        first_row = int((1440 - self.__imgHeight) / 2)
        end_row = int(first_row + self.__imgHeight)
        cropped = np.array(output_rows[first_row:end_row,:])

        result = cv2.normalize(src=cropped, dst=None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

        # Preview of the result; again blocks until a key is pressed.
        cv2.imshow('img_uint8',result)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

        return result


def __img2input(self, image):
        """Pad ``image`` and copy it, flattened, into the host input buffer.

        Args:
            image: Image array accepted by ``self.__resize_image``.
        """
        padded = self.__resize_image(image)
        # Add a leading axis of length one, then materialize a fresh array.
        batched = np.array(np.expand_dims(padded, axis=0))
        flat_pixels = batched.ravel()

        np.copyto(self.__h_input, flat_pixels)

Then I created a new thread to run inference. To save memory, I reused the engine model and context above in that thread. However, the new image is not loaded into the local buffers; they still hold the main thread’s image.

Then I tried allocating the per-thread data (__h_input, __h_output, __d_input, __d_output, __stream and __context) locally in each thread, while still sharing the same engine model with the main thread. This time inference does not work at all: the result image is black, both in the main thread and in the sub-thread.

Then I tried loading the model in the sub-thread as well, so that the main thread and the sub-thread each have their own engine model. With that, inference works in both threads. However, the GPU memory usage doubles.

So I would like to know how two threads can share the same engine model, to avoid allocating the GPU memory twice. Thanks so much!

My complete code is below (this is the version in which each thread has its own engine model and context):

import tensorrt as trt
import pycuda.driver as cuda
from PIL import Image
import numpy as np
import cv2
import pycuda.driver as cuda
import pycuda.autoinit as autoinit
import os
import threading

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

class TensorRTContext():
    """Bundle of the resources needed to run inference with one engine.

    Holds the page-locked host buffers (``h_input``/``h_output``), the device
    buffers (``d_input``/``d_output``), the CUDA ``stream`` and the TensorRT
    execution ``context``. Binding 0 is treated as the input and binding 1 as
    the output.
    """

    def __init__(self,engine):
        """Allocate buffers, a stream and an execution context for ``engine``.

        Args:
            engine: A deserialized TensorRT engine to build on. When ``None``,
                the engine is deserialized from the module-level
                ``modelFileName`` instead, which is what the original code
                did unconditionally (it ignored this argument).
        """
        # NOTE(review): when one engine is passed to contexts created in
        # different threads, presumably the same CUDA context has to be
        # current in each of those threads — confirm against the PyCUDA and
        # TensorRT threading documentation before relying on it.
        if engine is None:
            runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))

            trt.init_libnvinfer_plugins(None, "")
            # Context manager so the engine file handle is closed after reading.
            with open(modelFileName, "rb") as model_file:
                engine = runtime.deserialize_cuda_engine(model_file.read())

        self.__engine = engine

        print(self.__engine)
        print("输入",self.__engine.get_binding_shape(0))
        print("输出",self.__engine.get_binding_shape(1))

        # 1. Allocate some host and device buffers for inputs and outputs:
        self.h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        self.h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
        # Allocate device memory for inputs and outputs.
        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
        self.d_output = cuda.mem_alloc(self.h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        self.stream = cuda.Stream()
        self.context = engine.create_execution_context()

class TensorRTPredictor():
    """Run TensorRT inference on grayscale images that are 1440 pixels wide.

    Each thread that calls into a predictor lazily gets its own
    ``TensorRTContext`` (kept in ``threading.local`` storage), so one
    predictor object can be handed to several threads.
    """

    localCtxt = None  # class attribute kept from the original; not used in this class

    # Fixed edge length: the required image width, the height images are
    # padded up to, and the number of rows the output buffer is viewed as.
    _INPUT_SIZE = 1440

    def __init__(self, modelFileName):
        """Set up the per-thread storage.

        Args:
            modelFileName: Path of the serialized engine. This constructor
                does not read it: ``self.__engine`` stays ``None`` and is
                passed to ``TensorRTContext`` unchanged.
        """
        # Imported for its side effect; per PyCUDA's documentation this
        # initializes CUDA and creates a context.
        import pycuda.autoinit

        self.thread_local_data = threading.local()

        # The original carried a disabled variant (inside a string literal)
        # that deserialized the engine once here so all threads could reuse
        # it; with it disabled the engine is left unset.
        self.__engine = None

    def Get_execution_context(self):
        """Return the calling thread's ``TensorRTContext``, creating it on first use."""
        if not hasattr(self.thread_local_data, 'context'):
            self.thread_local_data.context = TensorRTContext(self.__engine)
        return self.thread_local_data.context

    def __resize_image(self, image):
        """Pad ``image`` with black rows, top and bottom, up to the input height.

        Args:
            image: Array whose ``shape[0]`` is the height and ``shape[1]``
                the width.

        Returns:
            The padded image produced by ``cv2.copyMakeBorder``.

        Raises:
            Exception: If the width is not 1440 or the height exceeds 1440.
        """
        h = image.shape[0]
        w = image.shape[1]

        # Kept for compatibility with the original; PredictImage no longer
        # reads these, because they live on the shared predictor object.
        self.__imgWidth = w
        self.__imgHeight = h

        edge = self._INPUT_SIZE
        if w != edge or h > edge:
            raise Exception(
                f"unsupported image size {w}x{h}: width must be {edge} "
                f"and height at most {edge}"
            )

        # The width always equals the edge length past the check above, so
        # only vertical padding can be non-zero.
        top, bottom = (0, 0)
        if h < edge:
            dh = edge - h
            top = dh // 2
            bottom = dh - top

        BLACK = [0]

        constant = cv2.copyMakeBorder(image, top, bottom, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
        return constant

    def __img2input(self, image):
        """Pad ``image`` and copy it, flattened, into this thread's host input buffer."""
        image = self.__resize_image(image)
        image = np.expand_dims(image, axis=0)

        image = np.array(image)

        np.copyto(self.Get_execution_context().h_input, image.ravel())

    def PredictImageFile(self, imgFileName):
        """Load an image file as 8-bit grayscale and run inference on it.

        Args:
            imgFileName: Path of the image file to load.

        Returns:
            The result of ``PredictImage`` for the loaded image.

        Raises:
            FileNotFoundError: If ``imgFileName`` does not exist. This is a
                subclass of ``Exception``, so existing handlers still match.
        """
        if not os.path.exists(imgFileName):
            # The original concatenated the path directly onto the message
            # with no separator.
            raise FileNotFoundError("image not found: " + imgFileName)

        image = Image.open(imgFileName).convert('L')
        image = np.asarray(image)

        return self.PredictImage(image)

    def PredictImage(self, image):
        """Run inference on one image and return the result as an 8-bit image.

        A three-channel input is converted to grayscale first. The input is
        shown in a window that waits for a key press before inference runs.

        Args:
            image: Image array; either 2-D or H x W x 3.

        Returns:
            The output rows that correspond to the un-padded image, min-max
            normalized to 0..255 as CV_8U.

        Raises:
            Exception: If the image size is rejected by the padding step.
        """
        if len(image.shape) == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Read the height from the image itself. The original read it back
        # from an attribute on self, which another thread using the same
        # predictor could overwrite between padding and cropping.
        img_height = image.shape[0]

        cv2.imshow('image',image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

        self.__img2input(image)

        localContext = self.Get_execution_context()

        # Upload, execute and download are queued on one stream and waited
        # for with a single synchronize.
        cuda.memcpy_htod_async(localContext.d_input, localContext.h_input, localContext.stream)

        localContext.context.execute_async(bindings=[int(localContext.d_input), int(localContext.d_output)], stream_handle=localContext.stream.handle)

        cuda.memcpy_dtoh_async(localContext.h_output, localContext.d_output, localContext.stream)

        localContext.stream.synchronize()

        pred_mask = localContext.h_output.reshape(self._INPUT_SIZE, -1)

        # Drop the rows that correspond to the top/bottom padding.
        offset = (self._INPUT_SIZE - img_height) // 2
        toY = offset + img_height

        # np.array copies, so the result does not alias the output buffer
        # that the next call will overwrite.
        out_img = np.array(pred_mask[offset:toY,:])

        img_uint8 = cv2.normalize(src=out_img, dst=None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

        return img_uint8

def TestPredict(predictor,imgFileName):
    """Run ``predictor`` on one image file and show the result in a window.

    Used as the body of both the main-thread call and the worker thread.

    Args:
        predictor: Object providing ``PredictImageFile(path)``.
        imgFileName: Path of the image file to run inference on.
    """
    import pycuda.autoinit

    # The original also decoded the file into an unused ``img`` variable on
    # a line that was missing its closing parenthesis (a SyntaxError); that
    # dead statement is dropped.
    img_uint8 = predictor.PredictImageFile(imgFileName)

    # waitKey(0) blocks until a key is pressed.
    cv2.imshow('file img_uint8',img_uint8)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


if __name__ == '__main__':
    # Demo: run one prediction on the main thread, then one on a worker
    # thread, using the same predictor object for both.

    try:
        # Module-level name: TensorRTContext reads it when it deserializes
        # the engine.
        modelFileName = "D:\\TensorRT\\model\\A86.bin"

        predictor = TensorRTPredictor(modelFileName)

        imgFileName = 'D:\\TensorRT\\image\\20211104-22-31-39_573.bmp'
        TestPredict(predictor,imgFileName)

        imgFileName = 'D:\\TensorRT\\image\\20211202-22-12-25_448.bmp'

        threadRun = threading.Thread(target=TestPredict,args=(predictor,imgFileName,))

        threadRun.start()
        threadRun.join()

        print("done")
    except Exception:
        # Show what actually failed instead of only printing "except".
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # propagate.
        import traceback

        print("except")
        traceback.print_exc()
        # The original called autoinit.FinishUp() here. PyCUDA's autoinit
        # documents that its context is cleaned up automatically at exit, and
        # no public FinishUp function is documented, so the call is dropped.

    exit(0)

Environment

TensorRT Version:
GPU Type: RTX 3060 Laptop
Nvidia Driver Version:
CUDA Version: 11.3, V11.3.58
CUDNN Version: 6.14.11.6050
Operating System + Version: Windows 11
Python Version (if applicable): 3.7.13
TensorFlow Version (if applicable): NO
PyTorch Version (if applicable): NO
Baremetal or Container (if container which image + tag): NO

Hi,
Can you try running your model with the trtexec command and share the `--verbose` log if the issue persists?

You can refer below link for all the supported operators list, in case any operator is not supported you need to create a custom plugin to support that operation

Also, request you to share your model and script if not shared already so that we can help you better.

Meanwhile, for some common errors and queries please refer to below link:

Thanks!