Description
I wrote a Python program that uses TensorRT to run inference on images. The program loads an engine model file and initializes several local context objects, as the following shows:
def __init__(self, modelFileName):
    # Create the runtime with a logger
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    # Deserialize the engine model
    trt.init_libnvinfer_plugins(None, "")
    engine = runtime.deserialize_cuda_engine(open(modelFileName, "rb").read())
    print(engine)
    print("input:", engine.get_binding_shape(0))
    print("output:", engine.get_binding_shape(1))
    # 1. Allocate some host and device buffers for inputs and outputs:
    self.__h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
    self.__h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
    # Allocate device memory for inputs and outputs.
    self.__d_input = cuda.mem_alloc(self.__h_input.nbytes)
    self.__d_output = cuda.mem_alloc(self.__h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    self.__stream = cuda.Stream()
    # Inference execution context
    self.__context = engine.create_execution_context()
Then I wrote the inference routine below, and it works:
def PredictImageFile(self, imgFileName):
    if not os.path.exists(imgFileName):
        raise Exception("image not found: " + imgFileName)
    image = Image.open(imgFileName).convert('L')
    image = np.asarray(image)
    return self.PredictImage(image)

def PredictImage(self, image):
    if len(image.shape) == 3 and image.shape[2] == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imshow('image', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    self.__img2input(image)
    cuda.memcpy_htod_async(self.__d_input, self.__h_input, self.__stream)
    self.__context.execute_async(bindings=[int(self.__d_input), int(self.__d_output)], stream_handle=self.__stream.handle)
    cuda.memcpy_dtoh_async(self.__h_output, self.__d_output, self.__stream)
    self.__stream.synchronize()
    pred_mask = self.__h_output
    pred_mask = pred_mask.reshape(1440, -1)
    # Crop off the padding that __resize_image added
    offset = int((1440 - self.__imgHeight) / 2)
    toY = int(offset + self.__imgHeight)
    out_img = pred_mask[offset:toY, :]
    out_img = np.array(out_img)
    img_uint8 = cv2.normalize(src=out_img, dst=None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    cv2.imshow('img_uint8', img_uint8)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return img_uint8

def __img2input(self, image):
    image = self.__resize_image(image)
    image = np.expand_dims(image, axis=0)
    image = np.array(image)
    np.copyto(self.__h_input, image.ravel())
Then I created a new thread to run inference. To save memory, I reused the above engine model and context in the new thread. But the new image cannot be loaded into the host buffer; it still holds the main thread's image.

Next I tried allocating the context data (__h_input, __h_output, __d_input, __d_output, __stream and __context) locally in each thread, while still sharing the same engine model with the main thread. But this time inference fails: the result image is black, in both the main thread and the sub-thread.

Finally I allocated the model in the sub-thread too, so both the main thread and the sub-thread have their own engine model, and then I can run inference in both threads. However, the GPU memory allocation doubles.

So I want to know how two threads can share the same engine model, to avoid allocating GPU memory twice. Thanks so much!
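What I expected to work is something like the following: deserialize the engine once, give each thread its own execution context, stream and buffers, and make the one CUDA context current in whichever thread is running inference. This is only a minimal sketch of my understanding; the explicit cuda_ctx.push()/pop() handling is my assumption, not verified code:

import threading
import tensorrt as trt
import pycuda.driver as cuda

cuda.init()
cuda_ctx = cuda.Device(0).make_context()   # one CUDA context, created once in the main thread

logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(None, "")
with open("D:\\TensorRT\\model\\A86.bin", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
cuda_ctx.pop()   # release the CUDA context from the main thread

def worker():
    cuda_ctx.push()   # make the shared CUDA context current in this thread
    try:
        # Each thread gets its own execution context, stream and buffers,
        # but they all share the one engine (and its weights on the GPU).
        exec_ctx = engine.create_execution_context()
        stream = cuda.Stream()
        # ... allocate per-thread h_input/h_output/d_input/d_output here,
        # then memcpy_htod_async / execute_async / memcpy_dtoh_async as above ...
    finally:
        cuda_ctx.pop()   # detach the context before the thread exits

t = threading.Thread(target=worker)
t.start()
t.join()
# (context cleanup omitted for brevity)

Is this per-thread push/pop of the shared CUDA context the right way to share one engine, or is something else required?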
My whole code is below (this is the version where each thread has its own engine model and context):
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit as autoinit
from PIL import Image
import numpy as np
import cv2
import os
import threading
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
class TensorRTContext():
    def __init__(self, engine):
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        trt.init_libnvinfer_plugins(None, "")
        # NOTE: the engine argument is ignored here; every context deserializes its
        # own engine copy from the global modelFileName, which is why GPU memory doubles.
        self.__engine = runtime.deserialize_cuda_engine(open(modelFileName, "rb").read())
        print(self.__engine)
        print("input:", self.__engine.get_binding_shape(0))
        print("output:", self.__engine.get_binding_shape(1))
        engine = self.__engine
        # 1. Allocate some host and device buffers for inputs and outputs:
        self.h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
        self.h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
        # Allocate device memory for inputs and outputs.
        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
        self.d_output = cuda.mem_alloc(self.h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        self.stream = cuda.Stream()
        self.context = engine.create_execution_context()
class TensorRTPredictor():
    localCtxt = None

    def __init__(self, modelFileName):
        import pycuda.autoinit
        self.thread_local_data = threading.local()
        self.__engine = None
        """ # Former shared-engine code (deserialize the engine once here and reuse it)
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        trt.init_libnvinfer_plugins(None, "")
        self.__engine = runtime.deserialize_cuda_engine(open(modelFileName, "rb").read())
        #self.Get_execution_context()
        """

    def Get_execution_context(self):
        # Lazily create one TensorRTContext per thread
        if not hasattr(self.thread_local_data, 'context'):
            self.thread_local_data.context = TensorRTContext(self.__engine)
        return self.thread_local_data.context
    def __resize_image(self, image):
        top, bottom, left, right = (0, 0, 0, 0)
        h = image.shape[0]
        w = image.shape[1]
        self.__imgWidth = w
        self.__imgHeight = h
        if w != 1440 or h > 1440:
            raise Exception("image must be 1440 wide and at most 1440 high")
        longest_edge = 1440
        if h < longest_edge:
            dh = longest_edge - h
            top = dh // 2
            bottom = dh - top
        if w < longest_edge:
            dw = longest_edge - w
            left = dw // 2
            right = dw - left
        BLACK = [0]
        # Pad the image with black borders up to 1440x1440
        constant = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=BLACK)
        return constant
    def __img2input(self, image):
        image = self.__resize_image(image)
        image = np.expand_dims(image, axis=0)
        image = np.array(image)
        np.copyto(self.Get_execution_context().h_input, image.ravel())

    def PredictImageFile(self, imgFileName):
        if not os.path.exists(imgFileName):
            raise Exception("image not found: " + imgFileName)
        image = Image.open(imgFileName).convert('L')
        image = np.asarray(image)
        return self.PredictImage(image)
    def PredictImage(self, image):
        if len(image.shape) == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        cv2.imshow('image', image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        self.__img2input(image)
        localContext = self.Get_execution_context()
        cuda.memcpy_htod_async(localContext.d_input, localContext.h_input, localContext.stream)
        localContext.context.execute_async(bindings=[int(localContext.d_input), int(localContext.d_output)], stream_handle=localContext.stream.handle)
        cuda.memcpy_dtoh_async(localContext.h_output, localContext.d_output, localContext.stream)
        localContext.stream.synchronize()
        pred_mask = localContext.h_output
        pred_mask = pred_mask.reshape(1440, -1)
        offset = int((1440 - self.__imgHeight) / 2)
        toY = int(offset + self.__imgHeight)
        out_img = pred_mask[offset:toY, :]
        out_img = np.array(out_img)
        img_uint8 = cv2.normalize(src=out_img, dst=None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        return img_uint8
def TestPredict(predictor, imgFileName):
    import pycuda.autoinit
    img = cv2.imdecode(np.fromfile(imgFileName, dtype=np.uint8), cv2.IMREAD_GRAYSCALE)
    img_uint8 = predictor.PredictImageFile(imgFileName)
    cv2.imshow('file img_uint8', img_uint8)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
if __name__ == '__main__':
    try:
        modelFileName = "D:\\TensorRT\\model\\A86.bin"
        predictor = TensorRTPredictor(modelFileName)
        imgFileName = 'D:\\TensorRT\\image\\20211104-22-31-39_573.bmp'
        TestPredict(predictor, imgFileName)
        imgFileName = 'D:\\TensorRT\\image\\20211202-22-12-25_448.bmp'
        threadRun = threading.Thread(target=TestPredict, args=(predictor, imgFileName,))
        threadRun.start()
        threadRun.join()
        print("done")
    except:
        print("except")
    autoinit.FinishUp()
    exit(0)
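For context, __resize_image pads the grayscale input up to 1440x1440 and PredictImage crops the same offset back off the output mask. A quick standalone check of that padding logic (a minimal sketch using a made-up 1440-wide, 1000-high input):

import numpy as np
import cv2

img = np.zeros((1000, 1440), dtype=np.uint8)   # h=1000, w=1440, as __resize_image expects
dh = 1440 - 1000                               # 440 rows of padding needed
top, bottom = dh // 2, dh - dh // 2            # 220 / 220, same split as __resize_image
padded = cv2.copyMakeBorder(img, top, bottom, 0, 0, cv2.BORDER_CONSTANT, value=[0])
assert padded.shape == (1440, 1440)            # square input for the engine
offset = int((1440 - 1000) / 2)                # PredictImage crops with this same offset
assert padded[offset:offset + 1000, :].shape == img.shape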
Environment
TensorRT Version:
GPU Type: RTX 3060 Laptop
Nvidia Driver Version:
CUDA Version: 11.3, V11.3.58
CUDNN Version: 6.14.11.6050
Operating System + Version: Windows 11
Python Version (if applicable): 3.7.13
TensorFlow Version (if applicable): NO
PyTorch Version (if applicable): NO
Baremetal or Container (if container which image + tag): NO