My environment:
CUDA 11.4
TensorRT: 8.6.0
Language: Python
I am using multi-threading.
Unlike other reports of this issue, I use the official CUDA Python bindings installed with
pip install cuda-python
so I call the runtime like this:
from cuda import cuda, cudart
not
import pycuda.driver as cuda
The TRT_LOGGER / cuda_call / HostDeviceMem helpers used in the code are sketched right after the imports. My core code is as follows:
import os
import numpy as np
import cv2
import tensorrt as trt
from cuda import cuda, cudart
from typing import Optional, List, Tuple, Union
from pathlib import Path
import ctypes
import onnx
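# Note: the class below also uses TRT_LOGGER, cuda_call and HostDeviceMem, which are
# not shown above. My versions follow NVIDIA's TensorRT Python sample common.py;
# a simplified sketch (the real versions may differ in details):
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def cuda_call(call):
    # cuda-python APIs return a tuple whose first element is the status code
    err, *res = call
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA driver error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA runtime error: {}".format(err))
    return res[0] if len(res) == 1 else res

class HostDeviceMem:
    """Page-locked host buffer paired with a device buffer of the same size."""
    def __init__(self, size: int, dtype: np.dtype):
        self.nbytes = size * dtype.itemsize
        host_ptr = cuda_call(cudart.cudaHostAlloc(self.nbytes, cudart.cudaHostAllocDefault))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        # expose the pinned allocation to numpy so np.copyto() works on it
        self.host = np.ctypeslib.as_array(ctypes.cast(host_ptr, pointer_type), (size,))
        self.device = cuda_call(cudart.cudaMalloc(self.nbytes))

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))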
class TensorRTEngine(object):
def __init__(self, onnx_file=None, trt_file=None, trt_data_file=None, gpu_id=0, channel_num=3, num_classes=1,conf_thresh=0.1,
nms_thresh=0.65, image_height=640, image_width=640, batch_size=1, yolo_version = 8): #image_height & image_width
ret = cudart.cudaSetDevice(gpu_id)
# self.batch_size = batch_size
# self.channel_num = channel_num
# self.num_classes = num_classes # defect class number
self.conf_thresh = conf_thresh
self.nms_threash = nms_thresh
# self.image_height = image_height
# self.image_width = image_width
self.trt_file = trt_file
self.yolo_version = yolo_version
self.trt_data_file = trt_data_file
self.class_names = self.get_class_names()
self.aoi_flag = False
if os.path.exists(trt_file):
self.engine = self.get_engine()
else:
self.engine = self.load_engine(onnx_file)
# input = [16,1,1280,1280]
# v4-tiny: output box = [16,100800,1,4] output confs = [16,100800,2]
# v7-tiny: output box = [16,XXX,1,7]
# v8s-det: input [16, 3, 640, 640]
        # v8s-det: output: [16, 7, 8400] 7 = [x,y,w,h,cls1,cls2,cls3], 8400 = (640/8)^2 + (640/16)^2 + (640/32)^2
self.input_shape, self.output_shape = self.infer_shape()
# self.batch_size = batch_size
# nchw,clss_num
self.input_batch, self.input_channel_num = self.input_shape[:2] # batch, channel_num
self.input_height, self.input_width = self.input_shape[2:] # h, w
self.image_height, self.image_width = self.input_shape[2:]
self.num_classes = self.output_shape[1] - 4 # defect class number
        self.ration = 0.0, 0.0  # ratio between the original image size and the model input size
self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
self.context = self.engine.create_execution_context()
def delTRT(self):
self.free_buffers(self.inputs, self.outputs, self.stream)
del self.stream
del self.inputs
del self.outputs
del self.context
del self.engine
# cudart.cudaDeviceReset()
def get_class_names(self):
data_file = open(self.trt_data_file, "r")
data_file_lines = data_file.readlines()
data_file_dict = {}
for line in data_file_lines:
line = line.strip("\n")
field, value = line.split("=")
data_file_dict[field] = value
trt_name_file = data_file_dict["names"]
data_file.close()
name_file = open(trt_name_file, "r", encoding="utf-8")
class_names = [line.strip("\n") for line in name_file.readlines()]
return class_names
def get_engine(self):
# If a serialized engine exists, use it instead of building an engine.
print("Reading engine from file {}".format(self.trt_file))
# TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# trt.init_libnvinfer_plugins(TRT_LOGGER, namespace='')
trt_runtime = trt.Runtime(TRT_LOGGER)
with open(self.trt_file, "rb") as f:
return trt_runtime.deserialize_cuda_engine(f.read())
def load_engine(self,onnx_file_path):
        # store the relevant I/O shapes
# self.get_onnx_input_output_sizes(onnx_file_path)
# TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# trt.init_libnvinfer_plugins(TRT_LOGGER, namespace='')
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
"""Takes an ONNX file and creates a TensorRT engine to run inference with"""
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
EXPLICIT_BATCH
) as network, builder.create_builder_config() as config, trt.OnnxParser(
network, TRT_LOGGER
) as parser, trt.Runtime(
TRT_LOGGER
) as runtime:
            # 256 MiB workspace; set_memory_pool_limit replaces max_workspace_size,
            # which is deprecated in TensorRT 8.x (builder.max_batch_size is likewise
            # deprecated and ignored for explicit-batch networks, so it is not set here)
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 28)
# Parse model file
if not os.path.exists(onnx_file_path):
print(
"ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(onnx_file_path)
)
exit(0)
print("Loading ONNX file from path {}...".format(onnx_file_path))
with open(onnx_file_path, "rb") as model:
print("Beginning ONNX file parsing")
if not parser.parse(model.read()):
print("ERROR: Failed to parse the ONNX file.")
for error in range(parser.num_errors):
print(parser.get_error(error))
return None
# The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1
# network.get_input(0).shape = [512, 1, 192, 128]
# network.get_input(0).shape = input_shape
print("Completed parsing of ONNX file")
print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
plan = builder.build_serialized_network(network, config)
engine = runtime.deserialize_cuda_engine(plan)
print("Completed creating Engine")
with open(self.trt_file, "wb") as f:
f.write(plan)
return engine
def infer_shape(self):
for binding in self.engine:
if self.engine.binding_is_input(binding):
input_shape = self.engine.get_binding_shape(binding)
else:
output_shape = self.engine.get_binding_shape(binding)
return input_shape, output_shape
def get_engine_io_info(self):
        # n, c, h, w, class_num
        return [self.input_batch, self.input_channel_num, self.input_height, self.input_width, self.num_classes]
def allocate_buffers(self,profile_idx: Optional[int] = None):
inputs = []
outputs = []
bindings = []
stream = cuda_call(cudart.cudaStreamCreate())
tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
for binding in tensor_names:
# get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
# Pick out the max shape to allocate enough memory for the binding.
shape = self.engine.get_tensor_shape(binding) if profile_idx is None else \
self.engine.get_tensor_profile_shape(binding, profile_idx)[-1]
shape_valid = np.all([s >= 0 for s in shape])
if not shape_valid and profile_idx is None:
raise ValueError(f"Binding {binding} has dynamic shape, " + \
"but no profile was specified.")
size = trt.volume(shape)
if self.engine.has_implicit_batch_dimension:
size *= self.engine.max_batch_size
dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(binding)))
# Allocate host and device buffers
bindingMemory = HostDeviceMem(size, dtype)
# Append the device buffer to device bindings.
bindings.append(int(bindingMemory.device))
# Append to the appropriate list.
if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
inputs.append(bindingMemory)
else:
outputs.append(bindingMemory)
return inputs, outputs, bindings, stream
def free_buffers(self,inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
for mem in inputs + outputs:
mem.free()
cuda_call(cudart.cudaStreamDestroy(stream))
def do_inference_base(self,inputs, outputs, stream, execute_async):
# Transfer input data to the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
[cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
# Run inference.
execute_async()
# Transfer predictions back from the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]  # not saved yet
# Synchronize the stream
cuda_call(cudart.cudaStreamSynchronize(stream))
# Return only the host outputs.
return [out.host for out in outputs]
def do_inference(self,context, bindings, inputs, outputs, stream):
def execute_async():
            context.execute_async_v2(bindings=bindings, stream_handle=stream)
return self.do_inference_base(inputs, outputs, stream, execute_async)
def preprocess(self, data_list):
img_in_array = [self.mono_image_preprocess(mono_data) for mono_data in data_list]
imgs_in = np.array(img_in_array)
        # (batch=1, channel=3, height=640, width=640)
return imgs_in
def postprocess_v8_det(self, trt_outputs):
trt_outputs[0] = trt_outputs[0].reshape(self.input_batch, 4 +self.num_classes, -1)
boxes = self.post_processing_v8_det(self.conf_thresh, self.nms_threash, trt_outputs[0])
return boxes
def inference(self, data):
np.copyto(self.inputs[0].host, self.preprocess(data).ravel())
trt_outputs = self.do_inference(self.context, bindings=self.bindings,
inputs=self.inputs,
outputs=self.outputs,
stream=self.stream)
if self.yolo_version == 8:
output = self.postprocess_v8_det(trt_outputs)
return output
else:
output = None
            print("Unsupported YOLO version!")
return output
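For context, inference is driven from worker threads. Below is only a minimal sketch of the calling pattern (file names, thread count, and the thread wiring are placeholders, not my real code):

import threading
import cv2

def detect_worker(thread_id):
    # placeholder wiring: one engine per thread, dummy file names
    engine = TensorRTEngine(onnx_file="model.onnx", trt_file="model.trt",
                            trt_data_file="model.data", gpu_id=0)
    img = cv2.imread("test_{}.jpg".format(thread_id))
    boxes = engine.inference([img])
    print(thread_id, boxes)
    engine.delTRT()

threads = [threading.Thread(target=detect_worker, args=(i,)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()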
The bug appears in def inference(self, data).
The errors are as follows: