Description
A clear and concise description of the bug or issue.
Environment
TensorRT Version: 8.2.1.8
GPU Type: 3090
Nvidia Driver Version: 516.59
CUDA Version: 11.3
CUDNN Version: 8.2 for 11.x
Operating System + Version: win10
Python Version (if applicable): 3.8
TensorFlow Version (if applicable): 2.9
PyTorch Version (if applicable): 1.10
Baremetal or Container (if container which image + tag):
Relevant Files
model_file
链接:百度网盘 请输入提取码
提取码:hqu5
Steps To Reproduce
"""
An example that uses TensorRT's Python api to make inferences.
"""
import os
import shutil
import cv2
import numpy as np
import torch
import time
import pycuda.driver as cuda
import tensorrt as trt
from collections import OrderedDict,namedtuple
class YoLov7TRT(object):
"""
description: A YOLOv7 class that warps TensorRT ops, preprocess and postprocess ops.
"""
def __init__(self, engine_file_path):
# Create a Context on this device,
self.imgsz = [960, 960]
self.ctx = cuda.Device(0).make_context()
stream = cuda.Stream()
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
self.device = "cuda:0"
# Deserialize the engine from file
with open(engine_file_path, "rb") as f:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr', 'is_input'))
bindings = OrderedDict()
for binding in engine:
print('bingding:', binding, engine.get_binding_shape(binding))
ind = engine.get_binding_index(binding)
name = engine.get_binding_name(ind)
dtype = trt.nptype(engine.get_binding_dtype(ind))
shape = context.get_binding_shape(ind)
data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(self.device)
# Append to the appropriate list.
is_input = engine.binding_is_input(binding)
if is_input:
inp_dtype = dtype
bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()), is_input)
print('{:<20s}{:^30s}{:^20s}{:>20s}'.format(name, str(dtype), str(shape), str(is_input)))
# Store
self.stream = stream
self.context = context
self.engine = engine
self.bindings = bindings
self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())
self.batch_size = engine.max_batch_size
self.input_dtype = inp_dtype
if 'float16' in str(self.input_dtype):
self.half = True
else:
self.half = False
def inference(self, img):
b, c, h, w = (1, 3, 960, 960)
self.ctx.push()
stream = self.stream
self.binding_addrs[self.engine.get_binding_name(0)] = int(img.data_ptr())
# Run inference.
# self.context.execute_async_v2(bindings=list(self.binding_addrs.values()), stream_handle=stream.handle)
self.context.execute_v2(bindings=list(self.binding_addrs.values()))
# Transfer predictions back from the GPU.
result = []
for name in self.bindings:
if not self.bindings[name].is_input:
result.append(self.bindings[name].data)
self.ctx.pop()
if len(result) > 1:
nmsed_indices, nmsed_boxes, nmsed_poses, nmsed_scores = result
nmsed_indices = nmsed_indices.reshape(b, -1, 3)
nmsed_boxes = nmsed_boxes.reshape(b, -1, 4)
nmsed_poses = nmsed_poses.reshape(b, -1, 69)
nmsed_scores = nmsed_scores.reshape(b, -1, 1)
nmsed_confes = torch.ones_like(nmsed_scores).to(nmsed_scores.device)
keep = torch.unique(nmsed_indices[..., 2]).numel()
if torch.any(torch.isnan(nmsed_indices[..., 2])) or torch.all(nmsed_indices[..., 2] < 0):
keep = 0
out = torch.cat([nmsed_boxes, nmsed_scores, nmsed_confes, nmsed_poses], axis=-1)
out = out[:, :keep, :]
else:
out = result[0]
out = out.reshape(b, -1, 75)
return out
def __del__(self):
# del self.inputs
# del self.outputs
del self.stream
self.ctx.detach() # 2. 实例释放时需要detech cuda上下文
def preprocess_image(raw_bgr_image):
"""
description: Convert BGR image to RGB,
resize and pad it to target size, normalize to [0,1],
transform to NCHW format.
param:
input_image_path: str, image path
return:
image: the processed image
image_raw: the original image
h: original height
w: original width
"""
input_w = 960
input_h = 960
image_raw = raw_bgr_image
h, w, c = image_raw.shape
image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
# Calculate widht and height and paddings
r_w = input_w / w
r_h = input_h / h
if r_h > r_w:
tw = input_w
th = int(r_w * h)
tx1 = tx2 = 0
ty1 = int((input_h - th) / 2)
ty2 = input_h - th - ty1
else:
tw = int(r_h * w)
th = input_h
tx1 = int((input_w - tw) / 2)
tx2 = input_w - tw - tx1
ty1 = ty2 = 0
# Resize the image with long side while maintaining ratio
image = cv2.resize(image, (tw, th))
# Pad the short side with (128,128,128)
image = cv2.copyMakeBorder(
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
)
image = image.astype(np.float32)
# Normalize to [0,1]
image /= 255.0
# HWC to CHW format:
image = np.transpose(image, [2, 0, 1])
# CHW to NCHW format
image = np.expand_dims(image, axis=0)
# Convert the image to row-major order, also known as "C order":
image = np.ascontiguousarray(image)
# return image, image_raw, h, w
return image, image_raw
def main():
# load custom plugin and engine
engine_file_path = r"E:\workspace\yolov7-pose-tensorrt\weights\best.trt"
categories = ["person"]
names = ['person']
device = "cuda:0"
# a YoLov7TRT instance
# try:
yolov7_wrapper = YoLov7TRT(engine_file_path)
# try:
batch_size = yolov7_wrapper.batch_size
print('batch size is', yolov7_wrapper.batch_size)
image_dir = r"E:\workspace\yolov7-pose-tensorrt\data\image\person.png"
# image_dir = r"E:\workspace\yolov7-pose-tensorrt\data\video\2.jpg"
# image, image_raw = preprocess(image_dir, device)
img = cv2.imread(image_dir)
image, image_raw = preprocess_image(img)
if isinstance(image, np.ndarray):
img_src = torch.from_numpy(image)
else:
img_src = image
image = img_src.to(device)
for i in range(5):
start = time.time()
for j in range(100):
results = yolov7_wrapper.inference(image)
print("use time",time.time()-start)
if __name__ == "__main__":
main()
error
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
Process finished with exit code -1073740791 (0xC0000409)
Please include:
- Exact steps/commands to build your repro
- Exact steps/commands to run your repro
- Full traceback of errors encountered