Why i add context.pop(), it's also report "PyCUDA ERROR: The context stack was not empty upon module cleanup."

Description

A clear and concise description of the bug or issue.

Environment

TensorRT Version: 8.2.1.8
GPU Type: 3090
Nvidia Driver Version: 516.59
CUDA Version: 11.3
CUDNN Version: 8.2 for 11.x
Operating System + Version: win10
Python Version (if applicable): 3.8
TensorFlow Version (if applicable): 2.9
PyTorch Version (if applicable): 1.10
Baremetal or Container (if container which image + tag):

Relevant Files

model_file
链接:百度网盘 请输入提取码
提取码:hqu5

Steps To Reproduce

"""
An example that uses TensorRT's Python api to make inferences.
"""

import os
import shutil
import cv2
import numpy as np
import torch
import time
import pycuda.driver as cuda
import tensorrt as trt
from collections import OrderedDict,namedtuple



class YoLov7TRT(object):
    """
    description: A YOLOv7 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.imgsz = [960, 960]
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        self.device = "cuda:0"
        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr', 'is_input'))
        bindings = OrderedDict()
        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            ind = engine.get_binding_index(binding)
            name = engine.get_binding_name(ind)
            dtype = trt.nptype(engine.get_binding_dtype(ind))
            shape = context.get_binding_shape(ind)
            data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(self.device)
            # Append to the appropriate list.
            is_input = engine.binding_is_input(binding)
            if is_input:
                inp_dtype = dtype
            bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()), is_input)
            print('{:<20s}{:^30s}{:^20s}{:>20s}'.format(name, str(dtype), str(shape), str(is_input)))

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine

        self.bindings = bindings
        self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())
        self.batch_size = engine.max_batch_size
        self.input_dtype = inp_dtype
        if 'float16' in str(self.input_dtype):
            self.half = True
        else:
            self.half = False

    def inference(self, img):
        b, c, h, w = (1, 3, 960, 960)
        self.ctx.push()
        stream = self.stream
        self.binding_addrs[self.engine.get_binding_name(0)] = int(img.data_ptr())
        # Run inference.
        # self.context.execute_async_v2(bindings=list(self.binding_addrs.values()), stream_handle=stream.handle)
        self.context.execute_v2(bindings=list(self.binding_addrs.values()))
        # Transfer predictions back from the GPU.
        result = []
        for name in self.bindings:
            if not self.bindings[name].is_input:
                result.append(self.bindings[name].data)

        self.ctx.pop()
        if len(result) > 1:
            nmsed_indices, nmsed_boxes, nmsed_poses, nmsed_scores = result
            nmsed_indices = nmsed_indices.reshape(b, -1, 3)
            nmsed_boxes = nmsed_boxes.reshape(b, -1, 4)
            nmsed_poses = nmsed_poses.reshape(b, -1, 69)

            nmsed_scores = nmsed_scores.reshape(b, -1, 1)
            nmsed_confes = torch.ones_like(nmsed_scores).to(nmsed_scores.device)
            keep = torch.unique(nmsed_indices[..., 2]).numel()
            if torch.any(torch.isnan(nmsed_indices[..., 2])) or torch.all(nmsed_indices[..., 2] < 0):
                keep = 0
            out = torch.cat([nmsed_boxes, nmsed_scores, nmsed_confes, nmsed_poses], axis=-1)
            out = out[:, :keep, :]
        else:
            out = result[0]
        out = out.reshape(b, -1, 75)

        return out

    def __del__(self):
        # del self.inputs
        # del self.outputs
        del self.stream
        self.ctx.detach()  # 2. 实例释放时需要detech cuda上下文


def preprocess_image(raw_bgr_image):
    """
    description: Convert BGR image to RGB,
                 resize and pad it to target size, normalize to [0,1],
                 transform to NCHW format.
    param:
        input_image_path: str, image path
    return:
        image:  the processed image
        image_raw: the original image
        h: original height
        w: original width
    """
    input_w = 960
    input_h = 960
    image_raw = raw_bgr_image
    h, w, c = image_raw.shape
    image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
    # Calculate widht and height and paddings
    r_w = input_w / w
    r_h = input_h / h
    if r_h > r_w:
        tw = input_w
        th = int(r_w * h)
        tx1 = tx2 = 0
        ty1 = int((input_h - th) / 2)
        ty2 = input_h - th - ty1
    else:
        tw = int(r_h * w)
        th = input_h
        tx1 = int((input_w - tw) / 2)
        tx2 = input_w - tw - tx1
        ty1 = ty2 = 0
    # Resize the image with long side while maintaining ratio
    image = cv2.resize(image, (tw, th))
    # Pad the short side with (128,128,128)
    image = cv2.copyMakeBorder(
        image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
    )
    image = image.astype(np.float32)
    # Normalize to [0,1]
    image /= 255.0
    # HWC to CHW format:
    image = np.transpose(image, [2, 0, 1])
    # CHW to NCHW format
    image = np.expand_dims(image, axis=0)
    # Convert the image to row-major order, also known as "C order":
    image = np.ascontiguousarray(image)
    # return image, image_raw, h, w
    return image, image_raw





def main():
    # load custom plugin and engine
    engine_file_path = r"E:\workspace\yolov7-pose-tensorrt\weights\best.trt"
    categories = ["person"]
    names = ['person']
    device = "cuda:0"
    # a YoLov7TRT instance
    # try:

    yolov7_wrapper = YoLov7TRT(engine_file_path)
    # try:
    batch_size = yolov7_wrapper.batch_size
    print('batch size is', yolov7_wrapper.batch_size)

    image_dir = r"E:\workspace\yolov7-pose-tensorrt\data\image\person.png"
    # image_dir = r"E:\workspace\yolov7-pose-tensorrt\data\video\2.jpg"
    # image, image_raw = preprocess(image_dir, device)
    img = cv2.imread(image_dir)
    image, image_raw = preprocess_image(img)
    if isinstance(image, np.ndarray):

        img_src = torch.from_numpy(image)

    else:
        img_src = image
    image = img_src.to(device)
    for i in range(5):
        start = time.time()
        for j in range(100):
            results = yolov7_wrapper.inference(image)
        print("use time",time.time()-start)

    
    



if __name__ == "__main__":
 
    main()
   

error

-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------

Process finished with exit code -1073740791 (0xC0000409)

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered

Hi,

We recommend you the following,

Thank you.