Error Code 1: Cask (Cask convolution execution)

My environment:
CUDA: 11.4
TensorRT: 8.6.0
Language: Python

I do use multi-threading. Unlike other reports of this bug, I installed the CUDA bindings with

pip install cuda-python

so I call them as

from cuda import cuda, cudart

and not

import pycuda.driver as cuda
My core code is as follows:

import os
import numpy as np
import cv2
import tensorrt as trt
from cuda import cuda, cudart
from typing import Optional, List, Tuple, Union
from pathlib import Path

import ctypes
import onnx
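
# Note: the code below relies on a few helpers that are not shown in this post.
# This is a minimal sketch of them, adapted from the common.py utilities that ship
# with the TensorRT Python samples; my real file may differ slightly.

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="")

def cuda_call(call):
    # cuda-python calls return (error_code, *results); raise on error and unwrap the result(s).
    err, *res = call
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA runtime error: {}".format(err))
    if len(res) == 0:
        return None
    return res[0] if len(res) == 1 else res

class HostDeviceMem:
    # Pairs a pinned host buffer (exposed as a numpy array) with a device buffer.
    def __init__(self, size, dtype):
        nbytes = size * dtype.itemsize
        host_ptr = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        self.host = np.ctypeslib.as_array(ctypes.cast(host_ptr, pointer_type), (size,))
        self.device = cuda_call(cudart.cudaMalloc(nbytes))
        self.nbytes = nbytes

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))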

class TensorRTEngine(object):
    def __init__(self, onnx_file=None, trt_file=None, trt_data_file=None, gpu_id=0, channel_num=3, num_classes=1,conf_thresh=0.1,
                 nms_thresh=0.65, image_height=640, image_width=640, batch_size=1, yolo_version = 8):  #image_height & image_width 
        ret = cudart.cudaSetDevice(gpu_id)
        # self.batch_size = batch_size
        # self.channel_num = channel_num
        # self.num_classes = num_classes  # defect class number
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        # self.image_height = image_height
        # self.image_width = image_width
        self.trt_file = trt_file
        self.yolo_version = yolo_version
        self.trt_data_file = trt_data_file
        self.class_names = self.get_class_names()
        self.aoi_flag = False 
        
        if os.path.exists(trt_file):
            self.engine = self.get_engine()
        else:                                       
            self.engine = self.load_engine(onnx_file)
        
        # input = [16,1,1280,1280]
        # v4-tiny: output box = [16,100800,1,4]    output confs = [16,100800,2]
        # v7-tiny: output box = [16,XXX,1,7]
        # v8s-det: input  [16, 3, 640, 640]
        # v8s-det: output: [16, 7, 8400], 7 = [x,y,w,h,cls1,cls2,cls3], 8400 = (640/8)^2 + (640/16)^2 + (640/32)^2
        self.input_shape, self.output_shape = self.infer_shape()
        # self.batch_size = batch_size

        # nchw, class_num
        self.input_batch, self.input_channel_num = self.input_shape[:2]  # batch, channel_num
        self.input_height, self.input_width = self.input_shape[2:]       # h, w
        self.image_height, self.image_width = self.input_shape[2:]
        self.num_classes = self.output_shape[1] - 4                      # defect class number
        self.ratio = 0.0, 0.0  # ratio between the original image size and the model input size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()

    def delTRT(self):
        self.free_buffers(self.inputs, self.outputs, self.stream)
        del self.stream
        del self.inputs
        del self.outputs
        del self.context
        del self.engine
        # cudart.cudaDeviceReset()

    def get_class_names(self):
        data_file = open(self.trt_data_file, "r")
        data_file_lines = data_file.readlines()
        data_file_dict = {}
        for line in data_file_lines:
            line = line.strip("\n")
            field, value = line.split("=")
            data_file_dict[field] = value
        trt_name_file = data_file_dict["names"]
        data_file.close()
        name_file = open(trt_name_file, "r", encoding="utf-8")
        class_names = [line.strip("\n") for line in name_file.readlines()]
        return class_names

    def get_engine(self):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(self.trt_file))

        # TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        # trt.init_libnvinfer_plugins(TRT_LOGGER, namespace='')

        trt_runtime = trt.Runtime(TRT_LOGGER)
        with open(self.trt_file, "rb") as f:
            return trt_runtime.deserialize_cuda_engine(f.read())

    def load_engine(self, onnx_file_path):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        # record the relevant I/O shapes
        # self.get_onnx_input_output_sizes(onnx_file_path)

        # TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        # trt.init_libnvinfer_plugins(TRT_LOGGER, namespace='')

        EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            EXPLICIT_BATCH
        ) as network, builder.create_builder_config() as config, trt.OnnxParser(
            network, TRT_LOGGER
        ) as parser, trt.Runtime(
            TRT_LOGGER
        ) as runtime:
            config.max_workspace_size = 1 << 28  # 256MiB
            builder.max_batch_size = 1
            # Parse model file
            if not os.path.exists(onnx_file_path):
                print(
                    "ONNX file {} not found, please run yolov3_to_onnx.py first to generate it.".format(onnx_file_path)
                )
                exit(0)
            print("Loading ONNX file from path {}...".format(onnx_file_path))
            with open(onnx_file_path, "rb") as model:
                print("Beginning ONNX file parsing")
                if not parser.parse(model.read()):
                    print("ERROR: Failed to parse the ONNX file.")
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
                    return None
            # The actual yolov3.onnx is generated with batch size 64. Reshape input to batch size 1
            # network.get_input(0).shape = [512, 1, 192, 128]
            # network.get_input(0).shape = input_shape
            print("Completed parsing of ONNX file")
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            plan = builder.build_serialized_network(network, config)
            engine = runtime.deserialize_cuda_engine(plan)
            print("Completed creating Engine")
            with open(self.trt_file, "wb") as f:
                f.write(plan)
            return engine

    def infer_shape(self):
        for binding in self.engine:
            if self.engine.binding_is_input(binding):
                input_shape = self.engine.get_binding_shape(binding)
            else:
                output_shape = self.engine.get_binding_shape(binding)

        return input_shape, output_shape

    def get_engine_io_info(self):
        # nchw, class_num
        return [self.input_batch, self.input_channel_num, self.input_height, self.input_width, self.num_classes]

    def allocate_buffers(self, profile_idx: Optional[int] = None):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda_call(cudart.cudaStreamCreate())
        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        for binding in tensor_names:
            # get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
            # Pick out the max shape to allocate enough memory for the binding.
            shape = self.engine.get_tensor_shape(binding) if profile_idx is None else \
                self.engine.get_tensor_profile_shape(binding, profile_idx)[-1]
            shape_valid = np.all([s >= 0 for s in shape])
            if not shape_valid and profile_idx is None:
                raise ValueError(f"Binding {binding} has dynamic shape, " + \
                                 "but no profile was specified.")
            size = trt.volume(shape)
            if self.engine.has_implicit_batch_dimension:
                size *= self.engine.max_batch_size
            dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(binding)))

            # Allocate host and device buffers
            bindingMemory = HostDeviceMem(size, dtype)

            # Append the device buffer to device bindings.
            bindings.append(int(bindingMemory.device))

            # Append to the appropriate list.
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                inputs.append(bindingMemory)
            else:
                outputs.append(bindingMemory)
        return inputs, outputs, bindings, stream

    def free_buffers(self, inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
        for mem in inputs + outputs:
            mem.free()
        cuda_call(cudart.cudaStreamDestroy(stream))

    def do_inference_base(self, inputs, outputs, stream, execute_async):
        # Transfer input data to the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
        # Run inference.
        execute_async()
        # Transfer predictions back from the GPU.
        kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]  # not saved
        # Synchronize the stream
        cuda_call(cudart.cudaStreamSynchronize(stream))
        # Return only the host outputs.
        return [out.host for out in outputs]

    def do_inference(self, context, bindings, inputs, outputs, stream):
        def execute_async():
            context.execute_async_v2(bindings=bindings, stream_handle=stream)

        return self.do_inference_base(inputs, outputs, stream, execute_async)

    def preprocess(self, data_list):
        img_in_array = [self.mono_image_preprocess(mono_data) for mono_data in data_list]
        imgs_in = np.array(img_in_array)
        # (batch=1, channel=3, height=640, width=640)
        return imgs_in

    def postprocess_v8_det(self, trt_outputs):
        trt_outputs[0] = trt_outputs[0].reshape(self.input_batch, 4 + self.num_classes, -1)
        boxes = self.post_processing_v8_det(self.conf_thresh, self.nms_thresh, trt_outputs[0])
        return boxes

    def inference(self, data):
        np.copyto(self.inputs[0].host, self.preprocess(data).ravel())

        trt_outputs = self.do_inference(self.context, bindings=self.bindings,
                                        inputs=self.inputs,
                                        outputs=self.outputs,
                                        stream=self.stream)
        if self.yolo_version == 8:
            output = self.postprocess_v8_det(trt_outputs)
            return output
        else:
            output = None
            print("Unsupported YOLO version!")
        return output
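
Roughly how the class is driven (a simplified sketch of my call pattern, not the exact production script; the file names, image paths, thread count, and the way the engine object is shared between threads are placeholders):

import threading
import cv2

def worker(engine, image_paths):
    # Each worker thread pushes its images through the same TensorRTEngine instance.
    for path in image_paths:
        img = cv2.imread(path)
        boxes = engine.inference([img])
        print(path, boxes)

if __name__ == "__main__":
    engine = TensorRTEngine(onnx_file="best.onnx",
                            trt_file="best.trt",
                            trt_data_file="trt.data",
                            gpu_id=0)
    # Two threads calling into one engine/context/stream; the
    # "Cask convolution execution" error is raised inside engine.inference().
    t1 = threading.Thread(target=worker, args=(engine, ["img_0.jpg"]))
    t2 = threading.Thread(target=worker, args=(engine, ["img_1.jpg"]))
    t1.start(); t2.start()
    t1.join(); t2.join()
    engine.delTRT()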

The bug appears in def inference(self, data).
The errors are as below:

@mati-nvidia

Hi @colave,
Can you please help us with the model and a repro script?

OK, below is best.onnx.
By the way, I import with
from cuda import cuda, cudart
not
import pycuda.driver as cuda
Is the solution to this problem no longer effective?
qa.zip (36.0 MB)