TensorRT Error

After loading a TensorRT engine with 3 inputs and 1 output, I wrote code that allocates memory for all 4 bindings at once, but I get the following error.

The first input is an image of shape (1, 3, 512, 512).
The second and third inputs are floating-point values of type np.float32, and the fourth binding is the output, which also has shape (1, 3, 512, 512).
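
To make the sizes concrete, this is roughly what the four bindings amount to in memory (a rough sketch, assuming every binding is float32 and that the two extra inputs are single scalars):

import numpy as np

# Rough per-binding sizes; assumes float32 everywhere and scalar gamma/strength
itemsize = np.dtype(np.float32).itemsize                  # 4 bytes
image_bytes  = 1 * 3 * 512 * 512 * itemsize               # 3,145,728 bytes (~3 MB) for the image input
scalar_bytes = 1 * itemsize                               # 4 bytes each for the two scalar inputs
output_bytes = 1 * 3 * 512 * 512 * itemsize               # same ~3 MB for the output
print(image_bytes, scalar_bytes, output_bytes)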

I don't understand why the following error occurs.

[TRT] [E] 1: [convBaseRunner.cpp::execute::295] Error Code 1: Cask (Cask convolution execution)
[TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Error Code 1: Cuda Runtime (invalid resource handle)

Environment

TensorRT Version: 8.6.1.6
GPU Type: GeForce RTX 3090
Nvidia Driver Version: 11.4
CUDA Version: 11.3
CUDNN Version: 8.2
Operating System + Version: Ubuntu 20.04.5 LTS
Python Version (if applicable): 3.7.4
PyTorch Version (if applicable): 1.12.1+cu113

This is the original code.

Import necessary libraries and modules

import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit

cuda.init()

Define a class for managing host and device memory

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

Define a class for managing a TensorRT model

class TrtModel:
    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
            print("size:", size)
            print("binding:", self.engine.get_binding_shape(binding))

            host_mem = cuda.pagelocked_empty(size, self.dtype)
            print(f"host_mem : {host_mem}")
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            print(f"device_mem : {device_mem}")
            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream

    def __call__(self, frame, gamma, strength):
        frame = np.ascontiguousarray(frame)
        gamma = np.ascontiguousarray(gamma)
        strength = np.ascontiguousarray(strength)

        cuda.memcpy_htod_async(self.inputs[0].device, frame, self.stream)
        cuda.memcpy_htod_async(self.inputs[1].device, gamma, self.stream)
        cuda.memcpy_htod_async(self.inputs[2].device, strength, self.stream)

        self.context.execute_async_v2(
            bindings=[int(d_inp.device) for d_inp in self.inputs]
                     + [int(d_out.device) for d_out in self.outputs],
            stream_handle=self.stream.handle)
        # Alternative I also tried: passing the precomputed bindings list instead
        # self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        print(f"Bindings: {self.bindings}")

        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

        self.stream.synchronize()
        print(f"Output shape: {self.outputs[0].host.shape}")

        return [out.host.copy() for out in self.outputs]
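
For completeness, this is roughly how I use the class (the engine path and the input values below are placeholders, not my real data):

# Placeholder usage sketch; "model.engine" and the gamma/strength values are examples only
model = TrtModel("model.engine")
frame = np.random.rand(1, 3, 512, 512).astype(np.float32)   # dummy image input
gamma = np.array([1.0], dtype=np.float32)
strength = np.array([0.5], dtype=np.float32)
result = model(frame, gamma, strength)
print(result[0].shape)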

How can I solve this error?