After loading a TensorRT engine with 3 inputs and 1 output, I wrote code that allocates memory for all 4 bindings at once, but I get the error below.
The first input is an image of shape (1, 3, 512, 512).
The second and third inputs are single floating-point values of type np.float32, and the fourth binding is the output, with shape (1, 3, 512, 512).
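For example, the inputs are prepared along these lines (the values below are placeholders; only the shapes and dtypes match my actual data):

import numpy as np

frame = np.random.rand(1, 3, 512, 512).astype(np.float32)  # image input
gamma = np.array([1.0], dtype=np.float32)                   # placeholder scalar value
strength = np.array([0.5], dtype=np.float32)                # placeholder scalar value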
I don't understand why the following error occurs.
[TRT] [E] 1: [convBaseRunner.cpp::execute::295] Error Code 1: Cask (Cask convolution execution)
[TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Error Code 1: Cuda Runtime (invalid resource handle)
Environment
TensorRT Version: 8.6.1.6
GPU Type: GeForce RTX 3090
Nvidia Driver Version: 11.4
CUDA Version: 11.3
CUDNN Version: 8.2
Operating System + Version: Ubuntu 20.04.5 LTS
Python Version (if applicable): 3.7.4
PyTorch Version (if applicable): 1.12.1+cu113
Here is the original code:
# Import necessary libraries and modules
import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit  # initializes CUDA and creates a context on import
cuda.init()  # redundant here: pycuda.autoinit already calls cuda.init()
# Define a class for managing host and device memory
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Define a class for managing a TensorRT model
class TrtModel:
    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            # Page-locked host buffer and matching device buffer for each binding
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
            print("size:", size)
            print("binding:", self.engine.get_binding_shape(binding))
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            print(f"host_mem : {host_mem}")
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            print(f"device_mem : {device_mem}")
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    def __call__(self, frame, gamma, strength):
        frame = np.ascontiguousarray(frame)
        gamma = np.ascontiguousarray(gamma)
        strength = np.ascontiguousarray(strength)
        # Copy the three inputs host -> device
        cuda.memcpy_htod_async(self.inputs[0].device, frame, self.stream)
        cuda.memcpy_htod_async(self.inputs[1].device, gamma, self.stream)
        cuda.memcpy_htod_async(self.inputs[2].device, strength, self.stream)
        # Run inference asynchronously on the stream
        self.context.execute_async_v2(bindings=[int(d_inp.device) for d_inp in self.inputs] + [int(d_out.device) for d_out in self.outputs], stream_handle=self.stream.handle)
        #bindings = [int(d_inp.device) for d_inp in self.inputs] + [int(d_out.device) for d_out in self.outputs]
        #print(f"Bindings: {bindings}")
        #self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        print(f"Bindings: {self.bindings}")
        # Copy the output device -> host and wait for all work to finish
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        print(f"Output shape: {self.outputs[0].host.shape}")
        #return [out.host.reshape((-1)) for out in self.outputs]
        return [out.host.copy() for out in self.outputs]
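For completeness, this is how I create and call the model (the engine path below is a placeholder):

model = TrtModel("my_model.engine")      # placeholder path to the serialized engine
outputs = model(frame, gamma, strength)  # inputs prepared as shown above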
How can I solve this error?