I have 3 scripts:
1- My main script, where I load a TRT engine that has 2 inputs and 1 output and then reads the two types of inputs (here I am just creating random tensors with the same shapes as the real inputs). The audio_data tensors need to be moved to the GPU so I can preprocess them with torchaudio (there is no MKL support for ARM CPUs) and then moved back to the CPU to pass them to the TRT engine; a rough sketch of that preprocessing step is shown after the script below.
import torch

from libs.model_classes.trt_model import TRTMODEL

model = TRTMODEL("./sample.engine", 1)
model._load_model()

for _ in range(10):
    video_data = torch.rand((1, 8, 3, 224, 224))
    audio_data = torch.rand((1, 8, 18, 64)).to("cuda:0")
    input = [video_data, audio_data.to("cpu")]
    preds, preds_label = model._run_inference(input)
    print(preds, preds_label, sep=":")
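In the real pipeline the random audio_data above is replaced by GPU-side torchaudio preprocessing roughly like the sketch below; the MelSpectrogram transform, its parameters, and the waveform shape are just placeholders for illustration, not the actual preprocessing code:

import torch
import torchaudio

# Placeholder for the real preprocessing: run the transform on the GPU because
# torchaudio's CPU path relies on MKL, which is not available on ARM.
mel = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64).to("cuda:0")

waveform = torch.rand((8, 16000), device="cuda:0")  # dummy audio batch
audio_data = mel(waveform)                          # computed on cuda:0
audio_data = audio_data.to("cpu")                   # back to CPU before handing it to the TRT engine

The only point here is that the audio tensor has to live on cuda:0 for that step, which is exactly what seems to trigger the error.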
2- My TRT model class, which initializes the model and passes its parameters to an inference function:
from libs.model_classes.abstract_model_class import ABSTRACT_MODEL
from libs.video_frame_prediction_fun import video_frame_prediction_trt

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np


class TRTMODEL(ABSTRACT_MODEL):
    def __init__(self,
                 model_path: str,
                 batch_size: int):
        self.model_path = model_path
        self.batch_size = batch_size

    def _load_model(self):
        self.runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        with open(self.model_path, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        bindings = [(bind, self.engine.binding_is_input(bind)) for bind in self.engine]  # [(input_bind, True), (output_bind, False)]

        # TRT MODEL DATA
        self.input_shape = []
        self.input_size = []
        self.device_input = []
        # self.output_shape = []
        # self.host_output = []
        # self.device_output = []

        # Create a stream in which to copy inputs/outputs and run inference.
        self.stream = cuda.Stream()

        for bind, isInput in bindings:
            temp_shape = self.engine.get_binding_shape(bind)
            print("[DEBUG] Shape for ", bind, " isInput = ", isInput, " shape: ", temp_shape)
            if isInput:  # input layer
                self.input_shape.append(temp_shape)
                self.input_size.append(trt.volume(self.input_shape[-1]) * self.engine.max_batch_size * np.dtype(np.float32).itemsize)  # in bytes
                self.device_input.append(cuda.mem_alloc(self.input_size[-1]))
            else:  # TODO handle multiple outputs
                self.output_shape = temp_shape
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                self.host_output = cuda.pagelocked_empty(trt.volume(self.output_shape) * self.engine.max_batch_size, dtype=np.float32)
                self.device_output = cuda.mem_alloc(self.host_output.nbytes)

    def _run_inference(self, input_frames):
        preds, preds_label = video_frame_prediction_trt(input_frames, self.device_input, self.device_output,
                                                        self.stream, self.context, self.host_output, self.batch_size)
        return preds, preds_label
3- My inference function:
import numpy as np
import torch
import torch.nn.functional as F
import pycuda.driver as cuda


def video_frame_prediction_trt(input_frames, device_input, device_output, stream, context, host_output, batch_size):
    host_input = []
    for i in range(len(input_frames)):
        # copy each CPU tensor into a contiguous float32 array and upload it to its device buffer
        host_input.append(np.array(input_frames[i].numpy(), dtype=np.float32, order='C'))
        cuda.memcpy_htod_async(device_input[i], host_input[-1], stream)
    # run inference
    bindings_device_list = [int(dev) for dev in device_input]
    bindings_device_list.append(int(device_output))
    context.execute_async(bindings=bindings_device_list, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()
    preds = torch.Tensor(host_output).reshape(batch_size, -1, 2)
    preds = F.softmax(preds, dim=-1)
    _, preds_label = preds.max(dim=-1)
    preds_label = preds_label.reshape(1, -1)[0]
    return preds, preds_label
I get the CUDA runtime error only when I move audio_data to the cuda:0 device; if I don't, everything works as expected. The problem is that I need the audio data on cuda:0, so any suggestions on how to solve this are really appreciated, thank you!
The error that I get is:
[TensorRT] ERROR: 1: [reformat.cu::NCHWToNCHHW2::1038] Error Code 1: Cuda Runtime (invalid resource handle)
I am assuming that moving my audio data to cuda:0 somehow messes up the memory internally, but I am not sure how to properly check that. I also tried different models and changed the workspace size while converting my ONNX model to a TRT engine, thinking I might be running out of memory, but according to my tests that wasn't the case.
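To make the question concrete, this is the kind of check I would like to do to see whether PyTorch and pycuda end up with different CUDA contexts (a minimal sketch using pycuda.driver.Context.get_current() and torch.cuda.is_initialized(); I am not sure this is even the right way to inspect it):

import pycuda.driver as cuda
import pycuda.autoinit  # the context the TRT buffers and stream are created in
import torch

# State right after pycuda.autoinit, before PyTorch touches the GPU
print("driver-API current context:", cuda.Context.get_current())
print("torch CUDA initialized?", torch.cuda.is_initialized())

# Moving a tensor with .to("cuda:0") makes PyTorch initialize its own CUDA state
x = torch.rand(4).to("cuda:0")
print("torch CUDA initialized?", torch.cuda.is_initialized())
print("driver-API current context:", cuda.Context.get_current())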