Description
I’m encountering an error when trying to pass CUDA device data directly as TensorRT engine input. Specifically, I’m working with data that is already on the GPU (allocated via PyTorch), and I perform some preprocessing with torch.nn.functional.interpolate on the GPU before passing the result to TensorRT for inference.
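For context, the GPU-side preprocessing is roughly the following (the shapes, sizes, and interpolation mode here are illustrative placeholders, not my exact values):

import torch
import torch.nn.functional as F

# Illustrative only: a tensor already resident on the GPU, resized before inference.
frame = torch.rand(1, 3, 480, 640, device="cuda")
resized = F.interpolate(frame, size=(384, 512), mode="bilinear", align_corners=False)
resized = resized.contiguous()  # this CUDA tensor is what gets handed to the TensorRT wrapper below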
When I attempt to perform cuda.memcpy_dtod_async(...) using a stream (the same one I later pass to TensorRT, or any other) and then run inference, I get the following error:
IExecutionContext::enqueueV3: Error Code 1: Cuda Runtime (invalid resource handle)
I’m not sure whether this issue is caused by PyTorch interfering with TensorRT through its internal CUDA stream usage. However, everything works fine when I keep the input and all preprocessing on the CPU. Has anyone else encountered this, and is there an established way to safely interoperate PyTorch CUDA tensors with TensorRT execution?
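To clarify what I mean by PyTorch’s internal stream usage: the preprocessing kernels run on PyTorch’s current CUDA stream, which is not the PyCUDA stream my wrapper creates. A minimal illustration of that distinction, for reference only:

import torch

# PyTorch kernels such as interpolate are enqueued on PyTorch's current CUDA stream,
# which is a different stream from the PyCUDA stream created in the wrapper below.
torch_stream = torch.cuda.current_stream()
print("PyTorch stream handle:", torch_stream.cuda_stream)

# Waiting on that stream is one obvious safeguard before another stream touches the tensor.
torch_stream.synchronize()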
Environment
TensorRT Version: 10.3.x
GPU Type: NVIDIA Orin NX
Operating System + Version: Ubuntu 22.04
Relevant Files
Unfortunately, the codebase is not open-source, but the minimal reproducible logic is as follows:
import numpy as np
import pycuda.autoinit  # assumed here for CUDA context creation; the real code may set this up differently
import pycuda.driver as cuda
import tensorrt as trt
import torch


class TRTModule_neuflow(torch.nn.Module):  # nn.Module parent inferred from the original wrapper
    def __init__(self, engine=None, input_names=None, output_names=None):
        super(TRTModule_neuflow, self).__init__()
        self.engine = engine
        self.stream = cuda.Stream()  # PyCUDA stream used for both copies and inference
        self.context = engine.create_execution_context()
        self.input_names = input_names
        self.output_names = output_names
        self.input_buffers = {}
        self.output_buffers = {}
        self.device_buffers = {}
        self.input_buffers_test = {}
        tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]

        # Pre-allocate pinned host buffers and device buffers for the outputs and bind them.
        for i, tensor in enumerate(self.output_names):
            shape = self.context.get_tensor_shape(tensor)
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(tensor))
            nbytes = size * np.dtype(dtype).itemsize
            output_buffer = cuda.pagelocked_empty(size, dtype)
            output_memory = cuda.mem_alloc(nbytes)
            self.context.set_tensor_address(tensor, int(output_memory))
            self.output_buffers[tensor] = output_buffer   # record CPU buffer
            self.device_buffers[tensor] = output_memory   # record GPU buffer
    def forward(self, *inputs):
        engine = self.engine
        context = self.context
        # tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]

        # Bind a freshly allocated device buffer for each input on every call.
        for i, tensor in enumerate(self.input_names):
            shape = context.get_tensor_shape(tensor)
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(tensor))
            nbytes = size * np.dtype(dtype).itemsize
            if engine.get_tensor_mode(tensor) == trt.TensorIOMode.INPUT:
                context.set_input_shape(tensor, shape)
                input_buffer = inputs[i].contiguous()
                assert input_buffer.is_cuda, "input must be on CUDA"
                input_memory = cuda.mem_alloc(nbytes)
                context.set_tensor_address(tensor, int(input_memory))
                self.input_buffers[tensor] = input_buffer    # PyTorch CUDA tensor
                self.device_buffers[tensor] = input_memory   # record GPU buffer

        # Copy the PyTorch CUDA tensors into the TensorRT-bound buffers (device to device).
        for tensor in self.input_buffers:
            cuda.memcpy_dtod_async(
                int(self.device_buffers[tensor]),
                self.input_buffers[tensor].data_ptr(),
                self.input_buffers[tensor].numel() * self.input_buffers[tensor].element_size(),
                self.stream,
            )

        # Run inference.
        print("Stream handle:", self.stream.handle)
        context.execute_async_v3(stream_handle=self.stream.handle)  # fails with "invalid resource handle"
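For completeness, the call site looks roughly like this (the engine path, tensor names, and shapes are placeholders, not my real ones):

import tensorrt as trt
import torch
import torch.nn.functional as F

# Deserialize the engine (illustrative path and tensor names).
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("model.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

module = TRTModule_neuflow(engine, input_names=["input0"], output_names=["output0"])

frame = torch.rand(1, 3, 480, 640, device="cuda")
resized = F.interpolate(frame, size=(384, 512), mode="bilinear").contiguous()
module(resized)  # this call hits the enqueueV3 "invalid resource handle" error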