Description
I’m in the process of migrating from TensorRT 8.6 to 10.3. Following the migration guide in the documentation, I was able to get inference running on 10.3. However, I’m seeing a significant drop in output quality compared to 8.6 (I get wrong answers; inference time is not the issue), particularly when the dynamic input shapes change between calls.
I am currently working on a two-stage pipeline in which the two networks run in sequence: the output of the first network serves as the input to the second.
Has anyone encountered similar issues or could provide guidance on how to handle memory management in TensorRT 10.3 when using dynamic input shapes?
Any help would be greatly appreciated!
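For context, the per-call sequence I ended up with on 10.3 boils down to the sketch below (condensed from the files attached under Relevant Files; the engine, execution context, input array, and output shape are assumed to already exist, and the names are illustrative):

import numpy as np
import common_trt_10 as common  # helper file attached below

# Assumed to exist already: `engine` (trt.ICudaEngine), `context` (execution
# context created from it), a numpy array `input0`, and the expected
# `output0_shape` for this call (one input and one output assumed for brevity).
context.set_input_shape(engine.get_tensor_name(0), input0.shape)  # dynamic shape for this call
inputs, outputs, bindings, stream = common.allocate_buffers(
    engine, [input0.shape, output0_shape]
)
np.copyto(inputs[0].host, input0.ravel())  # stage the input in pinned host memory
out = common.do_inference_v3(context, engine, bindings, inputs, outputs, stream)
result = out[0].reshape(output0_shape).copy()  # copy out before the buffers are freed
common.free_buffers(inputs, outputs, stream)  # buffers are re-allocated on every call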
Environment
TensorRT Version: 10.3.0.30
GPU Type: NVIDIA Jetson Orin NX (16 GB RAM), aarch64
Nvidia Driver Version: JetPack 6.1
CUDA Version: 12.6.68
CUDNN Version: 9.3.0.75
Operating System + Version: Ubuntu 22.04
Python Version (if applicable): 3.10.12
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):
Relevant Files
common_trt_10.py:
import os
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
import ctypes
from typing import List, Tuple
def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))

def cuda_call(call):
    # cuda-python calls return a tuple of (error, result...); unwrap and check it.
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res
class HostDeviceMem:
    """Pairs a pinned host buffer with a device buffer of the same size."""

    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[: arr.size], arr.flat)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
def allocate_buffers(engine: trt.ICudaEngine, inputs_shape: List[Tuple[int]]):
    # Allocate host/device buffers for every I/O tensor, sized from the
    # caller-supplied shapes (given in engine tensor order) rather than the
    # engine's static shapes, so dynamic shapes can be handled.
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for shape, binding in zip(inputs_shape, tensor_names):
        size = trt.volume(shape)
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
        bindingMemory = HostDeviceMem(size, dtype)
        bindings.append(int(bindingMemory.device))
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)
    return inputs, outputs, bindings, stream
def free_buffers(
    inputs: List[HostDeviceMem],
    outputs: List[HostDeviceMem],
    stream: cudart.cudaStream_t,
):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))

def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        )
    )

def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )
    )
def _do_inference_base(inputs, outputs, stream, execute_async):
    # Transfer input data to the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [
        cuda_call(
            cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)
        )
        for inp in inputs
    ]
    # Run inference.
    execute_async()
    # Transfer predictions back to the host.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [
        cuda_call(
            cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)
        )
        for out in outputs
    ]
    # Synchronize the stream before handing back the host buffers.
    cuda_call(cudart.cudaStreamSynchronize(stream))
    return [out.host for out in outputs]

def do_inference_v3(context, engine, bindings, inputs, outputs, stream):
    def execute_async():
        # Set tensor addresses before executing
        for i in range(engine.num_io_tensors):
            context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
        context.execute_async_v3(stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, execute_async)
common_trt_8.py:
import os
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
import ctypes
from typing import List, Tuple
def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))

def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res
class HostDeviceMem:
    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[: arr.size], arr.flat)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
def allocate_buffers(engine: trt.ICudaEngine, inputs_shape: List[Tuple[int]]):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for shape, binding in zip(inputs_shape, tensor_names):
        size = trt.volume(shape)
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
        bindingMemory = HostDeviceMem(size, dtype)
        bindings.append(int(bindingMemory.device))
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)
    return inputs, outputs, bindings, stream

def free_buffers(
    inputs: List[HostDeviceMem],
    outputs: List[HostDeviceMem],
    stream: cudart.cudaStream_t,
):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))

def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        )
    )

def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )
    )
def _do_inference_base(inputs, outputs, stream, execute_async):
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [
        cuda_call(
            cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)
        )
        for inp in inputs
    ]
    execute_async()
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [
        cuda_call(
            cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)
        )
        for out in outputs
    ]
    cuda_call(cudart.cudaStreamSynchronize(stream))
    return [out.host for out in outputs]

def do_inference_v2(context, bindings, inputs, outputs, stream):
    def execute_async():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, execute_async)
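For clarity, the only functional difference between the two helper files is the execution call itself; buffer allocation, host/device copies, and stream synchronization are identical. Roughly:

# TensorRT 8.6 path (common_trt_8.py): pass the binding pointers directly
context.execute_async_v2(bindings=bindings, stream_handle=stream)

# TensorRT 10.3 path (common_trt_10.py): bind each address by tensor name, then execute
for i in range(engine.num_io_tensors):
    context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
context.execute_async_v3(stream_handle=stream)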
first model inference:
# Allocate buffers based on input shape
input_name = self.engine.get_tensor_name(0)
self.context.set_input_shape(input_name, input0.shape)
model_shapes = [input0.shape, output0.shape, output1.shape]
self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.engine, model_shapes)
# Run inference on input
np.copyto(self.inputs[0].host, input0.ravel())
out = common.do_inference_v3(self.context, self.engine, self.bindings, self.inputs, self.outputs, self.stream)
out0 = torch.from_numpy(out[0].reshape(output0.shape)).clone()
out1 = torch.from_numpy(out[1].reshape(output1.shape)).clone()
#out0 = torch.from_numpy(self.outputs[0].host.reshape(output0.shape).copy())
#out1 = torch.from_numpy(self.outputs[1].host.reshape(output1.shape).copy())
common.free_buffers(self.inputs, self.outputs, self.stream)
second model inference:
model_shapes = [
    input0.shape,
    input1.shape,
    input2.shape,
    input3.shape,
    input4.shape,
    input5.shape,
    output0.shape,
]
# Set each input tensor's shape on the context based on the incoming data
input_binding_index = self.engine.get_tensor_name(0)
self.context.set_input_shape(input_binding_index, input0.shape)
input_binding_index = self.engine.get_tensor_name(1)
self.context.set_input_shape(input_binding_index, input1.shape)
input_binding_index = self.engine.get_tensor_name(2)
self.context.set_input_shape(input_binding_index, input2.shape)
input_binding_index = self.engine.get_tensor_name(3)
self.context.set_input_shape(input_binding_index, input3.shape)
input_binding_index = self.engine.get_tensor_name(4)
self.context.set_input_shape(input_binding_index, input4.shape)
input_binding_index = self.engine.get_tensor_name(5)
self.context.set_input_shape(input_binding_index, input5.shape)
self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(
    self.engine, model_shapes
)
# Copy input data into the pinned host buffers
np.copyto(self.inputs[0].host, input0.ravel())
np.copyto(self.inputs[1].host, input1.ravel())
np.copyto(self.inputs[2].host, input2.ravel())
np.copyto(self.inputs[3].host, input3.ravel())
np.copyto(self.inputs[4].host, input4.ravel())
np.copyto(self.inputs[5].host, input5.ravel())
# Do inference
output = common.do_inference_v3(
self.context, self.engine, self.bindings, self.inputs, self.outputs, self.stream
)
# Post-process the model output
out0 = torch.from_numpy(
    self.outputs[0]
    .host.reshape(output0.shape)
    .copy()
)
# out0 = torch.from_numpy(output[0].reshape(output0.shape)).clone()
common.free_buffers(self.inputs, self.outputs, self.stream)