Migration from TensorRT 8 to 10

Description

I’m in the process of migrating from TensorRT 8.6 to 10.3. Following the migration guide in the documentation, I was able to get inference working on 10.3. However, I’m seeing a significant drop in output quality compared to 8.6 (I get wrong answers; inference time is not the issue), particularly when the dynamic input shapes change between calls.
I am working on a two-stage pipeline in which the output of the first network serves as the input to the second network; the two networks run in sequence.
Has anyone encountered similar issues, or can anyone provide guidance on how to handle memory management in TensorRT 10.3 when using dynamic input shapes?
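For reference, here is a minimal sketch of how I currently understand per-inference buffer sizing is supposed to work with dynamic shapes in TensorRT 10. The function name, tensor names, and shapes are placeholders (not my real model); my actual allocation code is in the files below.

import numpy as np
import tensorrt as trt


def query_io_buffer_sizes(context, engine, input_shapes: dict):
    # Propagate the dynamic input shapes to the execution context first.
    for name, shape in input_shapes.items():
        context.set_input_shape(name, shape)

    # Once the input shapes are set, the context can report the concrete
    # shape of every I/O tensor, including outputs that depend on the inputs.
    sizes = {}
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        shape = context.get_tensor_shape(name)
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(name)))
        sizes[name] = trt.volume(shape) * dtype.itemsize
    return sizes

In my code below I instead pass the expected shapes in from the caller (see allocate_buffers), so I may well be sizing or ordering the buffers incorrectly when the shapes change.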

Any help would be greatly appreciated!

Environment

TensorRT Version: 10.3.0.30
GPU Type: NVIDIA Jetson Orin NX (16 GB RAM), aarch64
Nvidia Driver Version: JetPack 6.1
CUDA Version: 12.6.68
CUDNN Version: 9.3.0.75
Operating System + Version: Ubuntu 22.04
Python Version (if applicable): 3.10.12
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):

Relevant Files

common_trt_10.py:

import os
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
import ctypes
from typing import List, Tuple


def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res


class HostDeviceMem:
    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[: arr.size], arr.flat)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))


def allocate_buffers(engine: trt.ICudaEngine, inputs_shape: List[Tuple[int]]):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    # NOTE: inputs_shape must be ordered the same way as the engine's I/O tensors.
    for shape, binding in zip(inputs_shape, tensor_names):
        size = trt.volume(shape)
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))

        bindingMemory = HostDeviceMem(size, dtype)
        bindings.append(int(bindingMemory.device))
        

        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)

    return inputs, outputs, bindings, stream


def free_buffers(
    inputs: List[HostDeviceMem],
    outputs: List[HostDeviceMem],
    stream: cudart.cudaStream_t,
):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))


def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        )
    )


def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )
    )


def _do_inference_base(inputs, outputs, stream, execute_async):
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [
        cuda_call(
            cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)
        )
        for inp in inputs
    ]
    execute_async()
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [
        cuda_call(
            cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)
        )
        for out in outputs
    ]
    cuda_call(cudart.cudaStreamSynchronize(stream))
    return [out.host for out in outputs]


def do_inference_v3(context, engine, bindings, inputs, outputs, stream):
    def execute_async():
        # Set tensor addresses before executing
        for i in range(engine.num_io_tensors):
            context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
        context.execute_async_v3(stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, execute_async)

common_trt_8.py:

import os
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
import ctypes
from typing import List, Tuple


def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res


class HostDeviceMem:
    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[: arr.size], arr.flat)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))


def allocate_buffers(engine: trt.ICudaEngine, inputs_shape: List[Tuple[int]]):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for shape, binding in zip(inputs_shape, tensor_names):
        size = trt.volume(shape)
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))

        bindingMemory = HostDeviceMem(size, dtype)
        bindings.append(int(bindingMemory.device))

        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)

    return inputs, outputs, bindings, stream


def free_buffers(
    inputs: List[HostDeviceMem],
    outputs: List[HostDeviceMem],
    stream: cudart.cudaStream_t,
):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))


def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
        )
    )


def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(
        cudart.cudaMemcpy(
            host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
        )
    )


def _do_inference_base(inputs, outputs, stream, execute_async):
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [
        cuda_call(
            cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)
        )
        for inp in inputs
    ]
    execute_async()
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [
        cuda_call(
            cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)
        )
        for out in outputs
    ]
    cuda_call(cudart.cudaStreamSynchronize(stream))
    return [out.host for out in outputs]


def do_inference_v2(context, bindings, inputs, outputs, stream):
    def execute_async():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)

    return _do_inference_base(inputs, outputs, stream, execute_async)

first model inference:

        # Allocate buffers based on input shape
        input_name = self.engine.get_tensor_name(0)
        self.context.set_input_shape(input_name, input0.shape)

        model_shapes = [input0.shape, output0.shape, output1.shape]
        self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(self.engine, model_shapes)

        # Run inference on input
        np.copyto(self.inputs[0].host, input0.ravel())
        out = common.do_inference_v3(self.context, self.engine, self.bindings, self.inputs, self.outputs, self.stream)
        out0 = torch.from_numpy(out[0].reshape(output0.shape).copy())
        out1 = torch.from_numpy(out[1].reshape(output1.shape).copy())
        #out0 = torch.from_numpy(self.outputs[0].host.reshape(output0.shape).copy())
        #out1 = torch.from_numpy(self.outputs[1].host.reshape(output1.shape).copy())
        common.free_buffers(self.inputs, self.outputs, self.stream)

second model inference:

        model_shapes = [
            input0.shape,
            input1.shape,
            input2.shape,
            input3.shape,
            input4.shape,
            input5.shape,
            output0.shape,
        ]
        # Set the context input shapes based on the incoming tensors
        for i, inp in enumerate([input0, input1, input2, input3, input4, input5]):
            input_name = self.engine.get_tensor_name(i)
            self.context.set_input_shape(input_name, inp.shape)
        self.inputs, self.outputs, self.bindings, self.stream = common.allocate_buffers(
            self.engine, model_shapes
        )
        
        # Copy the input data into the pinned host buffers
        for i, inp in enumerate([input0, input1, input2, input3, input4, input5]):
            np.copyto(self.inputs[i].host, inp.ravel())

        # Do inference
        output = common.do_inference_v3(
            self.context, self.engine, self.bindings, self.inputs, self.outputs, self.stream
        )
        # Post-process the model output
        out0 = torch.from_numpy(self.outputs[0].host.reshape(output0.shape).copy())
        #out0 = torch.from_numpy(output[0].reshape(output0.shape).copy())
        common.free_buffers(self.inputs, self.outputs, self.stream)

Hi @octagpt01,
Could you please share your model with us?

Thanks