ONNX and TRT outputs have a large gap

My ONNX model's output and the converted TRT model's output have a large gap.
The forum does not seem to allow me to attach the ONNX model file.
TensorRT 8.2, JetPack 4.6

./trtexec --onnx=*.onnx --saveEngine=*.trt --workspace=4096 --fp16 --verbose

Model link: nv_sample_model - Google Drive
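For a quick cross-check on the same random input, something like Polygraphy (if it is available; the model path here is a placeholder and flag names may differ between versions) can compare ONNX Runtime and TensorRT results directly:

polygraphy run model.onnx --onnxrt --trt --fp16 --atol 1e-3 --rtol 1e-3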

Dear @QQQQ,
May I know which platform you are using? If it is Jetson AGX Xavier, can you check with JetPack 5.1.3? Also, could you share the repro code and steps?

We cannot try a different JetPack version, since the device is already in production. I already shared the model; please run it with random input and you will easily see the gap between the two outputs. The platform is AGX Xavier.

def get_onnx_output(path, a):
    # Run the ONNX model with onnxruntime and return all of its outputs
    ort_sess = ort.InferenceSession(path)
    input_name = ort_sess.get_inputs()[0].name
    output_names = [i.name for i in ort_sess.get_outputs()]
    output = ort_sess.run(output_names, {
        input_name: a
    })
    return output

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem
    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
    def __repr__(self):
        return self.__str__()

class TrtModel:
    def __init__(self, engine_path, input_shape=[512,512], batch_size=1, max_batch_size=1, dtype=np.float32):
        self.input_shape = input_shape
        self.engine_path = engine_path
        self.batch_size = batch_size
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()
        # self.context.set_binding_shape(batch_size, (1, 512, 512))
    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding))
            # A dynamic leading dimension appears as -1 and makes the volume negative;
            # flip the sign and scale by the batch size
            if self.engine.get_binding_shape(binding)[0] == -1:
                size = -1 * size * self.batch_size
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    def __call__(self, x: np.ndarray, batch_size=1):
        x = x.astype(self.dtype)
        np.copyto(self.inputs[0].host, x.ravel())
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        self.context.execute_async(
            batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        self.stream.synchronize()
        return [out.host.reshape(batch_size, -1) for out in self.outputs]
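For reference (not in the original post), the TrtModel class above could be exercised like this; the engine path is a placeholder for the engine built with the trtexec command earlier, and the input matches the shape used elsewhere in the thread:

trt_model = TrtModel("model.trt")  # placeholder path to the serialized engine
trt_outputs = trt_model(np.random.random((1, 1, 512, 512)).astype(np.float32))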

class TensorRTInfer:
    """
    Implements inference for the TensorRT engine.
    """
    def __init__(self, engine_path, batch_size=4):
        """
        :param engine_path: The path to the serialized engine to load from disk.
        """
        # Load TRT engine
        self.logger = trt.Logger(trt.Logger.ERROR)
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        assert self.engine
        assert self.context
        # Setup I/O bindings
        self.inputs = []
        self.outputs = []
        self.allocations = []
        for i in range(self.engine.num_bindings):
            is_input = False
            if self.engine.binding_is_input(i):
                is_input = True
            name = self.engine.get_binding_name(i)
            dtype = self.engine.get_binding_dtype(i)
            shape = self.engine.get_binding_shape(i)
            self.batch_size = batch_size
            size = np.dtype(trt.nptype(dtype)).itemsize
            for s in shape:
                size *= s
            # Dynamic dimensions (-1) make the computed size negative; take the absolute value
            if size < 0:
                size = size * -1
            allocation = cuda.mem_alloc(size)
            binding = {
                'index': i,
                'name': name,
                'dtype': np.dtype(trt.nptype(dtype)),
                'shape': list(shape),
                'allocation': allocation,
            }
            self.allocations.append(allocation)
            if self.engine.binding_is_input(i):
                self.inputs.append(binding)
            else:
                self.outputs.append(binding)
        assert self.batch_size > 0
        assert len(self.inputs) > 0
        assert len(self.outputs) > 0
        assert len(self.allocations) > 0
    def input_spec(self):
        return self.inputs[0]['shape'], self.inputs[0]['dtype']
    def output_spec(self):
        return self.outputs[0]['shape'], self.outputs[0]['dtype']
    def infer(self, batch, top=1):
        # Prepare the output data
        output = np.zeros(*self.output_spec())
        # Process I/O and execute the network
        cuda.memcpy_htod(self.inputs[0]['allocation'], np.ascontiguousarray(batch))
        self.context.execute_v2(self.allocations)
        cuda.memcpy_dtoh(output, self.outputs[0]['allocation'])
        return output

a = np.random.random((1, 1, 512, 512)).astype(np.float32)

@SivaRamaKrishnaNV, any updates?

Dear @QQQQ,
Just checking whether this could be due to FP16 precision. What percentage difference do you see on average, and what percentage effect does it have on the final outcome?
Did you try FP32 and notice the same gap? Also, could you please share complete repro code for a quick test?
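For an FP32 check, the trtexec command from the first post can be rerun without the --fp16 flag (the engine name below is only illustrative):

./trtexec --onnx=*.onnx --saveEngine=*_fp32.trt --workspace=4096 --verbose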

# -*- coding: utf-8 -*-
import os
import pathlib
import sys

# import cv2
import onnxruntime as ort
import tensorrt as trt
import argparse

#######################
# export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
# pip install numpy==1.19.4
## trt python
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

a = np.random.random((1, 1, 512, 512)).astype(np.float32)

def get_onnx_output(path, a):
    ort_sess = ort.InferenceSession(path)
    input_name = ort_sess.get_inputs()[0].name
    output_names = [i.name for i in ort_sess.get_outputs()]
    output = ort_sess.run(output_names, {
        input_name: a
    })
    return output

class TensorRTInfer:
    """
    Implements inference for the TensorRT engine.
    """
    def __init__(self, engine_path, batch_size=4):
        """
        :param engine_path: The path to the serialized engine to load from disk.
        """
        # Load TRT engine
        self.logger = trt.Logger(trt.Logger.ERROR)
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        assert self.engine
        assert self.context
        # Setup I/O bindings
        self.inputs = []
        self.outputs = []
        self.allocations = []
        for i in range(self.engine.num_bindings):
            is_input = False
            if self.engine.binding_is_input(i):
                is_input = True
            name = self.engine.get_binding_name(i)
            dtype = self.engine.get_binding_dtype(i)
            shape = self.engine.get_binding_shape(i)
            self.batch_size = batch_size
            size = np.dtype(trt.nptype(dtype)).itemsize
            for s in shape:
                size *= s
            if size < 0:
                size = size * -1
            allocation = cuda.mem_alloc(size)
            binding = {
                'index': i,
                'name': name,
                'dtype': np.dtype(trt.nptype(dtype)),
                'shape': list(shape),
                'allocation': allocation,
            }
            self.allocations.append(allocation)
            if self.engine.binding_is_input(i):
                self.inputs.append(binding)
            else:
                self.outputs.append(binding)
        assert self.batch_size > 0
        assert len(self.inputs) > 0
        assert len(self.outputs) > 0
        assert len(self.allocations) > 0
    def input_spec(self):
        return self.inputs[0]['shape'], self.inputs[0]['dtype']
    def output_spec(self):
        return self.outputs[0]['shape'], self.outputs[0]['dtype']
    def infer(self, batch, top=1):
        # Prepare the output data
        output = np.zeros(*self.output_spec())
        # Process I/O and execute the network
        cuda.memcpy_htod(self.inputs[0]['allocation'], np.ascontiguousarray(batch))
        self.context.execute_v2(self.allocations)
        cuda.memcpy_dtoh(output, self.outputs[0]['allocation'])
        return output

op_onnx = get_onnx_output("onnx", a)

trt_engine = TensorRTInfer("trt")
op_trt = trt_engine.infer(a)
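A minimal sketch (not part of the original repro) of how the gap between the two outputs above could be quantified, assuming a single output tensor:

# Compare the ONNX Runtime and TensorRT outputs element-wise
onnx_out = np.asarray(op_onnx[0], dtype=np.float32)
trt_out = np.asarray(op_trt, dtype=np.float32).reshape(onnx_out.shape)
abs_diff = np.abs(onnx_out - trt_out)
rel_diff = abs_diff / (np.abs(onnx_out) + 1e-6)  # small epsilon avoids division by zero
print("max abs diff:", abs_diff.max())
print("max rel diff:", rel_diff.max())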

Please, I already shared the code; it is actually quite obvious, and the answer you are asking for is already there.
I am pretty sure I have already provided everything needed. Would you please at least try to download it and run it? It shouldn't be like this, waiting for a week and still being asked for the same things that were already provided.
@SivaRamaKrishnaNV

I verified the issue with your model on JetPack 5.1.3 (the last release for Jetson Xavier). When using FP32 precision, the ONNX and TensorRT output blobs match. But when the TRT FP16 engine is used for inference, I noticed a < 0.2% difference in the output blob values compared to the ONNX output. May I know how much difference this causes in the final task prediction in your case?

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.