Different inference results of onnxruntime and tensorrt models built on the same onnx model
onnxruntime works correctly but tensorrt does not.
TensorRT Version: 10.0.1
GPU Type: GeForce 2080Ti
Nvidia Driver Version: 552.22
CUDA Version: 12.2
Operating System + Version : Windows 10 22H2 19045.3448
Python Version: 3.10
onnxruntime version: 1.16.0
numpy version: 1.23.1
cupy version: 13.2.0
Relevant Files
Here:
mask_decoder_vit_l.onnx - onnx model
create_trt_engine.bat - create trt model from onnx model using trtexec
image_embeddings.npy, image_pe.npy, dense_embeddings.npy, sparse_embeddings.npy - input data
mask_decoder_vit_l.trt - trt model for my PC
run_example.py - script to reproduce the issue
Steps To Reproduce
- Create the TRT engine using trtexec:
  .\trtexec.exe --onnx=mask_decoder_vit_l.onnx --saveEngine=mask_decoder_vit_l.trt
- Run run_example.py
run_example.py:
import numpy as np
import cupy
import onnxruntime
import os
import tensorrt as trt

print(f"onnxruntime version: {onnxruntime.__version__}")
print(f"numpy version: {np.__version__}")
print(f"tensorrt version: {trt.__version__}")
print(f"cupy version: {cupy.__version__}")

work_folder = os.path.dirname(__file__)

# ================================================================
# Run "create_trt_engine.bat" before
# ================================================================

# Saved inputs for the SAM mask decoder.
image_embeddings = np.load(os.path.join(work_folder, "image_embeddings.npy"))
image_pe = np.load(os.path.join(work_folder, "image_pe.npy"))
sparse_embeddings = np.load(os.path.join(work_folder, "sparse_embeddings.npy"))
dense_embeddings = np.load(os.path.join(work_folder, "dense_embeddings.npy"))

# Reference run with onnxruntime.
session = onnxruntime.InferenceSession(
    os.path.join(work_folder, "mask_decoder_vit_l.onnx"),
    providers=['AzureExecutionProvider', 'CPUExecutionProvider'],
)
low_res_masks_onnx, iou_predictions_onnx = session.run(
    None,
    {
        "image_embeddings": image_embeddings,
        "image_pe": image_pe,
        "sparse_prompt_embeddings": sparse_embeddings,
        "dense_prompt_embeddings": dense_embeddings,
    },
)

# Deserialize the prebuilt TensorRT engine.
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open(os.path.join(work_folder, "mask_decoder_vit_l.trt"), "rb") as f:
    serialized_engine = f.read()
engine = runtime.deserialize_cuda_engine(serialized_engine)
context = engine.create_execution_context()


def _to_device(host_array):
    """Upload a host array to the GPU as contiguous float32.

    np.load can hand back float64 or non-contiguous data; TensorRT reads
    the raw device pointer with the dtype/layout the engine was built
    for, so any mismatch silently corrupts the results instead of
    raising an error.
    """
    return cupy.ascontiguousarray(cupy.asarray(host_array, dtype=cupy.float32))


# Device inputs, keyed by the engine's tensor names.
input_buffers = {
    "image_embeddings": _to_device(image_embeddings),
    "image_pe": _to_device(image_pe),
    "sparse_prompt_embeddings": _to_device(sparse_embeddings),
    "dense_prompt_embeddings": _to_device(dense_embeddings),
}

# Pre-allocated device outputs (shapes match the ONNX outputs).
low_res_masks_cu = cupy.zeros((1, 3, 256, 256), dtype=cupy.float32)
iou_predictions_cu = cupy.zeros((1, 3), dtype=cupy.float32)
# NOTE(review): assumes the engine enumerates its outputs in the order
# (low_res_masks, iou_predictions) -- confirm against the printout below.
output_buffers = [low_res_masks_cu, iou_predictions_cu]

# Bind every I/O tensor BY NAME rather than passing a flat pointer list
# to execute_v2: a positional bindings list assumes the engine's binding
# order matches the hand-written list order, which TensorRT does not
# guarantee.  TensorRT 10 recommends set_tensor_address +
# execute_async_v3 for the name-based tensor API.
print("IO tensors: ")
output_index = 0
for ii in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(ii)
    print(f"{ii}, Name: {tensor_name}, Mode: {engine.get_tensor_mode(tensor_name)}, "
          f"Shape: {engine.get_tensor_shape(tensor_name)}, "
          f"Format: {engine.get_tensor_format(tensor_name)}, "
          f"Dtype: {engine.get_tensor_dtype(tensor_name)}")
    if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
        context.set_tensor_address(tensor_name, input_buffers[tensor_name].data.ptr)
    else:
        context.set_tensor_address(tensor_name, output_buffers[output_index].data.ptr)
        output_index += 1

# Run inference on an explicit stream and wait for completion before
# reading the device outputs back to the host.
stream = cupy.cuda.Stream()
context.execute_async_v3(stream.ptr)
stream.synchronize()

# iou_predictions must be same
print(f"Tensorrt iou predict: {iou_predictions_cu}")
print(f"onnxruntime iou predict: {iou_predictions_onnx}")
# max mask difference must be ~ 0
print(f"Max mask difference: {np.max(np.abs(low_res_masks_cu.get() - low_res_masks_onnx))} ")
Please advise how I can make the ONNX and TensorRT models produce the same results.