Description
I have a simple two-layer LSTMCell model followed by 4 dense layers for 4 outputs. I get the expected outputs with ONNX Runtime, so I proceeded with the TensorRT conversion using the ONNX parser. The model is converted to TensorRT successfully (I have attached the logs below), but I see different outputs for the same input, and I have no idea what is going wrong. Could it be that some op is not supported in TensorRT?
Environment
TensorRT Version: 8.4.3.1
GPU Type: RTX 2080Ti
Nvidia Driver Version: 460.73.01
CUDA Version: 11.2
CUDNN Version: 8
Operating System + Version: ubuntu 20.04
Python Version (if applicable):
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):
Relevant Files
ONNX model
model_static.onnx (98.1 KB)
script to convert to tensorrt
import os
import argparse
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit # noqa # pylint: disable=unused-import
import pycuda.driver as cuda
def parse_args(argv=None):
    """Parse command-line arguments for the ONNX -> TensorRT script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace: namespace with ``onnx_path`` and ``trt_path``.
    """
    parser = argparse.ArgumentParser(description='train_network')
    # The default must name an ``.onnx`` file (no trailing slash): convert()
    # rejects any path that does not look like an ONNX model.
    parser.add_argument('--onnx_path', dest='onnx_path', help='path to onnx model',
                        default="/opt/vineet-workspace/lstm_tracker/evaluater/model_static.onnx")
    parser.add_argument('--trt_path', dest='trt_path', help='path to trt engine',
                        default="/opt/vineet-workspace/lstm_tracker/evaluater/model_fp32.engine")
    return parser.parse_args(argv)
def convert(onnx_path, trt_engine_path, fp16=False):
    """Convert an ONNX model into a serialized TensorRT engine file.

    Args:
        onnx_path (str): path to the ``.onnx`` model file.
        trt_engine_path (str): base path of the engine file. ``.engine`` is
            rewritten to ``_fp16.engine`` or ``_fp32.engine`` depending on
            the precision that is actually used for the build.
        fp16 (bool): request FP16 precision. Only honoured when the builder
            reports fast FP16 support; otherwise an FP32 engine is built.

    Returns:
        The built engine, or ``None`` when the target engine file already
        exists (in which case nothing is rebuilt).

    Raises:
        FileNotFoundError: ``onnx_path`` does not exist.
        TypeError: ``onnx_path`` does not end with ``.onnx``.
        RuntimeError: the ONNX parser reported errors or the build failed.
    """
    # pylint: disable=no-member
    if not os.path.exists(onnx_path):
        raise FileNotFoundError(
            f"[Error] {onnx_path} does not exists.")
    # Check the extension rather than a substring so that e.g. a directory
    # called "my.onnx.models" is not mistaken for a model file.
    if not onnx_path.lower().endswith(".onnx"):
        raise TypeError(
            f"[Error] Expected onnx weight file, instead {onnx_path} is given."
        )

    # The network is created with an explicit batch dimension.
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    trt_logger = trt.Logger(trt.Logger.INFO)

    with trt.Builder(trt_logger) as builder, \
            builder.create_network(network_flags) as network, \
            trt.OnnxParser(network, trt_logger) as parser:
        config = builder.create_builder_config()
        config.max_workspace_size = 512 << 20  # 512 MiB
        builder.max_batch_size = 1

        if builder.platform_has_fast_fp16 and fp16:
            print("[INFO] Setting fp16 to true.")
            trt_engine_path = trt_engine_path.replace('.engine', '_fp16.engine')
            # set_flag() adds FP16 without clobbering the other builder flags
            # (assigning config.flags replaced the whole bitmask).
            config.set_flag(trt.BuilderFlag.FP16)
        else:
            trt_engine_path = trt_engine_path.replace('.engine', '_fp32.engine')

        if os.path.exists(trt_engine_path):
            print(f"{trt_engine_path} already exists.",
                  "if you wish to regenerate Please delete or change trt_path with --trt_path \"your_engine_file_path.engine\"")
            return None

        with open(onnx_path, 'rb') as onnx_file:
            parsed = parser.parse(onnx_file.read())
        if not parsed:
            # Building from a partially parsed network would fail later or
            # produce a meaningless engine, so stop here.
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            raise RuntimeError(
                f"[Error] Failed to parse ONNX model {onnx_path}.")
        print(network.get_input(0).shape)

        # NOTE(review): no optimization profile is added; presumably the model
        # has static input shapes (e.g. (1, 10, 13)) -- confirm.
        engine = builder.build_engine(network, config)
        if engine is None:
            raise RuntimeError(
                f"[Error] TensorRT failed to build an engine from {onnx_path}.")

        with open(trt_engine_path, 'wb') as trt_engine_file:
            trt_engine_file.write(engine.serialize())
        print("[INFO] Engine serialized and saved !")
        return engine
class LSTMTrackerTensorRT:
    """Run inference with a serialized TensorRT engine of the LSTM tracker.

    The constructor creates a CUDA context on device 0, deserializes the
    engine, creates an execution context and allocates one page-locked host
    buffer plus one device buffer per engine binding. Calling the instance
    runs one inference; ``destroy()`` releases the CUDA context.
    """

    # Only input shape accepted by __call__.
    INPUT_SHAPE = (1, 10, 13)

    def __init__(self, trt_path, is_fp16=False):
        """Load the engine at ``trt_path`` and allocate its buffers.

        Args:
            trt_path (str): path to the serialized engine file.
            is_fp16 (bool): stored on the instance; not otherwise used by
                this class.

        Raises:
            FileNotFoundError: the engine file does not exist.
            RuntimeError: deserialization or context creation failed.
        """
        # make_context() also makes the new context current, so the setup
        # below runs inside it.
        self._ctx = cuda.Device(0).make_context()
        try:
            self._logger = trt.Logger(trt.Logger.INFO)
            self._stream = cuda.Stream()
            self._is_fp16 = is_fp16
            self.trt_engine_path = trt_path
            # Engine-related attributes, filled in by the helpers below.
            self._engine = None
            self._context = None
            self._inputs = None
            self._outputs = None
            self._bindings = None
            self._load_model(trt_path)
            self._allocate_buffers()
        except Exception:
            # Do not leave a half-initialised context on the context stack.
            self._ctx.pop()
            raise

    def _deserialize_engine(self, trt_engine_path):
        """Deserialize a TensorRT engine from disk.

        Args:
            trt_engine_path (str): path to engine file

        Returns:
            trt.tensorrt.ICudaEngine: deserialized engine
        """
        with open(trt_engine_path, 'rb') as engine_file:
            with trt.Runtime(self._logger) as runtime:
                engine = runtime.deserialize_cuda_engine(engine_file.read())
        return engine

    def _allocate_buffers(self) -> None:
        """Allocate a host/device buffer pair for every engine binding.

        Fills ``self._inputs`` / ``self._outputs`` with
        ``{'host': ..., 'device': ...}`` dicts and ``self._bindings`` with the
        device addresses in binding order.
        """
        inputs, outputs, bindings = [], [], []
        for binding in self._engine:
            size = trt.volume(self._engine.get_binding_shape(binding))
            dtype = trt.nptype(self._engine.get_binding_dtype(binding))
            # Flat page-locked host buffer and a device buffer of equal size.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            buffers = {'host': host_mem, 'device': device_mem}
            if self._engine.binding_is_input(binding):
                inputs.append(buffers)
            else:
                outputs.append(buffers)
        self._inputs = inputs
        self._outputs = outputs
        self._bindings = bindings

    def _load_model(self, engine_path):
        """Deserialize the engine and create its execution context.

        Raises:
            FileNotFoundError: ``engine_path`` does not exist.
            RuntimeError: the engine or the execution context is falsy.
        """
        print("[INFO] Deserializing TensorRT engine ...")
        if not os.path.exists(engine_path):
            raise FileNotFoundError(f"[Error]TensorRT engine does not exist {engine_path}.")
        self._engine = self._deserialize_engine(engine_path)
        if not self._engine:
            raise RuntimeError("[Error] Couldn't deserialize engine successfully !")
        self._context = self._engine.create_execution_context()
        if not self._context:
            raise RuntimeError(
                "[Error] Couldn't create execution context from engine successfully !")

    def __call__(self, inputs):
        """Run one inference.

        Args:
            inputs: array of shape ``(1, 10, 13)``; arrays with fewer than
                three dimensions get a leading axis added first.

        Returns:
            tuple: ``(outputs, elapsed)`` where ``outputs`` is a list with one
            flat array per output binding and ``elapsed`` is the wall-clock
            time in seconds for upload, execution and download.

        Raises:
            ValueError: the input does not have shape ``(1, 10, 13)``.
        """
        inputs = np.asarray(inputs)
        if inputs.ndim < 3:
            inputs = np.expand_dims(inputs, axis=0)
        if inputs.shape != self.INPUT_SHAPE:
            raise ValueError(f"[Error] Expected inputs with shape {self.INPUT_SHAPE}. "
                             f"Instead got {inputs.shape}.")

        # Copy into the existing page-locked buffer (instead of replacing it)
        # and cast to the binding dtype so the bytes uploaded match what the
        # engine expects, whatever dtype/layout the caller passed.
        host_in = self._inputs[0]['host']
        np.copyto(host_in, inputs.astype(host_in.dtype, copy=False).ravel())

        # Make our CUDA context current for this call; the matching pop() in
        # ``finally`` keeps the context stack balanced across repeated calls.
        self._ctx.push()
        try:
            start = time.time()
            cuda.memcpy_htod_async(
                self._inputs[0]['device'], host_in, self._stream)
            self._context.execute_async_v2(bindings=self._bindings,
                                           stream_handle=self._stream.handle)
            for out in self._outputs:
                cuda.memcpy_dtoh_async(out['host'], out['device'], self._stream)
            # All calls above are asynchronous: stop the clock only once the
            # stream has drained, otherwise the GPU work is not measured.
            self._stream.synchronize()
            elapsed = time.time() - start
        finally:
            self._ctx.pop()
        # Return copies so a later call cannot overwrite earlier results.
        return [out['host'].copy() for out in self._outputs], elapsed

    def postprocess(self, outputs):
        """Placeholder; currently does nothing."""
        pass

    def destroy(self):
        """Pop the CUDA context from the stack, if possible (best effort)."""
        try:
            self._ctx.pop()
        except Exception as exception:  # best-effort cleanup, never raise
            print(f"[WARN] Could not pop CUDA context: {exception}")
if __name__ == "__main__":
    # Build the engine (skipped when the engine file already exists), then
    # run one inference on a fixed sample and print the stacked outputs.
    args = parse_args()
    convert(onnx_path=args.onnx_path, trt_engine_path='model.engine')
    # convert() rewrites 'model.engine' to 'model_fp32.engine' for FP32.
    lstm_trt = LSTMTrackerTensorRT('./model_fp32.engine')
    x = np.array([
        [0.97263074, 0.51486486, 0.05135135, 0.00735294, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.96936274, 0.51351351, 0.04864865, 0.0122549, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.96486926, 0.50810808, 0.03783784, 0.01470588, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.96119279, 0.51351351, 0.04864865, 0.02859477, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.95710784, 0.50810808, 0.03783784, 0.03022876, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.95710784, 0.51756757, 0.05675676, 0.04166667, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.95465684, 0.51891893, 0.05945946, 0.04820262, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.94934642, 0.51081079, 0.04324324, 0.04901961, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.95629084, 0.54459459, 0.11081081, 0.0759804, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0.95343137, 0.54459459, 0.11081081, 0.08169935, 0., 1., 0., 0., 0., 0., 0., 0., 0.],
    ])
    x = np.ascontiguousarray(x, dtype=np.float32)
    try:
        outputs = lstm_trt(x)
    finally:
        # Release the CUDA context even when inference raises.
        lstm_trt.destroy()
    # outputs is (list_of_output_arrays, elapsed_seconds); stack the arrays.
    pred = np.array(outputs[0])
    predictions = np.expand_dims(pred, axis=0)
    print(predictions)
output with onnxruntime & original tensorflow model
[[[-3.8089973e+01 1.0527807e+00 -1.1312099e+01 -4.7424686e+01
-1.6849737e+01 -1.8623383e+00 -2.0075350e-03 -7.0134908e-02
7.3413447e-02 -8.2136042e-02]
[-3.0445314e+01 1.3442774e+01 -1.9049225e+01 -5.7058174e+01
-3.4950836e+01 1.4261748e-01 5.4409849e-03 9.4003871e-02
1.0161684e+00 1.0353836e+00]
[-3.1522646e+01 9.7168951e+00 -8.3950176e+00 -3.7055126e+01
-5.4838821e+01 3.2517239e-01 -5.6228298e-04 -7.6879263e-02
-1.4470614e+00 -8.8311404e-01]
[-3.3481293e+01 1.5044321e+01 -1.2752264e+01 -3.4801022e+01
-3.4844067e+01 1.0599079e+00 1.6894076e-02 -1.9138572e-01
-1.6254437e+00 5.1925839e-03]]]
output with tensorrt inference script.
[[[ 5.54971956e-03 2.59425223e-01 -1.08758863e-02 5.15959859e-01
-2.51265216e+00 -1.77742504e-02 1.25943532e-03 -8.10287893e-02
7.69704580e-02 0.00000000e+00]
[-2.47224450e+00 -1.42135555e-02 -3.17309201e-01 2.47923434e-01
2.52945334e-01 0.00000000e+00 3.70394462e-03 1.72122329e-01
1.76020004e-02 1.38143199e-02]
[-2.19900131e+00 2.62142438e-02 3.06411609e-02 -2.21815658e+00
-2.19525838e+00 0.00000000e+00 -3.10053700e-04 5.26212975e-02
0.00000000e+00 0.00000000e+00]
[ 1.04971834e-01 5.76606728e-02 2.09427029e-01 -2.57200122e+00
6.33783340e-01 3.11132912e-02 -1.61380402e-03 2.08504926e-02
0.00000000e+00 -3.66387353e-03]]]
Steps To Reproduce
- To build the engine and run inference with TensorRT, just run the script given above.
I have seen that people have faced this issue with LSTMs in the past. I am hoping to get some pointers on how to take this forward.
Thanks in advance.