I’d like to reopen the case for this issue, based on my last post from tensorrt-8-6-not-running-properly-on-orin-nx-with-jetpack-6.
The TRT engines generated on JetPack 6 achieve only about half the inference speed of the engines generated on JetPack 5.
- Benchmarked with the host's trtexec
- Batch size of 2, `--warmUp=100`, `--avgRuns=1000`
JetPack 5: [attached plot: QPS per model, FP16 and INT8]
JetPack 6: [attached plot: QPS per model, FP16 and INT8]
This is the code used to create the ONNX models:
```python
import torch
import timm
from timm.utils.model import reparameterize_model

model_names = [
    "fastvit_t8",
    "fastvit_t12",
    "fastvit_s12",
    "tiny_vit_5m_224",
    "tiny_vit_11m_224",
    "tiny_vit_21m_224",
    "mobilevit_xs",
    "mobilevit_s",
    "mobilevitv2_050",
    "repvit_m1_0",
    "repvit_m0_9",
    "mobileone_s0",
    "mobileone_s1",
    # other models are mobilesam/nanosam image encoder with 1024
]
resolutions = [[440, 800], [640, 640], [800, 800], [1024, 1024]]

for model_name in model_names:
    for res in resolutions:
        model = timm.create_model(model_name=model_name, pretrained=True, features_only=True).eval()
        x = torch.rand(2, 3, *res)  # batch size 2
        model = reparameterize_model(model)  # fuse branches for inference
        torch.onnx.export(
            model,
            x,
            f"{model_name}_{res[0]}_{res[1]}.onnx",
            export_params=True,
            opset_version=15,
            do_constant_folding=True,
        )
```
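Not part of the original pipeline, but a quick way to confirm the exports are valid before building engines (assuming `onnx` and `onnxruntime` are installed; the file name is just one of the exports above):

```python
import numpy as np
import onnx
import onnxruntime as ort

path = "fastvit_t8_440_800.onnx"  # one of the files exported above
onnx.checker.check_model(onnx.load(path))  # structural validation

sess = ort.InferenceSession(path, providers=["CPUExecutionProvider"])
x = np.random.rand(2, 3, 440, 800).astype(np.float32)  # matches the export shape
outs = sess.run(None, {sess.get_inputs()[0].name: x})
print([o.shape for o in outs])  # one output per feature level (features_only=True)
```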
I use the following trtexec command to generate the engines and profiles:
```
/usr/src/tensorrt/bin/trtexec --onnx=path_to_onnx_file.onnx --saveEngine=path_to_engine_fp16/int8.engine \
    --workspace=4096 --fp16 [--int8] --profilingVerbosity=detailed \
    --warmUp=100 --avgRuns=1000 --exportProfile=model_name_{precision}_profile.json \
    --exportLayerInfo=model_name_{precision}_layer.json
```
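Side note: on the TensorRT 8.6 build that ships with JetPack 6, trtexec warns that `--workspace` is deprecated in favor of `--memPoolSize=workspace:4096`. As far as I can tell the deprecated flag still takes effect, so it should not by itself explain the slowdown.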
This is the script I use to generate the plots (run in a notebook). QPS here is 1000 divided by the summed per-layer averageMs, i.e., engine invocations per second at batch size 2:
```python
import os
import json
from glob import glob

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

list_files = glob("*_profile.json")

fig, axs = plt.subplots(1, 2, sharex=True, figsize=(12, 6))
axs = axs.ravel()  # ensure a flat 1-D array of Axes

latency_data_fp16 = []
latency_data_int8 = []
for file in list_files:
    # File names follow the pattern {model_name}_{precision}_profile.json
    filename = os.path.basename(file)
    component = filename.split("_")
    model_name = "_".join(component[:-2])
    precision = component[-2]
    with open(file) as f:
        time_profile = json.load(f)
    # The first entry of the exported profile holds the run count;
    # the rest are per-layer records.
    layer_time_profile = [(layer["name"], layer["averageMs"]) for layer in time_profile[1:]]
    total_time = sum(x[1] for x in layer_time_profile)  # time in ms
    if precision == "fp16":
        latency_data_fp16.append((model_name, 1000 / total_time))
    else:
        latency_data_int8.append((model_name, 1000 / total_time))

latency_data_fp16 = sorted(latency_data_fp16, key=lambda x: x[1])
latency_data_int8 = sorted(latency_data_int8, key=lambda x: x[1])
names_fp16 = [d[0] for d in latency_data_fp16]
values_fp16 = [d[1] for d in latency_data_fp16]
names_int8 = [d[0] for d in latency_data_int8]
values_int8 = [d[1] for d in latency_data_int8]

axs[0].bar(names_fp16, values_fp16)
axs[0].tick_params(axis="x", rotation=90)
axs[0].title.set_text("FP16 Precision")
axs[0].set_ylabel("QPS")
axs[0].yaxis.grid(True)
# Major ticks every 5 QPS.
axs[0].yaxis.set_major_locator(MultipleLocator(5))

axs[1].bar(names_int8, values_int8)
axs[1].tick_params(axis="x", rotation=90)
axs[1].title.set_text("INT8 Precision")
axs[1].set_ylabel("QPS")
axs[1].yaxis.grid(True)
axs[1].yaxis.set_major_locator(MultipleLocator(5))

# Apply the same y-range to both panels (plt.ylim would only affect the last one).
for ax in axs:
    ax.set_ylim(1, 50)
plt.show()
```
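Since the regression should show up per-layer in these profiles, a layer-by-layer diff between a JetPack 5 profile and the corresponding JetPack 6 profile narrows down where the time goes. A minimal sketch (the two file names are hypothetical; layer names may not match exactly across versions because fusion decisions differ):

```python
import json

def load_layer_times(path):
    """Map layer name -> averageMs from a trtexec --exportProfile JSON."""
    with open(path) as f:
        profile = json.load(f)
    return {layer["name"]: layer["averageMs"] for layer in profile[1:]}

# Hypothetical file names; point these at the actual JetPack 5 / JetPack 6 profiles.
jp5 = load_layer_times("fastvit_t8_fp16_profile_jp5.json")
jp6 = load_layer_times("fastvit_t8_fp16_profile_jp6.json")

# Largest regressions first; layers fused differently only exist on one
# side, so diff the common names and report the unmatched totals separately.
common = sorted(jp5.keys() & jp6.keys(), key=lambda n: jp6[n] - jp5[n], reverse=True)
for name in common[:10]:
    print(f"{jp6[name] - jp5[name]:+.3f} ms  {name}")
only_jp5 = sum(jp5[n] for n in jp5.keys() - jp6.keys())
only_jp6 = sum(jp6[n] for n in jp6.keys() - jp5.keys())
print(f"unmatched layers: {only_jp5:.3f} ms (JP5 only) vs {only_jp6:.3f} ms (JP6 only)")
```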