I have two scripts. The first converts the ONNX model to a TensorRT engine:
#!/bin/bash
MODEL_PATH="$1"
OUTPUT_PATH="$2"
trtexec --onnx="$MODEL_PATH" --saveEngine="$OUTPUT_PATH" --skipInference
I add additional arguments (--fp16, --int8, etc.) manually, depending on the precision I am benchmarking.
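For the FP16 build, for example, the resulting command is just the same line with --fp16 appended (the paths here are placeholders):

trtexec --onnx=model.onnx --saveEngine=model_fp16.engine --fp16 --skipInference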
I then run inference with the following Python script:
import tensorrt as trt
import numpy as np
import argparse
import os
import time
from onnx_helper import ONNXClassifierWrapper
PRECISION = np.float32
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--engine_model', help='Path to compiled TensorRT model.')
parser.add_argument('-i', '--input_path', help='Path to data for inference to be run on.')
parser.add_argument('-is', '--input_shape', help='Expected shape of input data. Expected in the format [b,c,h,w]')
parser.add_argument('-os', '--output_shape', help='Expected shape of output data. Expected in the format [b,c,h,w]')
parser.add_argument('-o', '--output_path', help='Path to save model outputs to.')
parser.add_argument('-v', '--verbose', help='Display inference metrics for every inference.', action='store_true')
args = parser.parse_args()
output_shape = np.array(args.output_shape.strip('[]').split(',')).astype(int)
trt_model = ONNXClassifierWrapper(args.engine_model, output_shape, target_dtype=PRECISION)
input_shape = np.array(args.input_shape.strip('[]').split(',')).astype(int)
totalTime = 0
numfiles = 0
for filename in os.listdir(args.input_path):
    if filename.endswith(".bin"):
        with open(os.path.join(args.input_path, filename), 'rb') as input_file:
            image = np.fromfile(input_file, dtype=PRECISION).reshape(input_shape)
        # Time only the predict() call, not the file I/O
        start_time = time.time()
        output = trt_model.predict(image)
        end_time = time.time()
        inference_time_ms = (end_time - start_time) * 1000
        totalTime += inference_time_ms
        numfiles += 1
        if args.verbose:
            print(f'Inference time: {inference_time_ms:.2f} ms')
            fps = 1000 / inference_time_ms
            print(f"Throughput: {fps:.2f} FPS")
        with open(os.path.join(args.output_path, filename), "wb") as output_file:
            output_file.write(output)
avg_inference_time = totalTime/numfiles
print(f'Average inference time: {avg_inference_time:.2f} ms')
print(f'Average throughput: {1000/avg_inference_time:.1f} FPS')
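For context on what predict() is actually timing: ONNXClassifierWrapper is, as far as I know, the helper from NVIDIA's TensorRT quickstart notebooks (onnx_helper). Below is a minimal sketch of what a wrapper like that does internally, assuming the TensorRT 8.x bindings API with pycuda; the class name and variable names are illustrative, not the actual onnx_helper source.

import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

class EngineWrapperSketch:
    """Illustrative stand-in for ONNXClassifierWrapper (not the real onnx_helper code)."""

    def __init__(self, engine_path, output_shape, target_dtype=np.float32):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        self.output = np.empty(output_shape, dtype=target_dtype)  # host output buffer
        self.d_output = cuda.mem_alloc(self.output.nbytes)        # device output buffer
        self.d_input = None                                       # allocated lazily on first predict()
        self.stream = cuda.Stream()

    def predict(self, batch):
        if self.d_input is None:
            # The first call pays for this allocation, so it shows up in the first timing.
            self.d_input = cuda.mem_alloc(batch.nbytes)
        cuda.memcpy_htod_async(self.d_input, batch, self.stream)  # host -> device copy
        # Device pointers passed in binding order (input, output) for the 8.x API.
        self.context.execute_async_v2([int(self.d_input), int(self.d_output)],
                                      self.stream.handle)
        cuda.memcpy_dtoh_async(self.output, self.d_output, self.stream)  # device -> host copy
        self.stream.synchronize()
        return self.output

If the real wrapper also allocates its input buffer lazily like this, the first measured inference per model includes that allocation (plus CUDA warm-up), so the timings above would probably benefit from an untimed warm-up call before the loop.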
I should also mention that I have benchmarked nine different DNN architectures on the AGX Orin and the Orin Nano, and the Orin Nano has only outperformed the AGX Orin when running the UNet model.
Thanks again for your help.