How to save time by converting ONNX to TensorRT

Description

This is a very basic question. Every time I run the attached source code, it spends time converting the ONNX model into a TensorRT engine. How can I eliminate the time spent on this conversion? I am implementing it while referring to the sample source code below, but I can't work it out.

For example, can I load the model.trt generated by running trtexec instead of parsing the ONNX file every time?
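Something like the following is what I have in mind (just a sketch; the helper name is mine, the path is from my environment, and I am assuming model.trt is a serialized engine written by trtexec):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_prebuilt_engine(engine_path):
    # Read the serialized engine from disk and deserialize it directly,
    # skipping the ONNX parse and build step entirely.
    with open(engine_path, "rb") as f:
        engine_bytes = f.read()
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_bytes)

# e.g. engine = load_prebuilt_engine("/home/via/sandbox/python/segmentation/model.trt")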

Environment

TensorRT Version: 10.3.0.30
GPU Type: NVIDIA Jetson Orin NX 8GB (VIA AMOS-9100)
Nvidia Driver Version: JetPack 6.1?
CUDA Version: 12.6.68
CUDNN Version: 9.3.0.75
Operating System + Version: JetPack 6.1 [L4T 36.4.0]
Python Version (if applicable): 3.10.12
TensorFlow Version (if applicable): None
PyTorch Version (if applicable): None
Baremetal or Container (if container which image + tag): Baremetal

Relevant Files

import tensorrt as trt
import cv2
import numpy as np
import common

ENGINE_FILE_PATH = "/home/via/sandbox/python/segmentation/model.trt"
ONNX_FILE_PATH = "/home/via/sandbox/python/segmentation/model.onnx"
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class ModelData(object):
    MODEL_PATH = ONNX_FILE_PATH
    INPUT_SHAPE = (3, 288, 288)
    # We can convert TensorRT data types to numpy types with trt.nptype()
    DTYPE = trt.float32

# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(0)
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, common.GiB(1))
    # Load the Onnx model and parse it in order to populate the TensorRT network.
    with open(model_file, "rb") as model:
        if not parser.parse(model.read()):
            print("ERROR: Failed to parse the ONNX file.")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    engine_bytes = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_bytes)

def get_input_image_tensor():
    # PreProcess
    bgr_image = cv2.imread("./dog.jpg")
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    height, width, channel = rgb_image.shape

    size = min(height, width)
    top = int((height - size) / 2)
    left = int((width - size) / 2)
    bottom = top + size
    right = left + size
    crop_img = rgb_image[top:bottom, left:right]

    rgb_ds = cv2.resize(crop_img,(288, 288))
    rgb_nchw = np.transpose(rgb_ds, (2, 0, 1))
    rgb_nchw = (rgb_nchw / 128.0) - 1.0
    rgb_batch = rgb_nchw[np.newaxis,:]

    return rgb_batch

def main():
    onnx_model_file = ONNX_FILE_PATH
    engine = build_engine_onnx(onnx_model_file)
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    context = engine.create_execution_context()
    input_tensor = get_input_image_tensor()
    inputs[0].host = np.array(input_tensor, dtype='<f4')
    trt_outputs = common.do_inference(
        context,
        engine=engine,
        bindings=bindings,
        inputs=inputs,
        outputs=outputs,
        stream=stream,
    )
    print(trt_outputs)
    quit()

if __name__ == "__main__":
    main()

Steps To Reproduce

To reproduce the problem, unpack the above tar.gz file, move into “sandbox/python/segmentation”, and run “python trt_resnet.py”. common.py and common_runtime.py were copied from the sample source code below.

Since “build_serialized_network” returns the model converted from ONNX as byte data, I saved that data to a file and use it as a cache, as in the function below. Does this serialized data have a standard name or file extension?

import os  # needed for os.path.isfile() below

# The ONNX path is used for ONNX models; cache_file is the path of the serialized engine cache.
def build_engine_onnx(model_file, cache_file):
    is_file = os.path.isfile(cache_file)
    if is_file:
        print("cache exist:", cache_file)
        with open(cache_file, 'rb') as f:
            engine_bytes = f.read()
    else:
        print("cache not exist:", cache_file)
        print("generate TensorRT model from onnx:", model_file)
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(0)
        config = builder.create_builder_config()
        parser = trt.OnnxParser(network, TRT_LOGGER)

        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, common.GiB(1))
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, "rb") as model:
            if not parser.parse(model.read()):
                print("ERROR: Failed to parse the ONNX file.")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None

        engine_bytes = builder.build_serialized_network(network, config)
        with open(cache_file, 'wb') as f:
            f.write(engine_bytes)

    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(engine_bytes)
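With this change, main() only has to pass the cache path as well; everything after engine creation stays the same (a sketch reusing the constants already defined at the top of my script):

def main():
    # Reuse the cached engine if it exists; otherwise build it from ONNX and cache it.
    engine = build_engine_onnx(ONNX_FILE_PATH, ENGINE_FILE_PATH)
    # ... the rest of main() is unchanged ...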

I looked at the source code below and found that the appropriate extension for the serialized engine data is *.engine.
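So in my script only the cache constant at the top needs to change to match that convention:

ENGINE_FILE_PATH = "/home/via/sandbox/python/segmentation/model.engine"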

All my questions are answered.
Thank you.