How to predict with (.trt) model

Hi , I did the following conversion:
keras(.h5) ----> tensorflow(.pb) ----> onnx ----> tensorrt(.trt)
I haven’t seen an obvious way to run prediction with a TensorRT model (.trt), and my pre-trained model is a multi-view CNN, so I have multiple inputs for each prediction.
Do you have any code example that shows me how to predict in this case with trt model ?(I’m using jetson nano)
thanks for your attention .

Environment

TensorRT Version : 7.1.3.0
CUDA Version : 10.2.89
CUDNN Version : 8.0.0.180
Operating System + Version : ubuntu 18.04
Python Version (if applicable) : 3.6.9
TensorFlow Version (if applicable) : 2.3.1

Hi,

You can run inference with a TensorRT model using the sample below.

inference.py

import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

# Flag value for creating an explicit-batch network with the TensorRT builder API.
# NOTE(review): not referenced anywhere in this script (the engine is built
# externally with trtexec) — presumably left over from the original sample.
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# Shared logger; INFO level prints engine/runtime diagnostics to stdout.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# Module-level runtime used to deserialize the serialized engine plan.
runtime = trt.Runtime(TRT_LOGGER)

# Buffer registries populated by PrepareEngine() and consumed by Inference().
# For a multi-input model there is one entry per binding, in engine binding order.
host_inputs  = []   # page-locked (pinned) host arrays for input bindings
cuda_inputs  = []   # device allocations mirroring host_inputs
host_outputs = []   # page-locked host arrays for output bindings
cuda_outputs = []   # device allocations mirroring host_outputs
bindings = []       # device pointers (as ints) in binding order, passed to execute


def Inference(engine):
    """Run one inference pass of `engine` on the bundled ResNet-50 sample image.

    Relies on the module-level buffers (host_inputs, cuda_inputs, host_outputs,
    cuda_outputs, bindings) having been populated by PrepareEngine().

    Args:
        engine: a deserialized trt.ICudaEngine.

    Raises:
        FileNotFoundError: if the sample image cannot be read.
    """
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    if image is None:
        # cv2.imread returns None instead of raising; fail with a clear message
        # rather than an AttributeError on the transpose below.
        raise FileNotFoundError(
            "Could not read /usr/src/tensorrt/data/resnet50/airliner.ppm")
    # HWC (OpenCV) -> CHW, then scale pixel values from [0, 255] to [-1, 1].
    # Cast to float32 explicitly to match the page-locked host buffer dtype
    # (previously this relied on an implicit float64 -> float32 copy).
    image = ((2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0).astype(np.float32)

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    # The example engine is built from ONNX with trtexec, which in TensorRT 7
    # always produces an explicit-batch engine; execute_async_v2 is the correct
    # entry point for such engines (execute_async is the implicit-batch API).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times " + str(time.time() - start_time))

    # Binding 1 is assumed to be the (single) output binding of this model.
    # For implicit-batch shapes the batch dim is prepended; for explicit-batch
    # shapes this adds a leading 1, which is harmless for the argmax below.
    output = host_outputs[0].reshape(np.concatenate(([1], engine.get_binding_shape(1))))
    # Class index with the highest score (ImageNet id; 404 == airliner).
    print(np.argmax(output))


def PrepareEngine():
    """Deserialize ./trt.plan and allocate host/device buffers for every binding.

    Fills the module-level lists (host_inputs, cuda_inputs, host_outputs,
    cuda_outputs, bindings) in engine binding order, so multi-input models get
    one entry per input binding automatically.

    Returns:
        The deserialized trt.ICudaEngine.

    Raises:
        RuntimeError: if the plan file cannot be deserialized.
    """
    # Reuse the module-level runtime; the previous local `runtime` shadowed it
    # and created a second, redundant trt.Runtime.
    with open('./trt.plan', 'rb') as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        # deserialize_cuda_engine returns None on failure (wrong TRT version,
        # corrupt file, ...) instead of raising.
        raise RuntimeError("Failed to deserialize engine from './trt.plan'")

    # Allocate one pinned host buffer + one device buffer per binding.
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # Use the binding's declared dtype rather than assuming float32.
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(shape=[size], dtype=dtype)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    return engine


if __name__ == "__main__":
    # Deserialize the engine and allocate buffers once, then run inference.
    Inference(PrepareEngine())

For example:

$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/resnet50/ResNet50.onnx --saveEngine=trt.plan
$ python3 inference.py

For your use case (a multi-input model), PrepareEngine already allocates one buffer per input binding; you just need to copy each of your inputs into the corresponding buffer before executing.
Ex. host_inputs[0], host_inputs[1], host_inputs[2], …

Thanks.