Engine Plan Inference on Jetson TX2

Hey,
thanks for the install advice, it works great.

Now I want to use the engine file to run some inference. Here is what I use (this is function.py, imported by the script further down):

import tensorrt as trt
import argparse
from onnx import ModelProto
import pycuda.driver as cuda
import numpy as np
import pycuda.autoinit

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
def build_engine(onnx_path, shape=[64, 1280, 1280, 3]):
    """
    Create the TensorRT engine from an ONNX file.
    Args:
       onnx_path : Path to the ONNX file.
       shape : Shape of the input of the ONNX model.
    """
    # 1 == 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH), i.e. an explicit-batch network
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = (1 << 16)  # 64 KB; something like 1 << 30 (1 GB) is more typical
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                # Surface parser errors instead of silently continuing
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    # Serialize the engine to a plan file on disk
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)
        print('Engine saved')

def load_engine(trt_runtime, engine_path):
    # Read the serialized plan file and deserialize it into an ICudaEngine
    with open(engine_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

 #########################################################################


def allocate_buffers(engine, batch_size, data_type):
    """
    Allocate host and device buffers for the engine's input and output.
    Args:
       engine : The deserialized TensorRT engine (not a path).
       batch_size : Batch size used for inference.
       data_type : Data type of the input and output, for example trt.float32.

    Output:
       h_input_1: Input buffer on the host (page-locked).
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host (page-locked).
       d_output: Output buffer on the device.
       stream: CUDA stream used for the copies and for inference.
    """
    # Determine dimensions and create page-locked memory buffers (which won't be swapped to disk) to hold host inputs/outputs.
    # NOTE: with an explicit-batch engine, get_binding_shape(0) already contains the batch dimension,
    # so multiplying by batch_size again may over-allocate.
    h_input_1 = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(data_type))
    h_output = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(data_type))
    # Allocate device memory for inputs and outputs.
    d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input_1, d_input_1, h_output, d_output, stream

def load_images_to_buffer(pics, pagelocked_buffer):
    # Flatten the preprocessed image(s) and copy them into the page-locked host buffer
    preprocessed = np.asarray(pics).ravel()
    np.copyto(pagelocked_buffer, preprocessed)

def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    Run inference on one batch.
    Args:
       engine : The deserialized TensorRT engine (not a path).
       pics_1 : Input images for the model.
       h_input_1: Input buffer on the host.
       d_input_1: Input buffer on the device.
       h_output: Output buffer on the host.
       d_output: Output buffer on the device.
       stream: CUDA stream.
       batch_size : Batch size used for inference.
       height: Height of the output image.
       width: Width of the output image.

    Output:
       The batch of output images.
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.
        context.profiler = trt.Profiler()
        # NOTE: execute() is the implicit-batch API; for an engine built from an
        # explicit-batch network, execute_v2(bindings=...) is the matching call.
        context.execute(batch_size=batch_size, bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        out = h_output.reshape((batch_size, -1, height, width))
        return out
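For completeness, the engine itself was built and saved beforehand roughly like this (just a sketch; 'model.onnx' stands in for my actual ONNX file):

# Build the engine once from the ONNX model, then reuse the serialized .trt file.
# 'model.onnx' is a placeholder name; the shape has to match the real model input.
engine = build_engine('model.onnx', shape=[64, 1280, 1280, 3])
save_engine(engine, 'rn50engine.trt')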

And here is trt_infer.py, the script that runs inference using the .trt file:

import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import pycuda.autoinit
import argparse
from onnx import ModelProto
from function import *
import sys

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER,'')
trt_runtime = trt.Runtime(TRT_LOGGER)
batch_size = 64
data_type = trt.float16
height = 1280
width = 1280

engine_path = sys.argv[1]

engine = load_engine(trt_runtime, engine_path)

h_input_1, d_input_1, h_output, d_output, stream = allocate_buffers(engine, batch_size, data_type)

# pics_1 is the preprocessed input image (the file passed as sys.argv[2]); see the sketch below
out = do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width)
print(out)
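Before the do_inference call I still have to turn the image passed as sys.argv[2] (439.jpg) into pics_1. Roughly, I do something along these lines (sketch only, assuming OpenCV; my exact preprocessing may differ):

import cv2  # assumption: OpenCV is used for loading and resizing

def preprocess(image_path, height, width, batch_size):
    # Load the image, resize it to the network input resolution and scale to [0, 1]
    img = cv2.imread(image_path)
    img = cv2.resize(img, (width, height)).astype(np.float32) / 255.0
    # Repeat the single image to fill the batch (NHWC layout, matching [64, 1280, 1280, 3])
    return np.repeat(img[np.newaxis, ...], batch_size, axis=0)

pics_1 = preprocess(sys.argv[2], height, width, batch_size)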

But when I do that I encounter this error:

(cv) nvidia@nvidia-desktop:~/test_trt$ python trt_infer.py rn50engine.trt 439.jpg
[TensorRT] ERROR: ../rtSafe/cuda/cudaActivationRunner.cpp (103) - Cudnn Error in execute: 3 (CUDNN_STATUS_BAD_PARAM)
[TensorRT] ERROR: FAILED_EXECUTION: std::exception
[TensorRT] ERROR: engine.cpp (179) - Cuda Error in ~ExecutionContext: 719 (unspecified launch failure)
[TensorRT] ERROR: INTERNAL_ERROR: std::exception
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::155, condition: cudnnDestroy(context.cudnn) failure.
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::165, condition: cudaEventDestroy(context.start) failure.
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::170, condition: cudaEventDestroy(context.stop) failure.
[TensorRT] ERROR: ../rtSafe/safeRuntime.cpp (32) - Cuda Error in free: 719 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
  what():  std::exception
Aborted (core dumped)

My problem is that it's not really clear to me what the correct workflow is to run an engine on an image or a video.
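To make the question concrete, this is the overall sequence I think I need (pseudocode of my current understanding, not working code):

# 1. Deserialize the .trt engine (load_engine)
# 2. Create an execution context
# 3. Allocate page-locked host buffers and device buffers for each binding (allocate_buffers)
# 4. Preprocess the image or video frame to the engine's input shape and dtype
# 5. Copy the host input to the device (memcpy_htod_async)
# 6. Run the context (execute / execute_v2 / execute_async)
# 7. Copy the device output back to the host (memcpy_dtoh_async) and synchronize the stream
# 8. Reshape / postprocess the host output; for video, repeat steps 4 to 7 per frame

Is that right, or am I missing a step?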

Thanks