Hey,
thanks for the install advice, it works great.
Now I want to use the engine file to do some inference and here is what I use:
import tensorrt as trt
import argparse
from onnx import ModelProto
import pycuda.driver as cuda
import numpy as np
import pycuda.autoinit
# Logger handed to the TensorRT builder, parser and runtime; constructed with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Module-level TensorRT runtime created with that logger.
trt_runtime = trt.Runtime(TRT_LOGGER)
def build_engine(onnx_path, shape=(64, 1280, 1280, 3)):
    """
    This is the function to create the TensorRT engine
    Args:
        onnx_path : Path to onnx_file.
        shape : Shape of the input of the ONNX file.
    Returns:
        Whatever ``builder.build_cuda_engine`` returns for the parsed network.
    Raises:
        RuntimeError: If the ONNX parser reports that parsing failed.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(1) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        # NOTE(review): 1 << 16 bytes is a very small workspace; confirm it is
        # enough for the layers of the model being built.
        builder.max_workspace_size = (1 << 16)
        with open(onnx_path, 'rb') as model:
            parsed_ok = parser.parse(model.read())
        if not parsed_ok:
            # Surface the parser's own diagnostics instead of continuing with
            # a network that was not populated.
            details = "; ".join(
                parser.get_error(idx).desc() for idx in range(parser.num_errors)
            )
            raise RuntimeError(
                "Failed to parse ONNX file {}: {}".format(onnx_path, details)
            )
        # The default is an immutable tuple; hand the network a fresh list so
        # it receives the same type as before.
        network.get_input(0).shape = list(shape)
        engine = builder.build_cuda_engine(network)
        return engine
def save_engine(engine, file_name):
    """
    Serialize the engine and write the result to disk.
    Args:
        engine : Object exposing ``serialize()``.
        file_name : Destination path, opened in binary write mode.
    """
    # Serialize before touching the file so a failure leaves the path untouched.
    serialized = engine.serialize()
    with open(file_name, 'wb') as engine_file:
        engine_file.write(serialized)
    print('Engine saved')
def load_engine(trt_runtime, engine_path):
    """
    Read a serialized engine from disk and deserialize it.
    Args:
        trt_runtime : Object exposing ``deserialize_cuda_engine``.
        engine_path : Path of the serialized engine file, read in binary mode.
    Output:
        The value returned by ``trt_runtime.deserialize_cuda_engine``.
    """
    with open(engine_path, 'rb') as engine_file:
        serialized = engine_file.read()
    return trt_runtime.deserialize_cuda_engine(serialized)
#########################################################################
def allocate_buffers(engine, batch_size, data_type):
    """
    Allocate host and device buffers for bindings 0 and 1 of an engine.
    Args:
        engine : Engine object; its ``get_binding_shape`` is queried for bindings 0 and 1.
        batch_size : The batch size for execution time.
        data_type : The type of the data for input and output, for example trt.float32.
    Output:
        h_input_1: Input in the host.
        d_input_1: Input in the device.
        h_output: Output in the host.
        d_output: Output in the device.
        stream: CUDA stream.
    """
    host_dtype = trt.nptype(data_type)
    # Element counts are the binding volume multiplied by batch_size.
    # NOTE(review): this assumes the binding shapes do not already include the
    # batch dimension -- confirm against how the engine was built.
    input_elements = batch_size * trt.volume(engine.get_binding_shape(0))
    output_elements = batch_size * trt.volume(engine.get_binding_shape(1))
    # Page-locked host memory for the input and the output.
    h_input_1 = cuda.pagelocked_empty(input_elements, dtype=host_dtype)
    h_output = cuda.pagelocked_empty(output_elements, dtype=host_dtype)
    # Device allocations sized to match the host buffers byte for byte.
    d_input_1 = cuda.mem_alloc(h_input_1.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Stream on which the copies (and inference) are queued.
    stream = cuda.Stream()
    return h_input_1, d_input_1, h_output, d_output, stream
def load_images_to_buffer(pics, pagelocked_buffer):
    """
    Flatten the input images and copy them into an existing host buffer.
    Args:
        pics : Array-like input; converted with ``np.asarray`` and flattened.
        pagelocked_buffer : Destination NumPy array, modified in place.
    Raises:
        ValueError: If the flattened input does not have exactly as many
            elements as the destination buffer.
    """
    preprocessed = np.asarray(pics).ravel()
    # np.copyto would silently broadcast a one-element input over the whole
    # buffer, so reject any element-count mismatch up front.
    if preprocessed.size != pagelocked_buffer.size:
        raise ValueError(
            "input has {} elements but the buffer holds {}".format(
                preprocessed.size, pagelocked_buffer.size
            )
        )
    np.copyto(pagelocked_buffer, preprocessed)
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):
    """
    This is the function to run the inference
    Args:
        engine : TensorRT engine object (an execution context is created from it).
        pics_1 : Input images to the model.
        h_input_1: Input in the host
        d_input_1: Input in the device
        h_output: Output in the host
        d_output: Output in the device
        stream: CUDA stream
        batch_size : Batch size for execution time
        height: Height of the output image
        width: Width of the output image
    Output:
        The host output buffer reshaped to (batch_size, -1, height, width).
    Raises:
        RuntimeError: If TensorRT reports that execution failed.
    """
    load_images_to_buffer(pics_1, h_input_1)
    with engine.create_execution_context() as context:
        context.profiler = trt.Profiler()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)
        # Run inference on the same stream as the copies so the three steps
        # (upload, execute, download) are ordered with respect to each other.
        succeeded = context.execute_async(
            batch_size=batch_size,
            bindings=[int(d_input_1), int(d_output)],
            stream_handle=stream.handle,
        )
        if not succeeded:
            raise RuntimeError("TensorRT execution failed")
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Wait for all queued work before the context is released.
        stream.synchronize()
    # Return the host output.
    out = h_output.reshape((batch_size, -1, height, width))
    return out
And here is the script that runs inference using the .trt engine file:
import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import pycuda.autoinit
import argparse
from onnx import ModelProto
from function import *
import sys
# Logger and runtime for this script; plugins are registered with the same logger.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
trt_runtime = trt.Runtime(TRT_LOGGER)
# Execution parameters passed on to allocate_buffers / do_inference.
batch_size = 64
data_type = trt.float16
height = 1280
width = 1280
# First command-line argument: path of the serialized engine file.
engine_path = sys.argv[1]
engine = load_engine(trt_runtime, engine_path)
h_input_1, d_input_1, h_output, d_output, stream = allocate_buffers(engine, batch_size, data_type)
# NOTE(review): pics_1 is not defined anywhere in this script; the input
# images must be loaded and preprocessed into pics_1 before this call.
out = do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width)
print(out)
But when I do that I encounter this error:
(cv) nvidia@nvidia-desktop:~/test_trt$ python trt_infer.py rn50engine.trt 439.jpg
[TensorRT] ERROR: ../rtSafe/cuda/cudaActivationRunner.cpp (103) - Cudnn Error in execute: 3 (CUDNN_STATUS_BAD_PARAM)
[TensorRT] ERROR: FAILED_EXECUTION: std::exception
[TensorRT] ERROR: engine.cpp (179) - Cuda Error in ~ExecutionContext: 719 (unspecified launch failure)
[TensorRT] ERROR: INTERNAL_ERROR: std::exception
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::155, condition: cudnnDestroy(context.cudnn) failure.
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::165, condition: cudaEventDestroy(context.start) failure.
[TensorRT] ERROR: Parameter check failed at: ../rtSafe/safeContext.cpp::terminateCommonContext::170, condition: cudaEventDestroy(context.stop) failure.
[TensorRT] ERROR: ../rtSafe/safeRuntime.cpp (32) - Cuda Error in free: 719 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
what(): std::exception
Aborted (core dumped)
My problem is that it’s not really clear what the workflow is for executing an engine on an image or a video.
Thanks