Hi,
We have tried both DeepStream and our own script, and in both cases the performance is degraded compared to what we obtain with TAO. Is there any "official" inference script you provide that we could use?
I attach our script.
import os
import datetime

import cv2
import numpy as np

# CUDA_VISIBLE_DEVICES must be set before pycuda.autoinit creates the CUDA
# context, otherwise it has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
class HostDeviceMem(object):
    """Pairs a pagelocked host buffer with its device allocation."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the bindings list.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. The script fixes the input shape with set_binding_shape
    # (see model_loading), i.e. the engine has an explicit batch dimension,
    # so execute_async_v2 is the appropriate call; execute_async(batch_size=...)
    # is only valid for implicit-batch engines.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def post_processing(label_ids, classes):
    # Indices of the five highest scores, best first.
    top_five_indexes = label_ids[0].argsort()[-5:][::-1]
    top_five_classes = []
    for index in top_five_indexes:
        # [[class, probability], [class, probability], ...]
        top_five_classes.append([classes[index], label_ids[0][index]])
    max_value_index = top_five_indexes[0]
    max_value = top_five_classes[0][1]
    print("Index max value: " + str(max_value_index))
    print("Max value: " + str(max_value))
    return top_five_classes
def model_loading(trt_engine_path, input_shape):
    # TensorRT logger singleton.
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt_runtime = trt.Runtime(TRT_LOGGER)
    trt_engine = load_engine(trt_runtime, trt_engine_path)
    # An execution context is needed for inference.
    context = trt_engine.create_execution_context()
    # Fix the input binding shape, e.g. (3, 48, 96) for the NPR model.
    context.set_binding_shape(0, input_shape)
    # This allocates memory for network inputs/outputs on both CPU and GPU.
    inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
    return inputs, outputs, bindings, stream, context
def infer_image(classes, imageToInfer, model_parameters):
    image_count = 1
    start_time = datetime.datetime.now()
    print("Image name :", imageToInfer)
    # Read, resize to the network input resolution, convert to NCHW float32.
    image = [cv2.imread(imageToInfer)]
    image = np.array([cv2.resize(img, (240, 240)) for img in image], dtype=np.float32)
    image = image.transpose(0, 3, 1, 2)
    # Copy the image into the pagelocked input buffer and run inference.
    np.copyto(model_parameters['inputs'][0].host, image.ravel())
    output = do_inference(
        model_parameters['context'],
        bindings=model_parameters['bindings'],
        inputs=model_parameters['inputs'],
        outputs=model_parameters['outputs'],
        stream=model_parameters['stream'],
    )
    top_five_classes = post_processing(output, classes)
    print("TOP FIVE PREDICTIONS: " + str(top_five_classes))
    print("BEST PREDICTION: " + str(top_five_classes[0]))
"""
for image_path in glob.glob(images_folder_path + "*.jpg"):
print("Image name :", image_path)
image = [cv2.imread(image_path)]
image = np.array([(cv2.resize(img, (240 , 240))) for img in image], dtype=np.float32)
image= image.transpose(0 , 3 , 1 , 2)
np.copyto(model_parameters['inputs'][0].host, image.ravel())
output = do_inference(model_parameters['context'], bindings=model_parameters['bindings'], inputs=model_parameters['inputs'], outputs=model_parameters['outputs'], stream=model_parameters['stream'])
top_five_classes = post_processing(output, classes)
image_count += 1
print("TOP FIVE PREDICTIONS: " + str(top_five_classes))
print("BEST PREDICTION: " + str(top_five_classes[0]))
"""
    end_time = datetime.datetime.now()
    total_time = end_time - start_time
    print("Total image processed : {} Total Time : {} ".format(image_count, total_time))
    return top_five_classes
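For completeness, this is a minimal sketch of how we drive the functions above over a folder of images; the engine path, input shape, class list and image folder below are placeholders, not our real configuration:

import glob

# Placeholder configuration -- adjust to the real engine, labels and data.
engine_path = "classifier.engine"            # hypothetical path
input_shape = (1, 3, 240, 240)               # must match the resize in infer_image
classes = ["class_0", "class_1", "class_2"]  # hypothetical label list

inputs, outputs, bindings, stream, context = model_loading(engine_path, input_shape)
model_parameters = {
    "inputs": inputs,
    "outputs": outputs,
    "bindings": bindings,
    "stream": stream,
    "context": context,
}

for image_path in glob.glob("images/*.jpg"):
    infer_image(classes, image_path, model_parameters)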
Regards,
Alberto