Tensorrt Batch Inference

Description

A bug occurs when using trt to inference batches of images
A clear and concise description of the bug or issue.

Environment

TensorRT Version: TensorRT-7.1.3.4
GPU Type: Tesla V100-PCIE
Nvidia Driver Version: 418.87.00
CUDA Version: 10.2
CUDNN Version: 8.0.2.39
Operating System + Version: Ubuntu18.04
Python Version (if applicable): 3.7.7
TensorFlow Version (if applicable): 2.3.0
PyTorch Version (if applicable): 1.5.1
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)
trt_engine:
https://drive.google.com/file/d/1vde7ggGFl08giUpz3HIJkK0aWhyEODxp/view?usp=sharing
onnx_file:
https://drive.google.com/file/d/1h3dHXbC3c3JINRmUUrrizOaIvJWFNbiZ/view?usp=sharing

Steps To Reproduce

I can successfully inference a single image, but as soon as I loop through a list of images the output of the first image is copied in the output of other images. Below is the related code:
1、to generate dynamic onnx
def transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W):
model = Yolov4(n_classes=n_classes, inference=True)
pretrained_dict = torch.load(weight_file, map_location=torch.device(‘cuda’))
model.load_state_dict(pretrained_dict)
input_names = [“input”]
output_names = [‘boxes’, ‘confs’]
x = torch.randn((1, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True)
onnx_file_name = “yolov4_-1_3_{}_{}_dynamic.onnx”.format(IN_IMAGE_H, IN_IMAGE_W)
dynamic_axes = {“input”: {0: “batch_size”}, “boxes”: {0: “batch_size”}, “confs”: {0: “batch_size”}}
print(‘Export the onnx model …’)

torch.onnx.export(model,x,onnx_file_name,export_params=True,opset_version=11,do_constant_folding=True,
                                input_names=input_names, output_names=output_names,dynamic_axes=dynamic_axes)
print('Onnx model exporting done')

2、convert to tensorrt engine
def create_optimization_profiles(builder, inputs, batch_size):
# Creates tensorRT optimizations profiles for a given batch size
profiles = [ ]
for inp in inputs:
profile = builder.create_optimization_profile()
shape = inp.shape[1:]
profile.set_shape(inp.name, min=(batch_size, *shape), opt=(batch_size, *shape), max=(batch_size, *shape))
profiles.append(profile)
return profiles

def build_engine(onnx_file_path, engine_file_path, batch_size, verbose=True):
logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
builder = trt.Builder(logger)
config = builder.create_builder_config()
# Specifies that network should have an explicit batch size (required in tensorRT 7.0.0+)
explicit_batch = [1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)]
network = builder.create_network(*explicit_batch)
parser = trt.OnnxParser(network, logger)
# Define standard settings for tensorRT builder environment
builder.max_workspace_size = 1 << 30
builder.max_batch_size = batch_size
builder.fp16_mode = True
# builder.strict_type_constraints = True
# Parse onnx model
with open(onnx_file_path, ‘rb’) as onnx_model:
if not parser.parse(onnx_model.read()):
print(“ERROR: Failed to parse onnx model.”)
for error in range(parser.num_errors):
print(parser.get_error(error))
return
# Add optimization profiles
inputs = [network.get_input(i) for i in range(network.num_inputs)]
opt_profiles = create_optimization_profiles(builder, inputs, batch_size)
for profile in opt_profiles:
config.add_optimization_profile(profile)
# Explicitly set the the output layer so engine knows where to expect final outputs
last_layer = network.get_layer(network.num_layers - 1)
if not last_layer.get_output(0):
network.mark_output(last_layer.get_output(0))
print(‘Building tensorRT engine…’)
engine = builder.build_engine(network, config)
print(‘Successfully built engine’)
with open(engine_file_path, ‘wb’) as f:
f.write(engine.serialize())
return onnx_file_name

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered

Thanks in advance. I want to inference multiple images successfully. I think this has something to do with context.enqueue. I am not quite sure

Hi @1531002208,
Kindly allow access to the files.
Thanks!

Now, you can access to the files

Hi @1531002208
Can you please help me with the complete script you are using for inference.
Thanks!

Here is my inference.
Thanks!
coin.nams
zero
one
two

import sys

import os

import time

import argparse

import numpy as np

import cv2

from PIL import Image

import tensorrt as trt

import pycuda.driver as cuda

import pycuda.autoinit

from tool.utils import *

try:

# Sometimes python2 does not understand FileNotFoundError

FileNotFoundError

except NameError:

FileNotFoundError = IOError

def GiB(val):

return val * 1 << 30

def find_sample_data(description=“Runs a TensorRT Python sample”, subfolder="", find_files=):

'''

Parses sample arguments.

Args:

    description (str): Description of the sample.

    subfolder (str): The subfolder containing data relevant to this sample

    find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.

Returns:

    str: Path of data directory.

Raises:

    FileNotFoundError

'''

# Standard command-line arguments for all samples.

kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")

parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.", default=kDEFAULT_DATA_ROOT)

args, unknown_args = parser.parse_known_args()

# If data directory is not specified, use the default.

data_root = args.datadir

# If the subfolder exists, append it to the path, otherwise use the provided path as-is.

subfolder_path = os.path.join(data_root, subfolder)

data_path = subfolder_path

if not os.path.exists(subfolder_path):

    print("WARNING: " + subfolder_path + " does not exist. Trying " + data_root + " instead.")

    data_path = data_root

# Make sure data directory exists.

if not (os.path.exists(data_path)):

    raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.")

# Find all requested files.

for index, f in enumerate(find_files):

    find_files[index] = os.path.abspath(os.path.join(data_path, f))

    if not os.path.exists(find_files[index]):

        raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.")

return data_path, find_files

Simple helper data class that’s a little nicer to use than a 2-tuple.

class HostDeviceMem(object):

def __init__(self, host_mem, device_mem):

    self.host = host_mem

    self.device = device_mem

def __str__(self):

    return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

def __repr__(self):

    return self.__str__()

Allocates all buffers required for an engine, i.e. host/device inputs/outputs.

def allocate_buffers(engine):

inputs = []

outputs = []

bindings = []

stream = cuda.Stream()

for binding in engine:

    size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size

    dtype = trt.nptype(engine.get_binding_dtype(binding))

    # Allocate host and device buffers

    host_mem = cuda.pagelocked_empty(size, dtype)

    device_mem = cuda.mem_alloc(host_mem.nbytes)

    # Append the device buffer to device bindings.

    bindings.append(int(device_mem))

    # Append to the appropriate list.

    if engine.binding_is_input(binding):

        inputs.append(HostDeviceMem(host_mem, device_mem))

    else:

        outputs.append(HostDeviceMem(host_mem, device_mem))

return inputs, outputs, bindings, stream

This function is generalized for multiple inputs/outputs.

inputs and outputs are expected to be lists of HostDeviceMem objects.

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):

# Transfer input data to the GPU.

[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

# Run inference.

context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

# Transfer predictions back from the GPU.

[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

# Synchronize the stream

stream.synchronize()

# Return only the host outputs.

return [out.host for out in outputs]

TRT_LOGGER = trt.Logger()

def main(engine_path, image_path, image_size, batch_size):

with get_engine(engine_path) as engine, engine.create_execution_context() as context:

    buffers = allocate_buffers(engine)

    IN_IMAGE_H, IN_IMAGE_W = image_size

    context.set_binding_shape(0, (1, 3, IN_IMAGE_H, IN_IMAGE_W))

    image_src = cv2.imread(image_path)

    num_classes = 3

    for i in range(2):  # This 'for' loop is for speed check

                        # Because the first iteration is usually longer

        boxes = detect(context, buffers, image_src, image_size, num_classes, batch_size)

    if num_classes == 20:

        namesfile = 'data/voc.names'

    elif num_classes == 80:

        namesfile = 'data/coco.names'

    elif num_classes == 3:

        namesfile = 'data/coin.names'

    class_names = load_class_names(namesfile)

    for i in range(batch_size):

        plot_boxes_cv2(image_src, boxes[i], savename=os.path.join("./batch_result" , str(i) + "-" + 'predictions_trt.jpg'), class_names=class_names)

def get_engine(engine_path):

# If a serialized engine exists, use it instead of building an engine.

print("Reading engine from file {}".format(engine_path))

with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:

    return runtime.deserialize_cuda_engine(f.read())

def detect(context, buffers, image_src, image_size, num_classes, batch_size):

IN_IMAGE_H, IN_IMAGE_W = image_size

ta = time.time()

# Input

resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)

img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)

img_in = np.expand_dims(img_in, axis=0)

img_in /= 255.0

img_in = np.ascontiguousarray(img_in)

img_in = np.vstack((img_in,img_in,img_in,img_in,img_in, img_in, img_in, img_in))

print("Shape of the network input: ", img_in.shape)

# print(img_in)

inputs, outputs, bindings, stream = buffers

print('Length of inputs: ', len(inputs))

inputs[0].host = img_in

trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size)

print('Len of outputs: ', len(trt_outputs))

trt_outputs[0] = trt_outputs[0].reshape(batch_size, -1, 1, 4)

trt_outputs[1] = trt_outputs[1].reshape(batch_size, -1, num_classes)

print(trt_outputs[0][0])

exit()

tb = time.time()

print('-----------------------------------')

print('    TRT inference time: %f' % (tb - ta))

print('-----------------------------------')

boxes = post_processing(img_in, 0.4, 0.6, trt_outputs)

return boxes

if name == ‘main’:

engine_path = sys.argv[1]

image_path = sys.argv[2]

batch_size = int(sys.argv[3])

if len(sys.argv) < 5:

    image_size = (416, 416)

elif len(sys.argv) < 5:

    image_size = (int(sys.argv[4]), int(sys.argv[4]))

else:

    image_size = (int(sys.argv[4]), int(sys.argv[5]))



main(engine_path, image_path, image_size, batch_size)

Hi @1531002208
Request you to upload the file in .py or .txt format,

copy pasting the script might have missed the entries along with the indentation.

Thanks!

Thank you very much! I have uploaded the code in this link.
https://drive.google.com/drive/folders/1ZImXFXAfzUvHWWcE7XGxcqzygbzD2jo6?usp=sharing

Hi, do u find out solutions for this problem? I came across same situation.

Hi @1531002208,
Apologies for the miss.
Are you still facing the issue?

Thanks!