A clear and concise description of the bug or issue.
TensorRT Version: 7.0
GPU Type: RTX 2080
Nvidia Driver Version: 440.64.00
CUDA Version: 10.2
CUDNN Version:
Operating System + Version:
Python Version (if applicable):
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container, which image + tag):
Relevant Files
Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)
Steps To Reproduce
Please include:
- Exact steps/commands to build your repro
- Exact steps/commands to run your repro
- Full traceback of errors encountered
I can successfully run inference on a single image, but as soon as I loop through a list of images, the output of the first image is copied into the output of the other images. Below is the related code:
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pair a host-side buffer with the matching device-side buffer.

    The two objects are stored as given (no copy is made) and exposed as
    the ``host`` and ``device`` attributes.
    """

    def __init__(self, host_mem, device_mem):
        """Store references to the host buffer and the device buffer."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """Return a two-section dump of the host and device members."""
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    Args:
        engine: TensorRT engine whose bindings are iterated.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` is the list
        of device buffer addresses (as ints) in binding order, and ``stream``
        is a newly created CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Element count of the binding, scaled by the engine's max batch size.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass and return the host-side output buffers.

    Args:
        context: TensorRT execution context used to launch inference.
        bindings: Device buffer addresses passed to the execution call.
        inputs: List of HostDeviceMem; each ``host`` is copied to ``device``.
        outputs: List of HostDeviceMem; each ``device`` is copied to ``host``.
        stream: CUDA stream on which the copies and the execution are queued.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        ``[out.host for out in outputs]``. These are the very same buffer
        objects on every call, so a caller that wants to keep results across
        calls must copy them before the next inference overwrites them.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    # NOTE(review): for engines built from an explicit-batch network,
    # execute_async_v2 may be the intended entry point — confirm against the
    # TensorRT version in use.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    # Without this wait, the host buffers may be read before the queued
    # copies have finished.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
# Logger instance handed to the TensorRT runtime, builder and parser in this file.
TRT_LOGGER = trt.Logger()


def get_engine(engine_path):
    """Deserialize a TensorRT engine from a file.

    Args:
        engine_path: Path to the serialized engine file.

    Returns:
        Whatever ``runtime.deserialize_cuda_engine`` returns for the file's
        bytes.
    """
    # If a serialized engine exists, use it instead of building an engine.
    print("Reading engine from file {}".format(engine_path))
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
def build_engine(onnx_path, using_half, engine_file="yolov5_1_fp32_common.engine"):
    """Load a serialized engine if present, otherwise build one from ONNX.

    Args:
        onnx_path: Path to the ONNX model to parse when no engine file exists.
        using_half: When true, the FP16 builder flag is set on the config.
        engine_file: Path checked for an existing serialized engine.

    Returns:
        The engine loaded from ``engine_file`` when that file exists;
        otherwise the engine built from the ONNX model, or ``None`` when the
        ONNX parser reports failure.
    """
    if os.path.exists(engine_file):
        return get_engine(engine_file)
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1  # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(1)
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
                return None
        return builder.build_engine(network, config)
def detect_yolo(engine, context, buffers, image_src, image_size):
    """Run ``detect`` on every item produced by ``LoadImages``.

    Args:
        engine: Forwarded unchanged to ``detect``.
        context: Forwarded unchanged to ``detect``.
        buffers: Forwarded unchanged to ``detect``.
        image_src: Source passed to ``LoadImages``.
        image_size: Passed to ``LoadImages`` as ``img_size``.

    Each image is scaled by 1/255, given a leading axis of length 1 and
    converted to float32 before being handed to ``detect``. The per-image
    result is not returned or stored by this function.
    """
    dataset = LoadImages(image_src, img_size=image_size)
    for path, img, im0s, vid_cap in dataset:
        # np.float was removed from NumPy; it was an alias for the builtin
        # float, i.e. float64, so use the explicit dtype.
        input_img = img.astype(np.float64)
        input_img /= 255.0
        input_img = np.expand_dims(input_img, axis=0)
        # Contiguous float32 array, as the previous torch round-trip produced.
        img = np.ascontiguousarray(input_img, dtype=np.float32)
        # print(img.shape)
        trt_output = detect(engine, context, buffers, img)
def detect(engine, context, buffers, img_in, num_classes=80):
    """Run inference on one preprocessed image and return the reshaped output.

    Args:
        engine: Unused here; kept for interface compatibility.
        context: Execution context forwarded to ``do_inference``.
        buffers: Tuple ``(inputs, outputs, bindings, stream)``.
        img_in: Array holding the network input for this image.
        num_classes: Number of classes; the last output axis is
            ``5 + num_classes``.

    Returns:
        A fresh array of shape ``(1, -1, 5 + num_classes)`` built from the
        first output. It is a copy, so it stays valid after later calls
        reuse the same buffers.

    Raises:
        ValueError: If ``img_in`` has more elements than the first input's
            host buffer.
    """
    ta = time.time()
    print("Shape of the network input: ", img_in.shape)
    # print(img_in)

    inputs, outputs, bindings, stream = buffers
    print('Length of inputs: ', len(inputs))

    # Fill the existing host buffer in place rather than rebinding
    # inputs[0].host to a new array, so the buffer object stored in
    # `buffers` is the one that receives each image's data.
    host_in = inputs[0].host
    flat = np.ascontiguousarray(img_in, dtype=host_in.dtype).ravel()
    if flat.size > host_in.size:
        raise ValueError(
            "input has %d elements but the host buffer holds only %d"
            % (flat.size, host_in.size)
        )
    host_in.reshape(-1)[:flat.size] = flat

    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    print('Len of outputs: ', len(trt_outputs))

    # print(trt_outputs)
    # Copy out of the shared output buffer: reshape alone may return a view
    # that the next inference would overwrite.
    trt_output = np.array(trt_outputs[0]).reshape(1, -1, 5 + num_classes)
    tb = time.time()
    print(' TRT inference time: %f' % (tb - ta))
    return trt_output
def main():
    """Build or load the engine, allocate buffers and run detection.

    Reads ``opt.onnx``, ``opt.source`` and ``opt.img_size`` from the
    module-level ``opt`` object.

    Raises:
        RuntimeError: If ``build_engine`` returns ``None``.
    """
    using_half = False
    engine = build_engine(opt.onnx, using_half)
    if engine is None:
        # Fail with a clear message instead of an error from entering
        # a `with` block on None.
        raise RuntimeError("Failed to build or load the TensorRT engine.")
    with engine, engine.create_execution_context() as context:
        buffers = allocate_buffers(engine)
        detect_yolo(engine, context, buffers, opt.source, opt.img_size)
Thanks in advance. I want to run inference on multiple images successfully. I think this has something to do with context.enqueue, but I am not quite sure.