Batch processing on tao engine

Dear @Morganh,

I have customize one inference script for the mobileNet_V1 classification trained on tao.

I have converted the etlt file in engine using tao-converter for batch size 1 and it is working fine.

below is the working code for batch size 1.

import os
import time

import cv2
#import matplotlib.pyplot as plt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image
import pdb
import codecs
import glob
import datetime
import shutil
import matplotlib.pyplot as plt

# input_shape = (3,236,236)
input_shape = (3,354,354)
fallen_label = ["Fallen","Normal"]

# Input Params

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem): = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str( + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data =
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# def allocate_buffers(engine, batch_size=-1):
def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # pdb.set_trace()
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            # print(f"input: shape:{engine.get_binding_shape(binding)} dtype:{engine.get_binding_dtype(binding)}")
            outputs.append(HostDeviceMem(host_mem, device_mem))
            # print(f"output: shape:{engine.get_binding_shape(binding)} dtype:{engine.get_binding_dtype(binding)}")
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device,, stream) for inp in inputs]
    # Run inference.
        batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(, out.device, stream) for out in outputs]
    # Synchronize the stream
    # Return only the host outputs.
    return [ for out in outputs]

def model_loading(trt_engine_path):
    # TensorRT logger singleton
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    # trt_engine_path = "/opt/smarg/surveillance_gateway_prod/surveillance_ai_model/x86_64/Secondary_NumberPlateClassification/lpr_us_onnx_b16.engine"

    trt_runtime = trt.Runtime(TRT_LOGGER)
    # pdb.set_trace()
    trt_engine = load_engine(trt_runtime, trt_engine_path)
    # Execution context is needed for inference
    context = trt_engine.create_execution_context()
    # input shape
    context.set_binding_shape(0, input_shape)
    # This allocates memory for network inputs/outputs on both CPU and GPU
    inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
    return inputs, outputs, bindings, stream, context

def preprocess_res18(image):
    # image = np.asarray(image.resize((224, 224), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
    image = np.asarray(image.resize((input_shape[1], input_shape[2]), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
    return image

trt_engine_path = "./Models/V2.2/MobileNet_V1/MobileNetV1_ReTrain_Ep70_ZeroConfAug2_SIZE_416X416_FallenObjectClassification_V2.2_fp16_b1.engine"
camera_frames_path = "./TestSampleFrame/73_43/"
output_folder_path = "./output/73_430_output/"
acc_threshold = 95

cropped_images = "./output/Accuracy_wise_analysis_73_430_output/"

if not os.path.exists(output_folder_path):

if not os.path.exists(cropped_images):

inputs, outputs, bindings, stream, context = model_loading(trt_engine_path)

cropped_img_count = 1
image_count = 0

import glob
from PIL import Image, ImageDraw

# Function to perform classification on each cropped area and draw rectangles

def classify_and_draw(image, fallen_area, context, bindings, inputs, outputs, stream, fallen_label, acc_threshold):
        # image =
        draw = ImageDraw.Draw(image)
        global cropped_img_count
        global image_count

        box_coordinates = []  # List to store box coordinates

        for area_coordinates in fallen_area:
            x1, y1, x2, y2 = area_coordinates['XMIN'], area_coordinates['YMIN'], area_coordinates['XMAX'], area_coordinates['YMAX']
            if (x2-x1)>60 and (y2-y1)>60:
                area_image_cropped = image.crop((x1, y1, x2, y2))
                area_image = preprocess_res18(area_image_cropped)  # Preprocess the cropped area
                # print(area_image.shape)
                np.copyto(inputs[0].host, area_image)  # Copy preprocessed image to input buffer
                output = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # Perform inference

                max_index_row = np.argmax(output[0], axis=0)
                fallen_normal_acc = int('{:.0f}'.format(output[0][max_index_row]*100))
                fallen_label_info = fallen_label[max_index_row]

                color = "green"
                if fallen_label_info == "Fallen":
                    # color = "yellow"
                    color = "red"
                    if fallen_normal_acc>20:
                        image_name = f"{image_count}.jpg"
                        # crop_img_name = cropped_images + "/" + image_name + "_" + str(cropped_img_count)+"_"+str(fallen_normal_acc)+".jpg"
                        crop_img_name = cropped_images + "/"+str(fallen_normal_acc)+"_" + image_name + "_" + str(cropped_img_count)+".jpg"
                        cropped_img_count += 1

                box_coordinates.append((x1, y1, x2, y2, color, fallen_label_info, fallen_normal_acc))

        # Draw all boxes after the loop
        for box_info in box_coordinates:
            x1, y1, x2, y2, color, fallen_label_info, fallen_normal_acc = box_info
            draw.rectangle([x1, y1, x2, y2], outline=color)
            # inner_rect = [x1 + border_width, y1 + border_width, x2 - border_width, y2 - border_width]
            # draw.rectangle(inner_rect,outline=color, fill=None)

            if fallen_normal_acc > acc_threshold:
                draw.text((x1, y1), f"{fallen_label_info} ({fallen_normal_acc}%)", fill=color)

        print(f"processed images are : {image_count}")
    except Exception as e:
        print("exception as : ",e)

# Wrong violation analysis

# for camera_dir in glob.glob(camera_frames_path+"/*"):
#     camera_name = camera_dir.split("/")[-1]
#     fallen_area_data = fallen_area[100 + int(camera_name_mapping[camera_name])]

#     for image_path in glob.glob(camera_dir+"/*"):
#         classify_and_draw(image_path, fallen_area_data, context, bindings, inputs, outputs, stream, fallen_label, acc_threshold)
camera_image_mapping = {


# next 59_330 : 31

area = fallen_area[137]
video_path = "./TestSampleFrame/fallen_22March_16_1630/22march_4_430/73+430_DT_2024-03-22_16:00:01.550223_DT_003d.mkv"
frame_interval = 10
frame_count = 0
# Initialize OpenCV video capture
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("Error: Unable to open video.")

while cap.isOpened():
    ret, frame =

    # if not ret:
    #     break  # Break the loop if there are no more frames
        frame = cv2.resize(frame, (1920, 1080))
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_pil = Image.fromarray(image_rgb)
        if frame_count%frame_interval==0:
            classify_and_draw(image_pil, area, context, bindings, inputs, outputs, stream, fallen_label, acc_threshold)
    except Exception as e:

    # Press 'q' to exit the loop
    if cv2.waitKey(1) & 0xFF == ord('q'):


Can you please suggest how can i modify it for batch size n (n>1) ?
also please suggest how to process batch input before passing to the model.

I have tried but getting size issues while feeding batch input.


For 5.0 or later version, after training and exporting, there will be an onnx file instead of onnx file.
Can you decrypt the etlt model to onnx file? Refer to tao_toolkit_recipes/tao_forum_faq/ at main · NVIDIA-AI-IOT/tao_toolkit_recipes · GitHub.
Then use trtexec TRTEXEC with Classification TF1/TF2/PyT - NVIDIA Docs to generate dynamic batch of engine.

Thanks @Morganh

But I have trained on TAO 4.0.1. Can we achieve it using etlt->engine->inference.
Because script is working fine with batch size 1.

Please help if there is another script or suggestion how to pass batch input.


How did you run tao-converter?
I am afraid you generated a tensorrt engine with fixed batch-size 1.
So, I suggest you decode it to .onnx file to double check.

Hi @Morganh

Below is the command to convert etlt to engine. I had downloaded the tao-converter and then convert etlt to engine.

./tao-converter  ./Models/V2.2/MobileNet_V1/MobileNetV1_Train_Ep51_ZeroConfAug2_SIZE_354X354_FallenObjectClassification_V2.2.etlt \
               -k key  \
               -o predictions/Softmax \
               -d 3,354,354 \
               -i nchw \
               -m 1 -t fp16 \
               -e ./Models/V2.2/MobileNet_V1/MobileNetV1_Train_Ep51_ZeroConfAug2_SIZE_354X354_FallenObjectClassification_V2.2_fp16_b1.engine \
               -b 1

after engine generation, I ran the script mentioned above and it is working fine with batch size 1.

Yes, it is fixed batch-size 1 instead of dynamic batch. So it is only working with batch size 1.
You can decode to onnx and open it with Netron to double check.
For code side, you can leverage Python run LPRNet with TensorRT show pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory - #8 by Morganh.

You can set to -b 2 and generate a new engine, then run with batch-size 2.

Yes @Morganh

I tried with batch size 32 as well.

./tao-converter  ./Models/V2.2/MobileNet_V1/MobileNetV1_ReTrain_Ep70_ZeroConfAug2_SIZE_354X354_FallenObjectClassification_V2.2.etlt \
               -k key \
               -o predictions/Softmax \
               -d 3,354,354 \
               -i nchw \
               -m 32 -t fp16 \
               -e ./Models/V2.2/MobileNet_V1/MobileNetV1_ReTrain_Ep70_ZeroConfAug2_SIZE_354X354_FallenObjectClassification_V2.2_fp16_b32.engine \
               -b 32

and engine is generated successfully but the issues was i was unable to prepare the batch_data for batch size 32 model input.

Please check the below code and suggest what can be change.

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem): = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str( + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data =
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, batch_size=32):
# def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # pdb.set_trace()
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            print(f"input: shape:{engine.get_binding_shape(binding)} dtype:{engine.get_binding_dtype(binding)}")
            outputs.append(HostDeviceMem(host_mem, device_mem))
            print(f"output: shape:{engine.get_binding_shape(binding)} dtype:{engine.get_binding_dtype(binding)}")
    print("inputs : ",len(inputs[0].host))
    print("outputs : ",outputs[0].host)

    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device,, stream) for inp in inputs]
    # Run inference.
        batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(, out.device, stream) for out in outputs]
    # Synchronize the stream
    # Return only the host outputs.
    return [ for out in outputs]

def model_loading(trt_engine_path,BATCH_SIZE):
    # TensorRT logger singleton
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    # trt_engine_path = "/opt/smarg/surveillance_gateway_prod/surveillance_ai_model/x86_64/Secondary_NumberPlateClassification/lpr_us_onnx_b16.engine"

    trt_runtime = trt.Runtime(TRT_LOGGER)
    # pdb.set_trace()
    trt_engine = load_engine(trt_runtime, trt_engine_path)
    # Execution context is needed for inference
    context = trt_engine.create_execution_context()
    # input shape
    context.set_binding_shape(0, input_shape)
    # This allocates memory for network inputs/outputs on both CPU and GPU
    inputs, outputs, bindings, stream = allocate_buffers(trt_engine,BATCH_SIZE)
    return inputs, outputs, bindings, stream, context

def preprocess_res18(image):
    # image = np.asarray(image.resize((224, 224), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
    # image = np.asarray(image.resize((input_shape[1], input_shape[2]), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
    image = image.resize((input_shape[1], input_shape[2]), Image.ANTIALIAS)
    return image

inputs, outputs, bindings, stream, context = model_loading(trt_engine_path,BATCH_SIZE)

def classify_and_draw(image, fallen_area, context, bindings, inputs, outputs, stream, fallen_label, acc_threshold):
        draw = ImageDraw.Draw(image)
        global cropped_img_count
        global image_count

        box_coordinates = []  # List to store box coordinates

        # Determine batch size based on the number of area coordinates
        batch_size = 32
        num_batches = (len(fallen_area) + batch_size - 1) // batch_size  # Ceiling division to compute number of batches

        print("Number of batches:", num_batches)

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = min((batch_idx + 1) * batch_size, len(fallen_area))
            batch_areas = fallen_area[batch_start:batch_end]

            # Prepare batch inputs
            batch_images = []

            for area_coordinates in batch_areas:
                x1, y1, x2, y2 = area_coordinates['XMIN'], area_coordinates['YMIN'], area_coordinates['XMAX'], area_coordinates['YMAX']
                if (x2 - x1) > 60 and (y2 - y1) > 60:
                    area_image_cropped = image.crop((x1, y1, x2, y2))
                    area_image = preprocess_res18(area_image_cropped)

            # Pad batch_images with empty images if the batch size is less than 32
            while len(batch_images) < batch_size:
                empty_image = np.zeros(input_shape, dtype=np.float32)
            batch_images = np.array(batch_images, dtype=np.float32).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()

            np.copyto(inputs[0].host, batch_images)

            # Copy batch images to GPU memory individually

            # Perform inference
            outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs[:batch_size], stream=stream, batch_size=len(batch_images))

            for i, area_coordinates in enumerate(batch_areas):
                x1, y1, x2, y2 = area_coordinates['XMIN'], area_coordinates['YMIN'], area_coordinates['XMAX'], area_coordinates['YMAX']
                if (x2 - x1) > 60 and (y2 - y1) > 60:
                    max_index_row = np.argmax(outputs[i][0], axis=0)
                    fallen_normal_acc = int('{:.0f}'.format(outputs[i][0][max_index_row]*100))
                    fallen_label_info = fallen_label[max_index_row]

                    color = "green"
                    if fallen_label_info == "Fallen":
                        color = "red"
                        if fallen_normal_acc > 20:
                            image_name = f"{image_count}.jpg"
                            crop_img_name = cropped_images + "/"+str(fallen_normal_acc)+"_" + image_name + "_" + str(cropped_img_count)+".jpg"
                            cropped_img_count += 1
                            area_image_cropped = Image.fromarray((cuda.from_device(batch_images_gpu[i], area_image.shape).transpose(1, 2, 0) * 255).astype(np.uint8))

                    box_coordinates.append((x1, y1, x2, y2, color, fallen_label_info, fallen_normal_acc))

        # Draw all boxes after processing all batches
        for box_info in box_coordinates:
            x1, y1, x2, y2, color, fallen_label_info, fallen_normal_acc = box_info
            border_width = 10
            draw.rectangle([x1, y1, x2, y2], outline=color)
            if fallen_normal_acc > acc_threshold:
                draw.text((x1, y1), f"{fallen_label_info} ({fallen_normal_acc}%)", fill=color)"{output_folder_path}/{image_count}.jpg")
        image_count += 1
        print(f"Processed images: {image_count}")
    except Exception as e:
        print("Exception:", e)


camera_image_mapping = {


# next 59_330 : 31

area = fallen_area[137]

# Initialize OpenCV video capture
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("Error: Unable to open video.")

while cap.isOpened():
    ret, frame =

    # if not ret:
    #     break  # Break the loop if there are no more frames
        frame = cv2.resize(frame, (1920, 1080))
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_pil = Image.fromarray(image_rgb)
        if frame_count%frame_interval==0:
            classify_and_draw(image_pil, area, context, bindings, inputs, outputs, stream, fallen_label, acc_threshold)
    except Exception as e:

    # Press 'q' to exit the loop
    if cv2.waitKey(1) & 0xFF == ord('q'):


For the above code getting below exception.

input: shape:(3, 354, 354) dtype:DataType.FLOAT
output: shape:(2, 1, 1) dtype:DataType.FLOAT
inputs :  12030336
outputs :  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Number of batches: 3 FutureWarning: The input object of type 'Image' is an array-like implementing one of the corresponding protocols (`__array__`, `__array_interface__` or `__array_struct__`); but not a sequence (or 0-D). In the future, this object will be coerced as if it was first converted using `np.array(obj)`. To retain the old behaviour, you have to either modify the type 'Image', or assign to an empty array created with `np.empty(correct_shape, dtype=object)`.
  batch_images = np.array(batch_images, dtype=np.float32).transpose([2, 0, 1]).astype(trt.nptype(trt.float32)).ravel()
**Exception: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.**

You can refer to Python run LPRNet with TensorRT show pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory to check if it helps.

Hi @Morganh

It is not helping actually, It only work with single image not for batch and for single image I am also able to achieve the result but the problem is with batch.

There is no update from you for a period, assuming this is not an issue anymore. Hence we are closing this topic. If need further support, please open a new one. Thanks

You can refer to tao_deploy/nvidia_tao_deploy/cv/multitask_classification/scripts/ at 31c7e0ed3fe48942c254b3b85517e7418eea17b3 · NVIDIA/tao_deploy · GitHub
and tao_deploy/nvidia_tao_deploy/cv/multitask_classification/ at 31c7e0ed3fe48942c254b3b85517e7418eea17b3 · NVIDIA/tao_deploy · GitHub.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.