Parameter Check Failed with no Implicit Batch Dimension


After converting a Mobilenetv2 model to an engine, I attempt to perform inference on the model. The process fails with this error:

    [TensorRT] VERBOSE: Deserialize required 1043134 microseconds.
    [TensorRT] VERBOSE: Allocated persistent device memory of size 1121792
    [TensorRT] VERBOSE: Allocated activation device memory of size 6940672
    [TensorRT] VERBOSE: Assigning persistent memory blocks for various profiles
    [TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueueV2::605, condition: !mEngine.hasImplicitBatchDimension()

    [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


TensorRT Version:
GPU Type: Tesla T4
Nvidia Driver Version: 450.51.06
CUDA Version: 11.0
CUDNN Version: 8.0
Python Version (if applicable): 3.7.9
TensorFlow Version (if applicable): 1.15.4
Baremetal or Container (if container which image + tag): nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04

Steps To Reproduce

Steps to convert the model:

import tensorflow as tf
from keras.models import load_model                                     
import tensorflow.keras.backend as K
from tensorflow.python.framework import graph_io
import uff
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

def keras_to_frozen_pb(model_in_path, 
    Converter that transforms keras model to frozen pb model
        model_in_path (str): Input model path (.h5) 
        model_out_path (str): Output model path (dir)
        tensor_out_name (str, optional): Specified name of output tensor. 
                                         If None, it will get default tensor name from keras model.
                                         Defaults to None.

    graph = tf.Graph()
    with graph.as_default():
        sess = tf.compat.v1.Session()

        # load the model to graph and sess
        model = tf.keras.models.load_model(model_in_path)

        # freeze the graph
        graphdef = tf.compat.v1.graph_util.convert_variables_to_constants(sess, graph.as_graph_def(), [tensor_out_name])
        graphdef = tf.compat.v1.graph_util.remove_training_nodes(graphdef)
        graph_io.write_graph(graphdef, './', model_out_path, as_text=False)

def frozen_pb_to_plan(model_path, 

    # convert TF frozen graph to uff model
    uff_model = uff.from_tensorflow_frozen_model(model_path, [tensor_out_name], output_filename="tmp/model.uff")
    # create uff parser
    parser = trt.UffParser()
    parser.register_input(tensor_in_name, input_size)

    # create trt logger and builder
    trt_logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(trt_logger)
    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = max_workspace
    builder.fp16_mode = (data_type == trt.float16)

    # parse the uff model to trt builder
    network = builder.create_network()
    parser.parse_buffer(uff_model, network)

    # build optimized inference engine
    engine = builder.build_cuda_engine(network)

    # save inference engine
    with open(output_path, "wb") as f:

if __name__ == "__main__":

    model_path = "mobilenetv2.h5"

    keras_to_frozen_pb(model_path, 'tmp/frozen_model.pb', tensor_out_name="dense_3/Softmax")
    frozen_pb_to_plan('tmp/frozen_model.pb'), 'tmp/model.engine', 'dense_3/Softmax', (3,224,224))

Steps to infer on the model:

import uff
import tensorrt as trt
from PIL import Image
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from keras.applications.imagenet_utils import preprocess_input

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

def inference(model_path, img_path, input_shape):
    img = load_and_preproc_img(img_path, input_shape)

    with open(model_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(

    with engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)

        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        stream = cuda.Stream()

        cuda.memcpy_htod_async(d_input, img, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        return h_output

def load_and_preproc_img(img_path, preprocessor, input_shape):
    img =
    rgb_im = img.convert('RGB')
    img = img.resize(input_shape[1:])
    img = np.array(img).astype('float32')
    reshaped_img = np.transpose(input_array, (2, 0, 1))
    return np.ascontiguousarray(preprocess_imagenet(reshaped_img))

def preprocess_imagenet(img):
    return preprocess_input(img, mode='tf')

if __name__ == "__main__":
    input_path = "tmp/model.engine"
    img_path = "dog.jpg"

    print(inference(input_path, img_path, (3,224,224)))

Hi @videet,
Can you please share your onnx model so that we can check this from our end.


Hi @AakankshaS, I’ve attached the model and a sample image here. The model I’m working with is a Keras .h5, not an ONNX model.


Hi @videet ,
Looking at your script, suggest you to use ONNX parser with explicit batch flag, as UFF parser have been deprecated