Description
After converting a MobileNetV2 Keras model to a TensorRT engine, I attempt to run inference on it. Inference fails with the error below, and the output buffer that is printed afterwards is all zeros:
[TensorRT] VERBOSE: Deserialize required 1043134 microseconds.
[TensorRT] VERBOSE: Allocated persistent device memory of size 1121792
[TensorRT] VERBOSE: Allocated activation device memory of size 6940672
[TensorRT] VERBOSE: Assigning persistent memory blocks for various profiles
[TensorRT] ERROR: Parameter check failed at: engine.cpp::enqueueV2::605, condition: !mEngine.hasImplicitBatchDimension()
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
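For context, whether the saved engine was built with an implicit batch dimension (the condition the enqueueV2 parameter check refers to) can be verified with a small script like the following sketch, which assumes the engine path used in the conversion step below:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# Deserialize the saved engine and report whether it still carries an
# implicit batch dimension, along with its binding names and shapes.
with open("tmp/model.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    print("has_implicit_batch_dimension:", engine.has_implicit_batch_dimension)
    for i in range(engine.num_bindings):
        print(engine.get_binding_name(i), engine.get_binding_shape(i))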
Environment
TensorRT Version: 7.2.1.6
GPU Type: Tesla T4
Nvidia Driver Version: 450.51.06
CUDA Version: 11.0
CUDNN Version: 8.0
Python Version (if applicable): 3.7.9
TensorFlow Version (if applicable): 1.15.4
Baremetal or Container (if container which image + tag): nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04
Steps To Reproduce
Steps to convert the model:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.framework import graph_io
import uff
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

def keras_to_frozen_pb(model_in_path,
                       model_out_path,
                       tensor_out_name=None):
    """
    Converter that transforms a Keras model into a frozen pb model.

    Args:
        model_in_path (str): Input model path (.h5)
        model_out_path (str): Output model path (dir)
        tensor_out_name (str, optional): Name of the output tensor.
            If None, the default tensor name is taken from the Keras model.
            Defaults to None.
    """
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.compat.v1.Session()
        K.set_session(sess)
        K.set_learning_phase(0)

        # load the model into the graph and session
        model = tf.keras.models.load_model(model_in_path)

        # freeze the graph
        graphdef = tf.compat.v1.graph_util.convert_variables_to_constants(
            sess, graph.as_graph_def(), [tensor_out_name])
        graphdef = tf.compat.v1.graph_util.remove_training_nodes(graphdef)
        graph_io.write_graph(graphdef, './', model_out_path, as_text=False)

def frozen_pb_to_plan(model_path,
                      output_path,
                      tensor_in_name,
                      tensor_out_name,
                      input_size,
                      data_type=trt.float32,
                      max_batch_size=1,
                      max_workspace=1 << 30,
                      tensorboard_dir=None):
    # convert the TF frozen graph to a UFF model
    uff_model = uff.from_tensorflow_frozen_model(
        model_path, [tensor_out_name], output_filename="tmp/model.uff")

    # create the UFF parser and register the input/output tensors
    parser = trt.UffParser()
    parser.register_input(tensor_in_name, input_size)
    parser.register_output(tensor_out_name)

    # create the TensorRT logger and builder
    trt_logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(trt_logger)
    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = max_workspace
    builder.fp16_mode = (data_type == trt.float16)

    # parse the UFF model into a TensorRT network
    network = builder.create_network()
    parser.parse_buffer(uff_model, network)

    # build the optimized inference engine
    engine = builder.build_cuda_engine(network)

    # save the inference engine
    with open(output_path, "wb") as f:
        f.write(engine.serialize())
if __name__ == "__main__":
    model_path = "mobilenetv2.h5"
    keras_to_frozen_pb(model_path, 'tmp/frozen_model.pb', tensor_out_name="dense_3/Softmax")
    frozen_pb_to_plan('tmp/frozen_model.pb',
                      'tmp/model.engine',
                      'input_1',  # assumed name of the Keras model's input tensor
                      'dense_3/Softmax',
                      (3, 224, 224))
Steps to run inference on the model:
import tensorrt as trt
from PIL import Image
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from keras.applications.imagenet_utils import preprocess_input

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

def inference(model_path, img_path, input_shape):
    img = load_and_preproc_img(img_path, input_shape)

    with open(model_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    with engine.create_execution_context() as context:
        # allocate pagelocked host buffers and device buffers for both bindings
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        stream = cuda.Stream()

        # copy the preprocessed image to the device, run inference, copy the result back
        cuda.memcpy_htod_async(d_input, img, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

    return h_output

def load_and_preproc_img(img_path, input_shape):
    img = Image.open(img_path).convert('RGB')
    img = img.resize(input_shape[1:])
    img = np.array(img).astype('float32')
    # HWC -> CHW to match the (3, 224, 224) engine input
    img = np.transpose(img, (2, 0, 1))
    return np.ascontiguousarray(preprocess_imagenet(img))

def preprocess_imagenet(img):
    return preprocess_input(img, mode='tf')

if __name__ == "__main__":
    input_path = "tmp/model.engine"
    img_path = "dog.jpg"
    print(inference(input_path, img_path, (3, 224, 224)))