Hello,
I am trying to benchmark performance of TensorRT (using python API) vs Keras (TensorFlow & PlaidML backends) by running inference of the same Resnet50 model on each framework.
I’ve run in to an issue where I cannot create a TensorRT engine of MAX_BATCHSIZE greater than 2 without getting the following error:
- [TensorRT] ERROR: resources.cpp (199) - Cuda Error in gieCudaMalloc: 2
I’ve tried experimenting with different MAX_WORKSPACE values to no avail. I’ve attached my code below. It’s worth noting that this exact same code works if the MAX_BATCHSIZE is set to either 2 or 1.
#!/usr/bin/python
from __future__ import division
from __future__ import print_function
import os
import sys
from random import randint
import numpy as np
from numpy import array
try:
from PIL import Image
import pycuda.driver as cuda
import pycuda.autoinit
import argparse
except ImportError as err:
sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
exit(1)
try:
import uff
except ImportError:
raise ImportError("""Please install the UFF Toolkit""")
try:
import tensorrt as trt
from tensorrt.parsers import uffparser
except ImportError as err:
sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
exit(1)
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
MAX_WORKSPACE = 1<<20
MAX_BATCHSIZE = 3
batch_size = 3
channel_order = "channels_first"
import time
from keras.models import load_model
import tensorflow as tf
import os.path as osp
from keras import backend as K
import keras.applications as kapp
from keras.datasets import cifar10
from keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_io
from tensorflow.tools.graph_transforms import TransformGraph
#Run inference on device
def tensorrt_infer(context, input_img, batch_size):
# Create the engine runtime
runtime = trt.infer.create_infer_runtime(G_LOGGER)
# load engine
engine = context.get_engine()
assert(engine.get_nb_bindings() == 2)
# create output array to receive data
dims = engine.get_binding_dimensions(1).to_DimsCHW()
elt_count = dims.C() * dims.H() * dims.W() * batch_size
input_img = input_img.astype(np.float32)
# Allocate pagelocked memory
output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
# alocate device memory
d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()
# transfer input data to device
cuda.memcpy_htod_async(d_input, input_img, stream)
# execute model
context.enqueue(batch_size, bindings, stream.handle, None)
# transfer predictions back
cuda.memcpy_dtoh_async(output, d_output, stream)
# synchronize threads
stream.synchronize()
# return predictions
return output
def main():
K.set_learning_phase(0)
K.set_image_data_format(channel_order)
sess = K.get_session()
output_fld = '.'
frozen_model_file = 'resnet50_frozen.pb'
full_model_path = osp.join(output_fld, frozen_model_file)
# Compile model
model = kapp.ResNet50()
print("Compile Keras Model")
model.compile(optimizer='sgd', loss='categorical_crossentropy',
metrics=['accuracy'])
num_output = 1
pred = [None]*num_output
pred_node_names = [None]*num_output
for i in range(num_output):
pred_node_names[i] = 'output_node'+str(i)
pred[i] = tf.identity(model.outputs[i], name=pred_node_names[i])
print('output nodes names are: ', pred_node_names)
print(pred)
# convert variables to constants and save
constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), pred_node_names)
constant_graph = graph_util.remove_training_nodes(constant_graph)
for i, node in enumerate(constant_graph.node):
print('%i %s: "%s"' % (i + 1, node.op, node.name))
graph_io.write_graph(constant_graph, output_fld, frozen_model_file, as_text=False)
print('saved the freezed graph (ready for inference) at: ', full_model_path)
# convert to uff model from tensorflow frozen (.pb) model
uff_model = uff.from_tensorflow_frozen_model(full_model_path, ["fc1000/Softmax"])
# create uff parser
parser = uffparser.create_uff_parser()
parser.register_input("input_1", (3, 224, 224), 0)
parser.register_output("fc1000/Softmax")
# convert uff to tensorrt engine
print("Pre-engine memory: ", cuda.mem_get_info())
engine = trt.utils.uff_to_trt_engine(G_LOGGER,
uff_model,
parser,
MAX_BATCHSIZE,
MAX_WORKSPACE)
assert(engine)
context = engine.create_execution_context()
print("Post-engine memory: ", cuda.mem_get_info())
# destroy the parser, don't need it anymore
parser.destroy()
# Load cifar10 data from keras and print parameters
print(" ")
print("This is the MAX_BATCHSIZE: " , MAX_BATCHSIZE)
print("This is the MAX_WORKSPACE: " , MAX_WORKSPACE)
print("The Channel ordering is: ", channel_order)
print("This is the batch size: " , batch_size)
print(" ")
#load data
(x_train, y_train_cats), (x_test, y_test_cats) = cifar10.load_data()
x_train = x_train[:batch_size]
if channel_order == 'channels_last':
x_train = np.repeat(np.repeat(x_train, 7, axis=1), 7, axis=2)
if channel_order == 'channels_first':
x_train = np.repeat(np.repeat(x_train, 7, axis=2), 7, axis=3)
######## Run TensorRT inference
print(" ")
print("[INFO] Start processing/timing for TensorRT inference... ")
start_time_trt = time.time()
tensorrt_output = tensorrt_infer(context, x_train, batch_size)
end_time_trt = time.time()
tensorrt_output = tensorrt_output.reshape(batch_size, 1000)
for i in range(batch_size):
print('Predicted:', decode_predictions(tensorrt_output, top=3)[i])
print('\r[INFO] TensorRT inference Finished in %.2f seconds.' % (end_time_trt-start_time_trt))
print(" ")
if __name__ == "__main__":
main()
Any help/troubleshooting tips would be much appreciated - I am relatively new to TensorRT. Thanks!