TensorRT Engine Creation with Resnet50: [TensorRT] ERROR: resources.cpp (199) - Cuda Error in gieCudaMalloc: 2

Hello,

I am trying to benchmark the performance of TensorRT (using the Python API) against Keras (TensorFlow and PlaidML backends) by running inference with the same ResNet50 model on each framework.

I’ve run into an issue where I cannot create a TensorRT engine with MAX_BATCHSIZE greater than 2 without getting the following error:

  • [TensorRT] ERROR: resources.cpp (199) - Cuda Error in gieCudaMalloc: 2

I’ve tried experimenting with different MAX_WORKSPACE values, to no avail. My code is attached below; it’s worth noting that this exact same code works when MAX_BATCHSIZE is set to 1 or 2.
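
For reference, CUDA error 2 is cudaErrorMemoryAllocation, so I have also been watching free device memory around the builder call while changing MAX_WORKSPACE. A minimal sketch of that check (the 1 << 30 workspace here is only an example, not a known fix; uff_model and parser are created exactly as in the full script that follows):

import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)

def build_engine_with_workspace(uff_model, parser, max_batch, workspace_bytes):
    # Print free/total device memory before and after the build so the
    # allocation failure can be correlated with what is actually available.
    print("free/total bytes before build:", cuda.mem_get_info())
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         max_batch, workspace_bytes)
    print("free/total bytes after build: ", cuda.mem_get_info())
    return engine

# example: try a 1 GiB workspace instead of the 1 MiB (1 << 20) used below
# engine = build_engine_with_workspace(uff_model, parser, 3, 1 << 30)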

#!/usr/bin/python
from __future__ import division
from __future__ import print_function

import os
import sys
from random import randint
import numpy as np
from numpy import array


try:
    from PIL import Image
    import pycuda.driver as cuda
    import pycuda.autoinit
    import argparse
except ImportError as err:
    sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
    exit(1)

try:
    import uff
except ImportError:
    raise ImportError("""Please install the UFF Toolkit""")

try:
    import tensorrt as trt
    from tensorrt.parsers import uffparser

except ImportError as err:
    sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
    exit(1)

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
MAX_WORKSPACE = 1<<20
MAX_BATCHSIZE = 3
batch_size = 3 
channel_order = "channels_first"

import time

from keras.models import load_model
import tensorflow as tf
import os.path as osp
from keras import backend as K
import keras.applications as kapp
from keras.datasets import cifar10
from keras.applications.resnet50 import preprocess_input, decode_predictions

from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_io
from tensorflow.tools.graph_transforms import TransformGraph


#Run inference on device
def tensorrt_infer(context, input_img, batch_size):
    
    # Create the engine runtime
    runtime = trt.infer.create_infer_runtime(G_LOGGER)

    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)

    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size

    input_img = input_img.astype(np.float32)
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    
    # synchronize threads
    stream.synchronize()
    # return predictions
    return output


def main():

    K.set_learning_phase(0)

    K.set_image_data_format(channel_order)
    
    sess = K.get_session()

    output_fld = '.'
    frozen_model_file = 'resnet50_frozen.pb'
    full_model_path = osp.join(output_fld, frozen_model_file)
    
    # Compile model
    model = kapp.ResNet50()
    print("Compile Keras Model")
    model.compile(optimizer='sgd', loss='categorical_crossentropy',
                  metrics=['accuracy'])


    num_output = 1
    pred = [None]*num_output
    pred_node_names = [None]*num_output
    for i in range(num_output):
        pred_node_names[i] = 'output_node'+str(i)
        pred[i] = tf.identity(model.outputs[i], name=pred_node_names[i])
    print('output nodes names are: ', pred_node_names)
    print(pred)

    # convert variables to constants and save
    constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), pred_node_names)
    constant_graph = graph_util.remove_training_nodes(constant_graph)
    for i, node in enumerate(constant_graph.node):
        print('%i %s: "%s"' % (i + 1, node.op, node.name))
    graph_io.write_graph(constant_graph, output_fld, frozen_model_file, as_text=False)
    print('saved the frozen graph (ready for inference) at: ', full_model_path)

    # convert to uff model from tensorflow frozen (.pb) model
    uff_model = uff.from_tensorflow_frozen_model(full_model_path, ["fc1000/Softmax"])

    # create uff parser
    parser = uffparser.create_uff_parser()
    parser.register_input("input_1", (3, 224, 224), 0) 
    parser.register_output("fc1000/Softmax")
    
    # convert uff to tensorrt engine
    print("Pre-engine memory: ", cuda.mem_get_info())
    engine = trt.utils.uff_to_trt_engine(G_LOGGER,
                                              uff_model,
                                              parser,
                                              MAX_BATCHSIZE,
                                              MAX_WORKSPACE)
    assert(engine)
    context = engine.create_execution_context()
    print("Post-engine memory: ", cuda.mem_get_info())
  
    # destroy the parser, don't need it anymore
    parser.destroy()

    # Load cifar10 data from keras and print parameters
    print(" ")
    print("This is the MAX_BATCHSIZE: " , MAX_BATCHSIZE)
    print("This is the MAX_WORKSPACE: " , MAX_WORKSPACE)
    print("The Channel ordering is: ", channel_order)
    print("This is the batch size: " , batch_size)
    print(" ")

    #load data
    (x_train, y_train_cats), (x_test, y_test_cats) = cifar10.load_data()
    x_train = x_train[:batch_size]
    if channel_order == 'channels_last':
        x_train = np.repeat(np.repeat(x_train, 7, axis=1), 7, axis=2)
    if channel_order == 'channels_first': 
        x_train = np.repeat(np.repeat(x_train, 7, axis=2), 7, axis=3)

    ######## Run TensorRT inference
    print(" ")
    print("[INFO] Start processing/timing for TensorRT inference... ")
    start_time_trt = time.time()
    tensorrt_output = tensorrt_infer(context, x_train, batch_size)
    end_time_trt = time.time()
    tensorrt_output = tensorrt_output.reshape(batch_size, 1000)
    for i in range(batch_size):
        print('Predicted:', decode_predictions(tensorrt_output, top=3)[i])
    print('\r[INFO] TensorRT inference Finished in %.2f seconds.' % (end_time_trt-start_time_trt))
    print(" ")

if __name__ == "__main__":
    main()

Any help/troubleshooting tips would be much appreciated - I am relatively new to TensorRT. Thanks!

I’m having a similar problem. Any progress on this error?

agupta2,

No progress on this error. That said, I was able to get around it by using the C++ API instead of the Python API; with the C++ API I could make MAX_BATCHSIZE much larger.
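
One thing I have not verified end to end, but which should let the Python benchmarking code stay as it is: serialize the engine that the C++ builder produces and load that file from Python. A rough sketch, where "resnet50.engine" is just a placeholder for the serialized file, using the same load_engine call that appears later in this thread:

import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)

# "resnet50.engine" is a placeholder for an engine serialized by the C++ builder
runtime = trt.infer.create_infer_runtime(G_LOGGER)
engine = trt.utils.load_engine(G_LOGGER, "resnet50.engine")
context = engine.create_execution_context()
# run inference with context.enqueue() exactly as in the script above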

@ks1, thanks for the quick update!
I’m running with a batch_size of 1, and it still shows the same error (in Python).

# test the built engine by running inference on the test images
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
from random import randint
from PIL import Image
from matplotlib.pyplot import imshow #to show test case
from tensorrt import parsers
from tensorflow.python.framework import graph_util

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
INPUT_H = 32
INPUT_W =  32
OUTPUT_SIZE = 10
DATA = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images/'
IMAGE_MEAN = '/home/d/Desktop/model_compare_caffe/svhn/trt/svhn_trt.binaryproto'
RESULT_FILE = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images.txt'
accuracy = 0

min_range = 0
max_range = 2600

f  = open(RESULT_FILE, "r")
fs = f.read()
words  = fs.split()
number = [int(w) for w in words]
print(len(number))

def inference_caffe(data):
	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	img = data.astype(np.float32)
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()
	cuda.memcpy_htod_async(d_input, img, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()
	return output	

# rand_file = randint(0,10)	
# path = DATA + str(rand_file) + '.png'
for current_image in range(min_range, max_range):
	# print(i)
	if(current_image%100 == 0):
		print('currently processing image...', current_image, ' | ', cuda.mem_get_info())

	path = DATA + str(current_image) + '.png'
	im = Image.open(path)
	imshow(np.asarray(im))
	arr = np.array(im)
	img = arr.ravel()

	parser = parsers.caffeparser.create_caffe_parser()
	mean_blob = parser.parse_binary_proto(IMAGE_MEAN)
	parser.destroy()
	#NOTE: This is different than the C++ API, you must provide the size of the data
	mean = mean_blob.get_data(INPUT_W ** 2)
	data = np.empty([INPUT_W ** 2])
	for i in range(INPUT_W ** 2):
	    data[i] = float(img[i]) - mean[i]
	mean_blob.destroy()

	output = inference_caffe(data)

	if(np.argmax(output) == number[current_image]):
		accuracy = accuracy+1;

print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('total correct predictions : ', accuracy)
print('in % : ', (accuracy/max_range)*100)
print('---------------------------------------------------------------------')

Is there anything odd you can point out?

It’s working now. I was not destroying the engine in the correct place; the correct function is:

def inference_caffe(data):
	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	img = data.astype(np.float32)
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	# print(type(d_input))
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()
	cuda.memcpy_htod_async(d_input, img, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()
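	# the fix: release the device allocations and destroy the TensorRT objects on every call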
	d_input.free()
	d_output.free()
	stream = None
	context.destroy()
	engine.destroy()
	runtime.destroy()
	return output
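
For what it’s worth, a further variation I would try (untested sketch, same API calls as the function above) is to load the engine and allocate the device buffers once, outside the per-image loop, and reuse them for every image, so nothing has to be created and destroyed per call:

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

def make_infer_fn(engine_path, input_size, output_size):
    # Build the engine, context, stream and device buffers once and return a
    # closure that only does the copies and the enqueue for each image.
    runtime = trt.infer.create_infer_runtime(G_LOGGER)  # kept for parity with the function above
    engine = trt.utils.load_engine(G_LOGGER, engine_path)
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)

    d_input = cuda.mem_alloc(input_size * np.dtype(np.float32).itemsize)
    output = np.empty(output_size, dtype=np.float32)
    d_output = cuda.mem_alloc(output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    def infer(data):
        img = data.astype(np.float32)
        cuda.memcpy_htod_async(d_input, img, stream)
        context.enqueue(1, bindings, stream.handle, None)
        cuda.memcpy_dtoh_async(output, d_output, stream)
        stream.synchronize()
        return output.copy()

    return infer

# usage with the constants and data from the loop above:
# infer = make_infer_fn("new_mnist.engine", INPUT_W ** 2, OUTPUT_SIZE)
# output = infer(data)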