TensorRT Engine Creation with Resnet50: [TensorRT] ERROR: resources.cpp (199) - Cuda Error in gieCudaMalloc: 2

Hello,

I am trying to benchmark the performance of TensorRT (using the Python API) against Keras (TensorFlow and PlaidML backends) by running inference with the same ResNet50 model on each framework.

I’ve run into an issue where I cannot create a TensorRT engine with MAX_BATCHSIZE greater than 2 without getting the following error:

  • [TensorRT] ERROR: resources.cpp (199) - Cuda Error in gieCudaMalloc: 2

I’ve tried experimenting with different MAX_WORKSPACE values, to no avail. My code is attached below; it’s worth noting that this exact same code works when MAX_BATCHSIZE is set to 1 or 2.
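
For reference, CUDA error 2 is cudaErrorMemoryAllocation, so I have also been watching free device memory around the builder call while changing MAX_WORKSPACE. A minimal sketch of that check (the 1 << 30 workspace here is only an example, not a known fix; uff_model and parser are created exactly as in the full script that follows):

import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)

def build_engine_with_workspace(uff_model, parser, max_batch, workspace_bytes):
    # Print free/total device memory before and after the build so the
    # allocation failure can be correlated with what is actually available.
    print("free/total bytes before build:", cuda.mem_get_info())
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         max_batch, workspace_bytes)
    print("free/total bytes after build: ", cuda.mem_get_info())
    return engine

# example: try a 1 GiB workspace instead of the 1 MiB (1 << 20) used below
# engine = build_engine_with_workspace(uff_model, parser, 3, 1 << 30)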

#!/usr/bin/python
from __future__ import division
from __future__ import print_function

import os
import sys
from random import randint
import numpy as np
from numpy import array


try:
    from PIL import Image
    import pycuda.driver as cuda
    import pycuda.autoinit
    import argparse
except ImportError as err:
    sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
    exit(1)

try:
    import uff
except ImportError:
    raise ImportError("""Please install the UFF Toolkit""")

try:
    import tensorrt as trt
    from tensorrt.parsers import uffparser

except ImportError as err:
    sys.stderr.write("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))
    exit(1)

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
MAX_WORKSPACE = 1<<20
MAX_BATCHSIZE = 3
batch_size = 3 
channel_order = "channels_first"

import time

from keras.models import load_model
import tensorflow as tf
import os.path as osp
from keras import backend as K
import keras.applications as kapp
from keras.datasets import cifar10
from keras.applications.resnet50 import preprocess_input, decode_predictions

from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_io
from tensorflow.tools.graph_transforms import TransformGraph


#Run inference on device
def tensorrt_infer(context, input_img, batch_size):
    
    # Create the engine runtime
    runtime = trt.infer.create_infer_runtime(G_LOGGER)

    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)

    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size

    input_img = input_img.astype(np.float32)
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    
    # synchronize threads
    stream.synchronize()
    # return predictions
    return output


def main():

    K.set_learning_phase(0)

    K.set_image_data_format(channel_order)
    
    sess = K.get_session()

    output_fld = '.'
    frozen_model_file = 'resnet50_frozen.pb'
    full_model_path = osp.join(output_fld, frozen_model_file)
    
    # Compile model
    model = kapp.ResNet50()
    print("Compile Keras Model")
    model.compile(optimizer='sgd', loss='categorical_crossentropy',
                  metrics=['accuracy'])


    num_output = 1
    pred = [None]*num_output
    pred_node_names = [None]*num_output
    for i in range(num_output):
        pred_node_names[i] = 'output_node'+str(i)
        pred[i] = tf.identity(model.outputs[i], name=pred_node_names[i])
    print('output nodes names are: ', pred_node_names)
    print(pred)

    # convert variables to constants and save
    constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), pred_node_names)
    constant_graph = graph_util.remove_training_nodes(constant_graph)
    for i, node in enumerate(constant_graph.node):
        print('%i %s: "%s"' % (i + 1, node.op, node.name))
    graph_io.write_graph(constant_graph, output_fld, frozen_model_file, as_text=False)
    print('saved the frozen graph (ready for inference) at: ', full_model_path)

    # convert to uff model from tensorflow frozen (.pb) model
    uff_model = uff.from_tensorflow_frozen_model(full_model_path, ["fc1000/Softmax"])

    # create uff parser
    parser = uffparser.create_uff_parser()
    parser.register_input("input_1", (3, 224, 224), 0) 
    parser.register_output("fc1000/Softmax")
    
    # convert uff to tensorrt engine
    print("Pre-engine memory: ", cuda.mem_get_info())
    engine = trt.utils.uff_to_trt_engine(G_LOGGER,
                                              uff_model,
                                              parser,
                                              MAX_BATCHSIZE,
                                              MAX_WORKSPACE)
    assert(engine)
    context = engine.create_execution_context()
    print("Post-engine memory: ", cuda.mem_get_info())
  
    # destroy the parser, don't need it anymore
    parser.destroy()

    # Load cifar10 data from keras and print parameters
    print(" ")
    print("This is the MAX_BATCHSIZE: " , MAX_BATCHSIZE)
    print("This is the MAX_WORKSPACE: " , MAX_WORKSPACE)
    print("The Channel ordering is: ", channel_order)
    print("This is the batch size: " , batch_size)
    print(" ")

    #load data
    (x_train, y_train_cats), (x_test, y_test_cats) = cifar10.load_data()
    x_train = x_train[:batch_size]
    if channel_order == 'channels_last':
        x_train = np.repeat(np.repeat(x_train, 7, axis=1), 7, axis=2)
    if channel_order == 'channels_first': 
        x_train = np.repeat(np.repeat(x_train, 7, axis=2), 7, axis=3)

    ######## Run TensorRT inference
    print(" ")
    print("[INFO] Start processing/timing for TensorRT inference... ")
    start_time_trt = time.time()
    tensorrt_output = tensorrt_infer(context, x_train, batch_size)
    end_time_trt = time.time()
    tensorrt_output = tensorrt_output.reshape(batch_size, 1000)
    for i in range(batch_size):
        print('Predicted:', decode_predictions(tensorrt_output, top=3)[i])
    print('\r[INFO] TensorRT inference Finished in %.2f seconds.' % (end_time_trt-start_time_trt))
    print(" ")

if __name__ == "__main__":
    main()

Any help/troubleshooting tips would be much appreciated - I am relatively new to TensorRT. Thanks!

I’m having a similar problem. Any progress on this error?

agupta2,

No progress on this error. That said, I was able to get around it by using the C++ API instead of the Python API; with the C++ API I could make MAX_BATCHSIZE much larger.
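
One thing I have not verified end to end, but which should let the Python benchmarking code stay as it is: serialize the engine that the C++ builder produces and load that file from Python. A rough sketch, where "resnet50.engine" is just a placeholder for the serialized file, using the same load_engine call that appears later in this thread:

import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)

# "resnet50.engine" is a placeholder for an engine serialized by the C++ builder
runtime = trt.infer.create_infer_runtime(G_LOGGER)
engine = trt.utils.load_engine(G_LOGGER, "resnet50.engine")
context = engine.create_execution_context()
# run inference with context.enqueue() exactly as in the script above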

@ks1, thanks for the quick update!
I’m running with a batch_size of 1, and it still shows the same error (in Python).

# test the built engine by running inference on the test images
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
from random import randint
from PIL import Image
from matplotlib.pyplot import imshow #to show test case
from tensorrt import parsers
from tensorflow.python.framework import graph_util

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
INPUT_H = 32
INPUT_W =  32
OUTPUT_SIZE = 10
DATA = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images/'
IMAGE_MEAN = '/home/d/Desktop/model_compare_caffe/svhn/trt/svhn_trt.binaryproto'
RESULT_FILE = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images.txt'
accuracy = 0

min_range = 0
max_range = 2600

f  = open(RESULT_FILE, "r")
fs = f.read()
words  = fs.split()
number = [int(w) for w in words]
print(len(number))

def inference_caffe(data):
	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	img = data.astype(np.float32)
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()
	cuda.memcpy_htod_async(d_input, img, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()
	return output	

# rand_file = randint(0,10)	
# path = DATA + str(rand_file) + '.png'
for current_image in range(min_range, max_range):
	# print(i)
	if(current_image%100 == 0):
		print('currently processing image...', current_image, ' | ', cuda.mem_get_info())

	path = DATA + str(current_image) + '.png'
	im = Image.open(path)
	imshow(np.asarray(im))
	arr = np.array(im)
	img = arr.ravel()

	parser = parsers.caffeparser.create_caffe_parser()
	mean_blob = parser.parse_binary_proto(IMAGE_MEAN)
	parser.destroy()
	#NOTE: This is different than the C++ API, you must provide the size of the data
	mean = mean_blob.get_data(INPUT_W ** 2)
	data = np.empty([INPUT_W ** 2])
	for i in range(INPUT_W ** 2):
	    data[i] = float(img[i]) - mean[i]
	mean_blob.destroy()

	output = inference_caffe(data)

	if(np.argmax(output) == number[current_image]):
		accuracy = accuracy+1;

print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('total correct predictions : ', accuracy)
print('in % : ', (accuracy/max_range)*100)
print('---------------------------------------------------------------------')

Is there anything odd you can point out?

It’s working now. I was not destroying the engine in the correct place; the correct function is:

def inference_caffe(data):
	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	img = data.astype(np.float32)
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	# print(type(d_input))
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()
	cuda.memcpy_htod_async(d_input, img, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()
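	# the fix: release the device allocations and destroy the TensorRT objects on every call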
	d_input.free()
	d_output.free()
	stream = None
	context.destroy()
	engine.destroy()
	runtime.destroy()
	return output
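
For what it’s worth, a further variation I would try (untested sketch, same API calls as the function above) is to load the engine and allocate the device buffers once, outside the per-image loop, and reuse them for every image, so nothing has to be created and destroyed per call:

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

def make_infer_fn(engine_path, input_size, output_size):
    # Build the engine, context, stream and device buffers once and return a
    # closure that only does the copies and the enqueue for each image.
    runtime = trt.infer.create_infer_runtime(G_LOGGER)  # kept for parity with the function above
    engine = trt.utils.load_engine(G_LOGGER, engine_path)
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)

    d_input = cuda.mem_alloc(input_size * np.dtype(np.float32).itemsize)
    output = np.empty(output_size, dtype=np.float32)
    d_output = cuda.mem_alloc(output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    def infer(data):
        img = data.astype(np.float32)
        cuda.memcpy_htod_async(d_input, img, stream)
        context.enqueue(1, bindings, stream.handle, None)
        cuda.memcpy_dtoh_async(output, d_output, stream)
        stream.synchronize()
        return output.copy()

    return infer

# usage with the constants and data from the loop above:
# infer = make_infer_fn("new_mnist.engine", INPUT_W ** 2, OUTPUT_SIZE)
# output = infer(data)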