Cuda error when running TensorRT 3 on the complete test set

I’m running a saved engine using the Python wrapper of TensorRT. I have ~26000 test images; inference runs correctly on 368 images, but throws the error below when I increase the number of images from 368 to 369 (ideally it should work for all 26000 images):

/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
[TensorRT] ERROR: cudnnEngine.cpp (56) - Cuda Error in initializeCommonContext: 4
terminate called after throwing an instance of 'nvinfer1::CudaError'
  what():  std::exception
Aborted (core dumped)

I am passing one image at a time in a for loop, so my guess is that it has something to do with memory. Here is the Python code I am using:

# test the built engine on the test images by running inference
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
from random import randint
from PIL import Image
from matplotlib.pyplot import imshow #to show test case
from tensorrt import parsers

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
INPUT_H = 32
INPUT_W =  32
OUTPUT_SIZE = 10
DATA = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images/'
IMAGE_MEAN = '/home/d/Desktop/model_compare_caffe/svhn/trt/svhn_trt.binaryproto'
RESULT_FILE = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images.txt'
accuracy = 0

max_range = 380
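# runs fine at 368 images; the Cuda error appears once this reaches 369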

f  = open(RESULT_FILE, "r")
fs = f.read()
words  = fs.split()
number = [int(w) for w in words]
# for i in range(0, max_range):
	# print(number[i])

# rand_file = randint(0,10)	
# path = DATA + str(rand_file) + '.png'
for current_image in range(0, max_range):
	# print(i)
	path = DATA + str(current_image) + '.png'
	im = Image.open(path)
	imshow(np.asarray(im))
	arr = np.array(im)
	img = arr.ravel()
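	# NOTE: everything below (parser, mean blob, runtime, engine, context, device buffers) is re-created on every iteration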

	parser = parsers.caffeparser.create_caffe_parser()
	mean_blob = parser.parse_binary_proto(IMAGE_MEAN)
	parser.destroy()
	# NOTE: unlike the C++ API, you must provide the size of the data here
	mean = mean_blob.get_data(INPUT_W ** 2)
	data = np.empty([INPUT_W ** 2])
	for i in range(INPUT_W ** 2):
	    data[i] = float(img[i]) - mean[i]
	mean_blob.destroy()

	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	#convert input data to Float32
	img = img.astype(np.float32)
	#create output array to receive data
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()

	#transfer input data to device
	cuda.memcpy_htod_async(d_input, img, stream)
	#execute model
	context.enqueue(1, bindings, stream.handle, None)
	#transfer predictions back
	cuda.memcpy_dtoh_async(output, d_output, stream)
	# synchronize the stream
	stream.synchronize()

	if(np.argmax(output) == number[current_image]):
		# print('Success')
		accuracy = accuracy + 1
	# else:
		# print('Failure')

	# print(number[current_image], " -> ", str(np.argmax(output)))
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('total correct solutions : ', accuracy)
print('in % : ', (accuracy/max_range)*100)
print('---------------------------------------------------------------------')

#print("runtime details : ", type(runtime))
context.destroy()
engine.destroy()
runtime.destroy()

Any suggestions?

Thanks!

Ideally the garbage collector should deallocate the memory and handle everything, but when allocations happen this quickly it may not keep up. In that case, explicitly calling d_input.free() and d_output.free() is one safety measure. I was also not destroying the engine in the correct place. The working code looks like this:

# test the built engine on the test images by running inference
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
from random import randint
from PIL import Image
from matplotlib.pyplot import imshow #to show test case
from tensorrt import parsers

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
INPUT_H = 32
INPUT_W =  32
OUTPUT_SIZE = 10
DATA = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images/'
IMAGE_MEAN = '/home/d/Desktop/model_compare_caffe/svhn/trt/svhn_trt.binaryproto'
RESULT_FILE = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images.txt'
accuracy = 0

# def infer_caffe():


min_range = 0
max_range = 26000

f  = open(RESULT_FILE, "r")
fs = f.read()
words  = fs.split()
number = [int(w) for w in words]
print(len(number))

def inference_caffe(data):
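	# deserialize the saved engine and create an execution context for this single call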
	runtime = trt.infer.create_infer_runtime(G_LOGGER)
	engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
	context = engine.create_execution_context()

	assert(engine.get_nb_bindings() == 2)
	img = data.astype(np.float32)
	output = np.empty(OUTPUT_SIZE, dtype = np.float32)

	d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
	# print(type(d_input))
	d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
	bindings = [int(d_input), int(d_output)]
	stream = cuda.Stream()
	cuda.memcpy_htod_async(d_input, img, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()
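	# free device memory and destroy the TensorRT objects explicitly instead of relying on the garbage collector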
	d_input.free()
	d_output.free()
	stream = None
	context.destroy()
	engine.destroy()
	runtime.destroy()
	return output	

# rand_file = randint(0,10)	
# path = DATA + str(rand_file) + '.png'
for current_image in range(min_range, max_range):
	# print(i)
	if(current_image%100 == 0):
		print('currently processing image...', current_image, ' | ', cuda.mem_get_info())

	path = DATA + str(current_image) + '.png'
	im = Image.open(path)
	imshow(np.asarray(im))
	arr = np.array(im)
	img = arr.ravel()

	parser = parsers.caffeparser.create_caffe_parser()
	mean_blob = parser.parse_binary_proto(IMAGE_MEAN)
	parser.destroy()
	# NOTE: unlike the C++ API, you must provide the size of the data here
	mean = mean_blob.get_data(INPUT_W ** 2)
	data = np.empty([INPUT_W ** 2])
	for i in range(INPUT_W ** 2):
	    data[i] = float(img[i]) - mean[i]
	mean_blob.destroy()

	output = inference_caffe(data)

	if(np.argmax(output) == number[current_image]):
		accuracy = accuracy + 1



#print("runtime details : ", type(runtime))
context.destroy()
engine.destroy()
runtime.destroy()

print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('total correct solutions : ', accuracy)
print('in % : ', (accuracy/max_range)*100)
print('---------------------------------------------------------------------')
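
For completeness, the same idea can be taken a step further. Below is a minimal, untested sketch (reusing the same new_mnist.engine, paths, and label file as above) that parses the mean blob, deserializes the engine, and allocates the two device buffers once before the loop, so nothing needs to be allocated or freed per image:

# A minimal, untested sketch: create the mean data, engine, context, and
# device buffers once, reuse them for every image, and free them once at the end.
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
from PIL import Image
from tensorrt import parsers

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
INPUT_W = 32
OUTPUT_SIZE = 10
DATA = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images/'
IMAGE_MEAN = '/home/d/Desktop/model_compare_caffe/svhn/trt/svhn_trt.binaryproto'
RESULT_FILE = '/home/d/Desktop/model_compare_caffe/svhn/svhn_test_images.txt'
min_range, max_range = 0, 26000

with open(RESULT_FILE) as f:
	number = [int(w) for w in f.read().split()]

# parse the mean image once instead of once per test image
parser = parsers.caffeparser.create_caffe_parser()
mean_blob = parser.parse_binary_proto(IMAGE_MEAN)
parser.destroy()
mean = np.asarray(mean_blob.get_data(INPUT_W ** 2), dtype=np.float32)
mean_blob.destroy()

# deserialize the engine and create a single execution context for the whole run
runtime = trt.infer.create_infer_runtime(G_LOGGER)
engine = trt.utils.load_engine(G_LOGGER, "new_mnist.engine")
context = engine.create_execution_context()
assert engine.get_nb_bindings() == 2

# allocate the two device buffers once and reuse them for every image
output = np.empty(OUTPUT_SIZE, dtype=np.float32)
d_input = cuda.mem_alloc(INPUT_W ** 2 * np.dtype(np.float32).itemsize)
d_output = cuda.mem_alloc(output.size * output.dtype.itemsize)
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()

accuracy = 0
for current_image in range(min_range, max_range):
	img = np.array(Image.open(DATA + str(current_image) + '.png')).ravel()
	# same mean subtraction as above, just vectorized
	data = img[:INPUT_W ** 2].astype(np.float32) - mean

	cuda.memcpy_htod_async(d_input, data, stream)
	context.enqueue(1, bindings, stream.handle, None)
	cuda.memcpy_dtoh_async(output, d_output, stream)
	stream.synchronize()

	if np.argmax(output) == number[current_image]:
		accuracy += 1

# tear everything down once, after the whole test set
d_input.free()
d_output.free()
context.destroy()
engine.destroy()
runtime.destroy()

print('total correct solutions : ', accuracy)
print('in % : ', (accuracy / max_range) * 100)

With a single engine and a single pair of device buffers, GPU memory usage stays flat regardless of how many images are processed, instead of depending on how quickly the garbage collector reclaims each iteration's allocations.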


Have you tested with TensorRT 4.0 RC to see if your issue has been fixed?