TensorRT considerably slower than TFRT and pure TensorFlow

Hi all, I recently completed a project for Acute Lymphoblastic Leukemia detection on the Jetson Nano. The project included a pure TensorFlow model trained with Intel's oneAPI and Optimization for TensorFlow, as well as TFRT, ONNX, and TensorRT versions of the model.

With pure TensorFlow and TFRT the results were good on both the Intel NUC and the Jetson, although the Jetson was much slower in both cases. My main concern is that TensorRT on the Jetson performed considerably slower than both pure TensorFlow and TFRT.

The related code can be found here (reference), but I will also share it in this post. This is the first time I have worked with both TensorRT and the Jetson, so any advice would be greatly appreciated.



import cv2
import numpy as np
import os
import tensorrt as trt
import time

TRT_LOGGER = trt.Logger()
EXPLICIT_BATCH = [1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)]


class engine():
	""" TensorRT engine """

	def __init__(self, helpers):
		""" Initializes the TensorRT engine class. """

		self.helpers = helpers
		self.confs = helpers.confs
		self.onnx_model_path = self.confs["model"]["onnx"]
		self.tensorrt_model_path = self.confs["model"]["tensorrt"]
		self.testing_dir = self.confs["data"]["test"]
		self.valid = self.confs["data"]["valid_types"]
		self.labels = self.confs["data"]["labels"]

		if not os.path.isfile(self.tensorrt_model_path):
			trt_engine = self.build_engine()
			if trt_engine is None:
				raise RuntimeError("Failed to build the TensorRT engine.")
			self.save_engine(trt_engine)
			self.helpers.logger.info("TensorRT model generated.")

		self.helpers.logger.info("Engine class initialization complete.")

	def build_engine(self):
		""" Builds the TensorRT engine. """

		with trt.Builder(TRT_LOGGER) as builder, builder.create_network(*EXPLICIT_BATCH) \
				as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
			builder.max_workspace_size = 1 << 30
			builder.max_batch_size = 1
			builder.fp16_mode = False
			with open(self.onnx_model_path, 'rb') as model:
				if not parser.parse(model.read()):
					self.helpers.logger.info("ERROR: Failed to parse the ONNX file.")
					for error in range(parser.num_errors):
						self.helpers.logger.info(parser.get_error(error))
					return None
			# Force the input's batch dimension to a fixed size of 1.
			shape = list(network.get_input(0).shape)
			shape[0] = 1
			network.get_input(0).shape = shape
			engine = builder.build_cuda_engine(network)
			self.helpers.logger.info("Engine build complete.")
			return engine

	def save_engine(self, engine):
		""" Saves the TensorRT engine. """

		with open(self.tensorrt_model_path, 'wb') as f:
			f.write(engine.serialize())

		self.helpers.logger.info("Engine save complete.")

	def load_engine(self):
		""" Loads the TensorRT engine. """

		with open(self.tensorrt_model_path, 'rb') as f:
			engine_data = f.read()
		self.engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(engine_data)

		self.helpers.logger.info("Engine load complete.")

	def init_trt_buffers(self, cuda):
		""" Initialize host buffers and cuda buffers for the engine."""

		# Page-locked host buffers and device buffers sized for a single
		# 100x100x3 input and a 2-class output.
		size = trt.volume((1, 100, 100, 3)) * self.engine.max_batch_size
		host_input = cuda.pagelocked_empty(size, np.float32)
		cuda_input = cuda.mem_alloc(host_input.nbytes)
		size = trt.volume((1, 2)) * self.engine.max_batch_size
		host_output = cuda.pagelocked_empty(size, np.float32)
		cuda_output = cuda.mem_alloc(host_output.nbytes)
		self.helpers.logger.info("Engine buffers initialized.")
		return host_input, cuda_input, host_output, cuda_output

	def predict(self, img):
		""" Runs inference on an image with the TensorRT engine. """

		import pycuda.autoinit
		import pycuda.driver as cuda

		with open(self.tensorrt_model_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
			engine = runtime.deserialize_cuda_engine(f.read())

		host_input, cuda_input, host_output, cuda_output = self.init_trt_buffers(
			cuda)
		stream = cuda.Stream()

		context = self.engine.create_execution_context()
		context.set_binding_shape(0, (1, 100, 100, 3))

		np.copyto(host_input, img.ravel())
		cuda.memcpy_htod_async(cuda_input, host_input, stream)

		context.execute_async_v2(bindings=[int(cuda_input), int(cuda_output)],
									stream_handle=stream.handle)

		cuda.memcpy_dtoh_async(host_output, cuda_output, stream)
		stream.synchronize()

		return host_output

	def reshape(self, img):
		""" Reshapes and normalizes an image for inference. """

		dx, dy, dz = img.shape
		input_data = img.reshape((-1, dx, dy, dz))
		# Scale pixel values to the [0, 1] range.
		input_data = input_data / 255.0

		return input_data

	def test(self):
		"""TensorRT test mode
		Loops through the test directory and classifies the images
		using the TensorRT model.
		"""

		files = 0
		tp = 0
		fp = 0
		tn = 0
		fn = 0
		totaltime = 0

		for testFile in os.listdir(self.testing_dir):
			if os.path.splitext(testFile)[1] in self.valid:
				files += 1
				fileName = os.path.join(self.testing_dir, testFile)

				img = cv2.imread(fileName).astype(np.float32)
				self.helpers.logger.info("Loaded test image " + fileName)

				img = cv2.resize(img, (100, 100))
				img = self.reshape(img)

				start = time.time()
				predictions = self.predict(img)
				predictions = predictions.argsort()[::-1]
				prediction = self.labels[predictions[0]]
				end = time.time()
				benchmark = end - start
				totaltime += benchmark

				msg = ""
				if prediction == 1 and "_1." in testFile:
					tp += 1
					msg = "Acute Lymphoblastic Leukemia correctly detected (True Positive) in " + str(benchmark) + " seconds."
				elif prediction == 1 and "_0." in testFile:
					fp += 1
					msg = "Acute Lymphoblastic Leukemia incorrectly detected (False Positive) in " + str(benchmark) + " seconds."
				elif prediction == 0 and "_0." in testFile:
					tn += 1
					msg = "Acute Lymphoblastic Leukemia correctly not detected (True Negative) in " + str(benchmark) + " seconds."
				elif prediction == 0 and "_1." in testFile:
					fn += 1
					msg = "Acute Lymphoblastic Leukemia incorrectly not detected (False Negative) in " + str(benchmark) + " seconds."
				self.helpers.logger.info(msg)

		self.helpers.logger.info("Images Classified: " + str(files))
		self.helpers.logger.info("True Positives: " + str(tp))
		self.helpers.logger.info("False Positives: " + str(fp))
		self.helpers.logger.info("True Negatives: " + str(tn))
		self.helpers.logger.info("False Negatives: " + str(fn))
		self.helpers.logger.info("Total Time Taken: " + str(totaltime))

Why are you importing modules and reloading the engine every time in your predict function? You should move the imports to the top of the module, and the input/output/context initialization into the constructor:

		import pycuda.autoinit
		import pycuda.driver as cuda

		with open(self.tensorrt_model_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
			engine = runtime.deserialize_cuda_engine(f.read())

		host_input, cuda_input, host_output, cuda_output = self.init_trt_buffers(
			cuda)
		stream = cuda.Stream()

		context = self.engine.create_execution_context()
		context.set_binding_shape(0, (1, 100, 100, 3))
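As a rough sketch, the refactor could look something like this (untested, and assuming pycuda is available at module import time and that the rest of your class stays as posted):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

class engine():

	def __init__(self, helpers):
		# ... existing config and engine-generation code as posted ...
		# Deserialize the engine and build the context, buffers, and
		# stream exactly once, instead of on every predict call.
		self.load_engine()
		self.context = self.engine.create_execution_context()
		self.context.set_binding_shape(0, (1, 100, 100, 3))
		self.host_input, self.cuda_input, self.host_output, \
			self.cuda_output = self.init_trt_buffers(cuda)
		self.stream = cuda.Stream()

	def predict(self, img):
		""" Runs inference using the pre-built context and buffers. """

		np.copyto(self.host_input, img.ravel())
		cuda.memcpy_htod_async(self.cuda_input, self.host_input, self.stream)
		self.context.execute_async_v2(
			bindings=[int(self.cuda_input), int(self.cuda_output)],
			stream_handle=self.stream.handle)
		cuda.memcpy_dtoh_async(self.host_output, self.cuda_output, self.stream)
		self.stream.synchronize()
		return self.host_output

That way the per-call cost is just the copies and the kernel launch, which is what your benchmark should be measuring.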

In fact that is redundant code; I had already loaded the engine from the parent using the load_engine method. Can't believe I missed that, the wrong version of the file was uploaded.
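For completeness, with the redundant deserialization removed, predict should look something like this (the buffer and context setup can still be hoisted into the constructor as you suggested):

	def predict(self, img):
		""" Runs inference with the engine loaded by load_engine. """

		import pycuda.autoinit
		import pycuda.driver as cuda

		host_input, cuda_input, host_output, cuda_output = self.init_trt_buffers(cuda)
		stream = cuda.Stream()

		context = self.engine.create_execution_context()
		context.set_binding_shape(0, (1, 100, 100, 3))

		np.copyto(host_input, img.ravel())
		cuda.memcpy_htod_async(cuda_input, host_input, stream)
		context.execute_async_v2(bindings=[int(cuda_input), int(cuda_output)],
			stream_handle=stream.handle)
		cuda.memcpy_dtoh_async(host_output, cuda_output, stream)
		stream.synchronize()

		return host_output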