Provide details on the platforms you are using:
Linux distro and version: Ubuntu 18.04
GPU type: Jetson TX2
CUDA version: 10.0
CUDNN version: 7.3.1
Python version: 3.6
Tensorflow version: 1.13.1
TensorRT version: 5.0.6
Jetson details: Jetson TX2 developer kit running Ubuntu 18.04 (as above)
Describe the problem
I am trying to perform image classification inference by building a TensorRT runtime engine from a frozen TensorFlow graph (frozen_model.pb). I was previously able to run inference successfully on this frozen graph using tensorflow.contrib.tensorrt (TF-TRT). With the native TensorRT 5 Python API the code also runs without errors, but the results are nonsensical. Inference produces an array of 1000 floating-point numbers, which I assume are the scores for each classification category. I then pass them to Keras's decode_predictions function to look up the category with the highest score, but the predicted category is wrong. What is the most likely cause of this problem, and how can I go about solving it?
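For reference, this is roughly how I ran the TF-TRT (tensorflow.contrib.tensorrt) path that gave correct results; the exact paths, image file, and session setup below are reconstructed from memory, so treat it as a sketch rather than the exact script:

import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
from PIL import Image
from tensorflow.keras.applications.mobilenet_v2 import decode_predictions

# Load the same frozen graph used below
with tf.gfile.GFile("/home/nvidia/image_classification/frozen_model.pb", "rb") as f:
    frozen_graph = tf.GraphDef()
    frozen_graph.ParseFromString(f.read())

# Let TF-TRT optimize the frozen graph
trt_graph = trt.create_inference_graph(
    input_graph_def=frozen_graph,
    outputs=["Logits/Softmax"],
    max_batch_size=1,
    max_workspace_size_bytes=1 << 25,
    precision_mode="FP32")

# Run the optimized graph in a normal TF session (input is NHWC here)
img = np.asarray(Image.open("test_image.jpg").resize((224, 224))).astype(np.float32)  # placeholder image path
batch = np.expand_dims(img, axis=0)  # shape (1, 224, 224, 3)
with tf.Graph().as_default() as g:
    tf.import_graph_def(trt_graph, name="")
    input_t = g.get_tensor_by_name("input_1:0")
    output_t = g.get_tensor_by_name("Logits/Softmax:0")
    with tf.Session(graph=g) as sess:
        scores = sess.run(output_t, feed_dict={input_t: batch})
print(decode_predictions(scores, top=3)[0])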
Files
Here are the functions I used to build the engine (storing it in a global variable) and then run inference:
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt
import uff
import numpy as np
from PIL import Image
from tensorflow.keras.applications.mobilenet_v2 import decode_predictions
def build_engine(model_path):
    global NVIDIA_TRT_ENGINE
    # model_path = "/home/nvidia/image_classification/frozen_model.pb"
    model_outputs = ["Logits/Softmax"]
    model_input = "input_1"
    uff_filename = "model.uff"

    # Convert the frozen TensorFlow model to a UFF model
    TRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
    uff_model = uff.from_tensorflow_frozen_model(model_path, model_outputs, output_filename=uff_filename)

    # Create an empty network
    builder = tensorrt.Builder(TRT_LOGGER)
    network = builder.create_network()

    # Create a UFF parser to parse the UFF file and populate the network
    parser = tensorrt.UffParser()
    parser.register_input(model_input, (3, 224, 224))
    parser.register_output(*model_outputs)
    parser.parse(uff_filename, network)

    # Build the TensorRT inference engine from the network definition
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 20
    builder.fp16_mode = False
    NVIDIA_TRT_ENGINE = builder.build_cuda_engine(network)
def predict_with_nvidia_trt_engine(img_path):
    def image_to_np_CHW(image):
        # Resize to 224x224 and convert HWC -> CHW float32
        return np.asarray(image.resize((224, 224), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(np.float32)

    global NVIDIA_TRT_ENGINE
    image_trt = image_to_np_CHW(Image.open(img_path))
    image_trt = np.ascontiguousarray(image_trt, dtype=np.float32)

    with NVIDIA_TRT_ENGINE as engine:
        # Host buffers: the input image array and an empty array sized to the output binding
        h_input = image_trt
        h_output = np.empty(tensorrt.volume(NVIDIA_TRT_ENGINE.get_binding_shape(1)), dtype=np.float32)
        # Allocate device memory for inputs and outputs
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference
        stream = cuda.Stream()
        # Create an execution context
        with engine.create_execution_context() as context:
            # Transfer input data to the GPU (h_input -> d_input)
            cuda.memcpy_htod_async(d_input, h_input, stream)
            # Run inference
            context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
            # Transfer predictions back from the GPU (d_output -> h_output)
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Synchronize the stream
            stream.synchronize()
            print("Predict: ", np.argmax(h_output), ", ", np.amax(h_output))
            print("Predict: ", decode_predictions(np.asarray(h_output).reshape(1, 1000), top=3)[0])
            return h_output
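For completeness, this is how I call the two functions; the image path is just a placeholder:

build_engine("/home/nvidia/image_classification/frozen_model.pb")
scores = predict_with_nvidia_trt_engine("test_image.jpg")  # placeholder image path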