TensorRT wrong classification

Hello,

I'm trying to use TensorRT to do inference on my Jetson TX2.
I have a neural network that works perfectly when I run inference with TensorFlow alone.
This is the training script:

import numpy as np
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
import cv2
from PIL import Image
import glob
import sys
import math

train_batch_size =32
valid_batch_size=32
img_size = 100
epochs= 5
keep_probability=0.5

classes_names = []
dataset_file = glob.glob("dataset/asl_alphabet_train/*")

for f in dataset_file:
    classes_names.append(f[27:])  #  the name of the subfolders is the name of the class
num_classes = len(classes_names)

def parser(record):
    # a parsing function to parse the tfrecords
    keys_to_features = {
        "img_raw": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64)

    }
    parsed = tf.parse_single_example(record,keys_to_features)  # parsing one example from the example buffer from the tfrecord using the keys
    image = tf.decode_raw(parsed["img_raw"], tf.float32)  # decoding ( bytes -> tf.float32)
    image = tf.reshape(image, shape=[img_size, img_size, 3])  # reshaping images
    label = parsed["label"]  # the raw integer label
    label = tf.one_hot(indices=label, depth=num_classes) # transform to one hot encoding
    return image, label

def input_fn(filenames, batch_size, train_bool=True):
    # from tfrecord to iterable data
    dataset = tf.data.TFRecordDataset(filenames=filenames)#,num_parallel_reads=40)  # instantiation of a TFRecordDataset object
    dataset = dataset.map(parser)  # maps a function to the dataset
    if train_bool:
        dataset = dataset.shuffle(buffer_size=2048)
        repeat = 1  # one pass per epoch; the iterator is re-initialized every epoch
    else:
        repeat = 1  # a single pass for validation or test
    dataset = dataset.repeat(repeat)
    dataset = dataset.batch(batch_size)  # define batch size
    return dataset

def train_input_fn():
    return input_fn(filenames=["TFrecords/train.tfrecords"],batch_size=train_batch_size)

def val_input_fn():
    return input_fn(filenames=["TFrecords/val.tfrecords"],batch_size=valid_batch_size, train_bool=False)

# def test_input_fn():
#     return input_fn(filenames=["TFrecords/test.tfrecords"])

def conv_layer_max2pool(Input, num_output_channels, conv_filter_size, conv_strides, pool_filter_size, pool_strides, POOL=True):
    # a function to create convolutional layers, parameters are:
    #       num_output_channels: number of output filters
    #       conv_filter_size: size of the convolution filter, a 2-D tuple
    #       conv_strides: strides of the convolution; the stride over the height is assumed equal to the stride over the width
    #       pool_filter_size: like conv_filter_size but for the pooling filter
    #       pool_strides: like conv_strides but for the pooling

    filter_shape= [conv_filter_size[0], conv_filter_size[1], Input.get_shape().as_list()[3], num_output_channels] #creating the shape of the filter to create the weights of the convolution
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01)) #creating the weights
    conv= tf.nn.conv2d(Input, W, [1,conv_strides,conv_strides,1], padding="SAME") # creating the convolutional layer

    bias=tf.Variable(tf.zeros([num_output_channels])) # creating the biases

    conv=tf.nn.bias_add(conv, bias)
    conv=tf.nn.relu(conv)

    #max pooling
    if POOL :
        conv=tf.nn.max_pool(conv, [1, pool_filter_size[0], pool_filter_size[1], 1],[1, pool_strides, pool_strides, 1], padding="SAME")

    return conv

def model_fn(X, keep_prob):
    #X = tf.transpose(X, [0, 3, 1, 2])

    conv1=conv_layer_max2pool(X,num_output_channels=96, conv_filter_size=(11,11), conv_strides=4, pool_filter_size=(3,3), pool_strides=2)

    conv2=conv_layer_max2pool(conv1,num_output_channels=256, conv_filter_size=(5,5), conv_strides=1, pool_filter_size=(3,3), pool_strides=2)

    conv3=conv_layer_max2pool(conv2,num_output_channels=384, conv_filter_size=(3,3), conv_strides=1, pool_filter_size=(2,2), pool_strides=2, POOL=False)
    conv4=conv_layer_max2pool(conv3,num_output_channels=384, conv_filter_size=(3,3), conv_strides=1, pool_filter_size=(2,2), pool_strides=2, POOL=False)
    conv5=conv_layer_max2pool(conv4,num_output_channels=256, conv_filter_size=(3,3), conv_strides=1, pool_filter_size=(3,3), pool_strides=2)

    flat_layer= tf.layers.flatten(conv5)

    #FC layers

    dense1=tf.layers.dense(flat_layer, 4096, activation=tf.nn.relu)
    #dense1=tf.nn.dropout(dense1, keep_prob)

    dense2=tf.layers.dense(dense1, 4096, activation=tf.nn.relu)
    #dense2=tf.nn.dropout(dense2, keep_prob)

    return dense2

#Remove previous weights, bias, inputs...
tf.reset_default_graph()

# placeholders for features, labels and keep_prob

x=tf.placeholder(tf.float32, [None,  img_size, img_size, 3] , name='x')
y=tf.placeholder(tf.int64, [None, num_classes], name='y')
keep_prob=tf.placeholder(tf.float32, name='keep_prob')

# logits
logits= tf.layers.dense(model_fn(x,keep_prob), num_classes, name="logits")

#softmax
softmax=tf.identity(tf.nn.softmax_cross_entropy_with_logits( labels=y, logits=logits), name='softmax')

# loss=
loss=tf.reduce_mean(softmax)

#optimizer
optimizer=tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)

#Accuracy
pred=tf.equal(tf.argmax(logits,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(pred,tf.float32), name='accuracy')

train_dataset= train_input_fn()
train_iterator=train_dataset.make_initializable_iterator()
features, labels= train_iterator.get_next()

valid_dataset=val_input_fn()
valid_iterator=valid_dataset.make_initializable_iterator()
valid_features, valid_labels=valid_iterator.get_next()

save_model_path="Model/model0/model.ckpt"

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        sess.run(train_iterator.initializer)
        sess.run(valid_iterator.initializer)
        while True  :
            try:
                img_batch, label_batch= sess.run([features,labels])
                sess.run(optimizer, feed_dict={x: img_batch, y: label_batch, keep_prob: keep_probability})
            except tf.errors.OutOfRangeError:
                print('Epoch {:>2}: '.format(epoch + 1), end='')
                l = sess.run(loss, feed_dict={x: img_batch, y: label_batch, keep_prob: 1.0})
                break

        count=0
        valid_accuracy=0

        while True :
            try:
                valid_img_batch, valid_label_batch = sess.run([valid_features, valid_labels])
                valid_accuracy+=sess.run(accuracy, feed_dict={x: valid_img_batch, y:valid_label_batch, keep_prob:1.0})
            except  tf.errors.OutOfRangeError:
                break
            count += 1
        valid_accuracy=valid_accuracy/count
        print("The loss is : {0}, and the Validation Accuracy is: {1}".format(l, valid_accuracy))

    saver=tf.train.Saver()
    saver_path=saver.save(sess,save_model_path)
    graph_def=sess.graph.as_graph_def()	
    #for node in graph_def.node:
    #	print(node.name)
    tf.train.write_graph(graph_def,'./Model/model0/','graph.pbtxt',as_text=True)
    freeze_graph.freeze_graph('./Model/model0/graph.pbtxt', "", False,'./Model/model0/model.ckpt', "logits/BiasAdd",
                              "save/restore_all", "save/Const:0",
                              './Model/model0/frozen_model.pb', True, ""
                              )

I use a simple script to convert the frozen graph to UFF:

import os
import uff

FROZEN_GRAPH_FILENAME='Model/model0/frozen_model.pb'
OUTPUT_NAME='logits/BiasAdd'
OUTPUT_UFF_FILE_NAME='Inference_engine/UFF.uff'

def UFF_write():
	uff.from_tensorflow_frozen_model(
		frozen_file=FROZEN_GRAPH_FILENAME,
		output_nodes=[OUTPUT_NAME],
		output_filename=OUTPUT_UFF_FILE_NAME,
		text=False,
		)
if __name__ == '__main__' : 
	UFF_write()

Then a C++ program to transform the UFF into a plan (engine):

#include <iostream>
#include <string>
#include <sstream>
#include <fstream>
#include <NvInfer.h>
#include <NvUffParser.h>

using namespace std;
using namespace nvinfer1;
using namespace nvuffparser;

class Logger : public ILogger
{
	void log(Severity severity, const char * msg) override
	{
		cout << msg << endl;
	}
} gLogger;

int main()
{
	
	string uffFilename ="/home/nvidia/DL/ASL-sign-language-classifier-/Inference_engine/UFF.uff";
	string planFilename ="../data/plans/engine.plan";
	string inputName = "x";
	int inputHeight = 100;
	int inputWidth = 100;
	string outputName = "logits/BiasAdd";
	int maxBatchSize = 1;
	int maxWorkspaceSize= 0; 
	DataType dataType=DataType::kFLOAT;

	//parse uff
	IBuilder *builder = createInferBuilder(gLogger);
	INetworkDefinition *network = builder->createNetwork();
	IUffParser *parser = createUffParser();
	parser->registerInput(inputName.c_str(), DimsCHW(3, inputHeight, inputWidth), UffInputOrder::kNHWC);
	parser->registerOutput(outputName.c_str());
	if (!parser->parse(uffFilename.c_str(), *network, dataType))
	{
		cout << "Failed to parse UFF\n";
		builder->destroy();
		parser->destroy();
		network->destroy();
		return 1;
	}
	

	// build engine 
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(maxWorkspaceSize);
	ICudaEngine *engine = builder->buildCudaEngine(*network);

	// serialize engine and write to file
	ofstream planFile;
	planFile.open(planFilename);
	IHostMemory *serializedEngine = engine->serialize();
	planFile.write((char *)serializedEngine->data(), serializedEngine->size());
	planFile.close();
	
	builder->destroy();
	parser->destroy();
	network->destroy();
	engine->destroy();
	serializedEngine->destroy();
	
	return 0;
}

And finally the C++ code to run inference:

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <NvInfer.h>
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <opencv2/highgui/highgui.hpp>

using namespace std;
using namespace nvinfer1;

class Logger : public ILogger
{
  void log(Severity severity, const char * msg) override
  {
    if (severity != Severity::kINFO)
      cout << msg << endl;
  }
} gLogger;

void cvImageToTensor(const cv::Mat & image, float *tensor, nvinfer1::Dims dimensions)
{
  const size_t channels = dimensions.d[0];
  const size_t height = dimensions.d[1];
  const size_t width = dimensions.d[2];
  
  for( int i=0; i<height*width*channels; i++)
  {
   tensor[i]=image.data[i];
  }	
 
}
size_t numTensorElements(nvinfer1::Dims dimensions)
{
  if (dimensions.nbDims == 0)
    return 0;
  size_t size = 1;
  for (int i = 0; i < dimensions.nbDims; i++)
    size *= dimensions.d[i];
  return size;
}

size_t argmax(float *tensor, nvinfer1::Dims dimensions)
{ 
  size_t max_ind=0;
  size_t i=0;
  size_t numel=numTensorElements(dimensions);
  for(; i<numel; i++)
  {	//cout<<i<<endl;
	//cout<<*(tensor+i)<<endl;
	if( (*(tensor+i)) > (*(tensor+max_ind)) ) max_ind=i ;
  }
  return max_ind;
}

int main()
{

string imageFilename = "../dataset/B_test.jpg";
string planFilename="../data/plans/engine.plan";
string inputnodeName="x";
string outputnodeName="logits/BiasAdd";
string classes_names="../classes_names.txt";
//getting the classes names
vector<string> classes;
ifstream ReadFile;
ReadFile.open(classes_names);
string str;
if (ReadFile.is_open())
{
	while (getline(ReadFile, str))
	{
		classes.push_back(str);
	}
}
//for(int i=0; i<classes.size(); i++)
//{	
//	cout<<i<<endl;
//	cout<<classes[i]<<endl;
//}

//Load the engine 
cout<<"Loading The TensorRT engine from plan file"<<endl;
ifstream planFile(planFilename);
if(!planFile.is_open()) {cout<<"Could not open plan file."<<endl; return 1;}

stringstream planBuffer;
planBuffer << planFile.rdbuf();
string plan=planBuffer.str();

//Create a runtime object to deserialize inference engine
IRuntime* runtime=createInferRuntime(gLogger);
cout<<"ok"<<endl;
ICudaEngine* engine= runtime->deserializeCudaEngine((void*)plan.data(), plan.size(), nullptr);

// Create space to store intermediate activation values
IExecutionContext *context = engine->createExecutionContext();

//Get the input / output dimensions 
int inputBindingIndex, outputBindingIndex;
inputBindingIndex = engine->getBindingIndex(inputnodeName.c_str());
outputBindingIndex = engine->getBindingIndex(outputnodeName.c_str());
if(inputBindingIndex < 0) cout << "Invalid input name." << endl;
if(outputBindingIndex < 0) cout << "invalid output name." << endl;

Dims inputDims, outputDims;
inputDims = engine->getBindingDimensions(inputBindingIndex);
outputDims = engine->getBindingDimensions(outputBindingIndex);
int inputWidth, inputHeight;
inputHeight = inputDims.d[1];
inputWidth = inputDims.d[2];

//Read image convert color and resize
cout << "Preprocessing input ..." << endl;
cv::Mat image = cv::imread(imageFilename,1);
cv::namedWindow( "Display window", CV_WINDOW_AUTOSIZE );

if(image.data == NULL ) { cout << "Could not read image from file." << endl; return 1;}
//cv::cvtColor(image, image, cv::COLOR_BGR2RGB, 3);

cv::resize(image, image, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_CUBIC);
//image.convertTo(image, CV_32FC3);
cv::imshow("Display window", image);
cv::waitKey(0);

//Convert from uint8+NHWC to float+NCHW
float *inputDataHost, *outputDataHost;
size_t numInput, numOutput;
numInput = numTensorElements(inputDims);
numOutput = numTensorElements(outputDims);
inputDataHost = (float*) malloc(numInput * sizeof(float));
outputDataHost = (float*) malloc(numOutput * sizeof(float));
//*inputDataHost=image;
cvImageToTensor(image, inputDataHost, inputDims);

//Transfer to device
float *inputDataDevice, *outputDataDevice;
cudaMalloc(&inputDataDevice, numInput * sizeof(float));
cudaMalloc(&outputDataDevice, numOutput * sizeof(float));
cudaMemcpy(inputDataDevice, inputDataHost, numInput * sizeof(float), cudaMemcpyHostToDevice);
void *bindings[2];
bindings[inputBindingIndex] = (void*) inputDataDevice;
bindings[outputBindingIndex] = (void*) outputDataDevice;

//Execute engine
cout << "Executing inference engine ..." << endl;
const int kBatchSize = 1;
context->execute(kBatchSize, bindings);

//Transfer output back to host
cudaMemcpy(outputDataHost, outputDataDevice, numOutput * sizeof(float), cudaMemcpyDeviceToHost);

/* parse output */
//  vector<size_t> sortedIndices = argsort(outputDataHost, outputDims);
//cout << "\nThe top-5 indices are: ";
//  for (int i = 0; i < 5; i++)
//    cout << sortedIndices[i] << " ";

//Read Output
cout<<"The prediction  is :" << classes[argmax(outputDataHost,outputDims)] << endl; 
//clean up
runtime->destroy();
engine->destroy();
context->destroy();
free(inputDataHost);
free(outputDataHost);
cudaFree(inputDataDevice);
cudaFree(outputDataDevice);
return 0;
}

In the training script I feed images in NHWC format, so I chose the parameter UffInputOrder::kNHWC, but the result doesn't change even if I choose UffInputOrder::kNCHW.
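
To make the layout question concrete, here is a minimal NumPy sketch (illustrative only, not part of my scripts) of the difference between flattening the OpenCV image directly, as my cvImageToTensor loop does, and transposing it to CHW first:

import numpy as np

# illustrative 100x100 BGR image, as OpenCV would load it (height, width, channels)
hwc = np.random.randint(0, 256, size=(100, 100, 3)).astype(np.float32)

# flattening HWC directly: this is the element order my cvImageToTensor loop produces
nhwc_buffer = hwc.ravel()

# transposing to CHW before flattening: the order a channel-first (planar) input would expect
nchw_buffer = np.transpose(hwc, (2, 0, 1)).ravel()

# same values, different order
print(np.array_equal(nhwc_buffer, nchw_buffer))  # False in general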

I still can’t figure out why the results are wrong.
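
In case it helps to narrow things down, this is roughly how I sanity-check the frozen graph on the TensorFlow side before converting it to UFF (a minimal sketch; the tensor names x:0 and logits/BiasAdd:0 come from the training script above, the image path is just an example, and the preprocessing should match whatever was used when writing the TFRecords):

import numpy as np
import tensorflow as tf
import cv2

# load the frozen graph written by freeze_graph
with tf.gfile.GFile("Model/model0/frozen_model.pb", "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")

x = graph.get_tensor_by_name("x:0")
logits = graph.get_tensor_by_name("logits/BiasAdd:0")

# example image, resized to the network input size and cast to float32
img = cv2.imread("dataset/B_test.jpg")
img = cv2.resize(img, (100, 100)).astype(np.float32)

with tf.Session(graph=graph) as sess:
    out = sess.run(logits, feed_dict={x: img[np.newaxis, ...]})
    print("predicted class index:", np.argmax(out))
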
Can you help please?

Thank you

Can anyone help me with my code? It's my first time using TensorRT! Thank you

I solved the issue.

The problem was with tf.layers.flatten. I replaced it with tf.reshape to flatten the layer.
So basically I replaced:

flat_layer= tf.layers.flatten(conv5)

With

flat_layer = tf.reshape(conv5, shape=(-1, conv5.get_shape()[1]*conv5.get_shape()[2]*conv5.get_shape()[3]))
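
For reference, an equivalent way to write the same flatten with an explicitly computed size (just a sketch with a stand-in placeholder; in the real script the input is conv5 and its shape may differ):

import numpy as np
import tensorflow as tf

# stand-in for conv5: a feature map with static spatial dimensions
conv5 = tf.placeholder(tf.float32, [None, 4, 4, 256])

# flatten via an explicit reshape instead of tf.layers.flatten
flat_size = int(np.prod(conv5.get_shape().as_list()[1:]))  # 4 * 4 * 256
flat_layer = tf.reshape(conv5, shape=(-1, flat_size))

print(flat_layer.get_shape())  # (?, 4096)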

What gave me a clue was the warning I had ignored when converting the frozen graph to UFF:

DEBUG: convert reshape to flatten node

Good luck guys !

Hi,

I have a similar issue with wrong inference results, but the flatten/reshape change doesn't seem to resolve it for me.

While the original model in Keras performs well, the converted model in TensorRT gives inconsistent results.
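
For what it's worth, this is the kind of numeric check I do between the two outputs (a minimal sketch, assuming both logits vectors have been dumped to .npy files; the filenames are made up):

import numpy as np

keras_logits = np.load("keras_logits.npy")    # hypothetical dump from the Keras model
trt_logits = np.load("tensorrt_logits.npy")   # hypothetical dump from the TensorRT engine

print("max abs difference:", np.max(np.abs(keras_logits - trt_logits)))
print("same prediction:", np.argmax(keras_logits) == np.argmax(trt_logits))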