Hello,
I'm trying to use TensorRT to run inference on my Jetson TX2.
I have a neural network that works perfectly when I run inference with TensorFlow alone.
This is the training script:
import numpy as np
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
import cv2
from PIL import Image
import glob
import sys
import math
train_batch_size = 32
valid_batch_size = 32
img_size = 100
epochs = 5
keep_probability = 0.5
classes_names = []
dataset_file = glob.glob("dataset/asl_alphabet_train/*")
for f in dataset_file:
    classes_names.append(f[27:])  # the name of each subfolder is the name of its class
num_classes = len(classes_names)
def parser(record):
    # parsing function for the tfrecords
    keys_to_features = {
        "img_raw": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64)
    }
    parsed = tf.parse_single_example(record, keys_to_features)  # parse one example from the tfrecord using the keys
    image = tf.decode_raw(parsed["img_raw"], tf.float32)  # decode bytes -> tf.float32
    image = tf.reshape(image, shape=[img_size, img_size, 3])  # reshape the image
    label = parsed["label"]  # the label is stored as int64
    label = tf.one_hot(indices=label, depth=num_classes)  # transform to one-hot encoding
    return image, label
def input_fn(filenames, batch_size, train_bool=True):
    # from tfrecord to an iterable dataset
    dataset = tf.data.TFRecordDataset(filenames=filenames)  # ,num_parallel_reads=40
    dataset = dataset.map(parser)  # map the parsing function over the dataset
    if train_bool:
        dataset = dataset.shuffle(buffer_size=2048)
        repeat = 1  # one pass per epoch; the iterator is re-initialized at every epoch
    else:
        repeat = 1  # in validation or test, read the data at most once
    dataset = dataset.repeat(repeat)
    dataset = dataset.batch(batch_size)  # define the batch size
    return dataset
def train_input_fn():
    return input_fn(filenames=["TFrecords/train.tfrecords"], batch_size=train_batch_size)

def val_input_fn():
    return input_fn(filenames=["TFrecords/val.tfrecords"], batch_size=valid_batch_size, train_bool=False)

# def test_input_fn():
#     return input_fn(filenames=["TFrecords/test.tfrecords"])
def conv_layer_max2pool(Input, num_output_channels, conv_filter_size, conv_strides, pool_filter_size, pool_strides, POOL=True):
    # a function to create convolutional layers; the parameters are:
    # num_output_channels: number of output filters
    # conv_filter_size: size of the convolution filter, a 2-D tuple
    # conv_strides: stride of the convolution; the stride over the height is assumed equal to the stride over the width
    # pool_filter_size: same as conv_filter_size but for the pooling filter
    # pool_strides: same as conv_strides but for the pooling
    filter_shape = [conv_filter_size[0], conv_filter_size[1], Input.get_shape().as_list()[3], num_output_channels]  # shape used to create the convolution weights
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01))  # create the weights
    conv = tf.nn.conv2d(Input, W, [1, conv_strides, conv_strides, 1], padding="SAME")  # create the convolutional layer
    bias = tf.Variable(tf.zeros([num_output_channels]))  # create the biases
    conv = tf.nn.bias_add(conv, bias)
    conv = tf.nn.relu(conv)
    # max pooling
    if POOL:
        conv = tf.nn.max_pool(conv, [1, pool_filter_size[0], pool_filter_size[1], 1], [1, pool_strides, pool_strides, 1], padding="SAME")
    return conv
def model_fn(X, keep_prob):
    #X = tf.transpose(X, [0, 3, 1, 2])
    conv1 = conv_layer_max2pool(X, num_output_channels=96, conv_filter_size=(11, 11), conv_strides=4, pool_filter_size=(3, 3), pool_strides=2)
    conv2 = conv_layer_max2pool(conv1, num_output_channels=256, conv_filter_size=(5, 5), conv_strides=1, pool_filter_size=(3, 3), pool_strides=2)
    conv3 = conv_layer_max2pool(conv2, num_output_channels=384, conv_filter_size=(3, 3), conv_strides=1, pool_filter_size=(2, 2), pool_strides=2, POOL=False)
    conv4 = conv_layer_max2pool(conv3, num_output_channels=384, conv_filter_size=(3, 3), conv_strides=1, pool_filter_size=(2, 2), pool_strides=2, POOL=False)
    conv5 = conv_layer_max2pool(conv4, num_output_channels=256, conv_filter_size=(3, 3), conv_strides=1, pool_filter_size=(3, 3), pool_strides=2)
    flat_layer = tf.layers.flatten(conv5)
    # FC layers
    dense1 = tf.layers.dense(flat_layer, 4096, activation=tf.nn.relu)
    #dense1 = tf.nn.dropout(dense1, keep_prob)
    dense2 = tf.layers.dense(dense1, 4096, activation=tf.nn.relu)
    #dense2 = tf.nn.dropout(dense2, keep_prob)
    return dense2
# remove previous weights, biases, inputs...
tf.reset_default_graph()
# placeholders for the features, the labels and keep_prob
x = tf.placeholder(tf.float32, [None, img_size, img_size, 3], name='x')
y = tf.placeholder(tf.int64, [None, num_classes], name='y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
# logits
logits = tf.layers.dense(model_fn(x, keep_prob), num_classes, name="logits")
# softmax cross-entropy
softmax = tf.identity(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits), name='softmax')
# loss
loss = tf.reduce_mean(softmax)
# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
# accuracy
pred = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(pred, tf.float32), name='accuracy')
train_dataset = train_input_fn()
train_iterator = train_dataset.make_initializable_iterator()
features, labels = train_iterator.get_next()
valid_dataset = val_input_fn()
valid_iterator = valid_dataset.make_initializable_iterator()
valid_features, valid_labels = valid_iterator.get_next()
save_model_path = "Model/model0/model.ckpt"
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        sess.run(train_iterator.initializer)
        sess.run(valid_iterator.initializer)
        while True:
            try:
                img_batch, label_batch = sess.run([features, labels])
                sess.run(optimizer, feed_dict={x: img_batch, y: label_batch, keep_prob: keep_probability})
            except tf.errors.OutOfRangeError:
                print('Epoch {:>2}: '.format(epoch + 1), end='')
                l = sess.run(loss, feed_dict={x: img_batch, y: label_batch, keep_prob: 1.0})
                break
        count = 0
        valid_accuracy = 0
        while True:
            try:
                valid_img_batch, valid_label_batch = sess.run([valid_features, valid_labels])
                valid_accuracy += sess.run(accuracy, feed_dict={x: valid_img_batch, y: valid_label_batch, keep_prob: 1.0})
            except tf.errors.OutOfRangeError:
                break
            count += 1
        valid_accuracy = valid_accuracy / count
        print("The loss is : {0}, and the Validation Accuracy is: {1}".format(l, valid_accuracy))
    saver = tf.train.Saver()
    saver_path = saver.save(sess, save_model_path)
    graph_def = sess.graph.as_graph_def()
    #for node in graph_def.node:
    #    print(node.name)
    tf.train.write_graph(graph_def, './Model/model0/', 'graph.pbtxt', as_text=True)
    freeze_graph.freeze_graph('./Model/model0/graph.pbtxt', "", False, './Model/model0/model.ckpt', "logits/BiasAdd",
                              "save/restore_all", "save/Const:0",
                              './Model/model0/frozen_model.pb', True, "")
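For reference, a TensorFlow-only check of the frozen graph looks roughly like this (a minimal sketch: the test image path and the plain cast to float32 without any normalization are assumptions on my side; the node names and the frozen-model path are the ones from the script above):

import numpy as np
import tensorflow as tf
import cv2

img_size = 100

# load the frozen graph produced by freeze_graph above
with tf.gfile.GFile('Model/model0/frozen_model.pb', 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name='')

x = graph.get_tensor_by_name('x:0')
logits = graph.get_tensor_by_name('logits/BiasAdd:0')

# read a test image with OpenCV (BGR), resize and cast to float32
image = cv2.imread('dataset/B_test.jpg', 1)
image = cv2.resize(image, (img_size, img_size))
image = image.astype(np.float32)

with tf.Session(graph=graph) as sess:
    out = sess.run(logits, feed_dict={x: image[np.newaxis, ...]})
    print(np.argmax(out))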
I use a simple script to convert the frozen graph to UFF:
import os
import uff

FROZEN_GRAPH_FILENAME = 'Model/model0/frozen_model.pb'
OUTPUT_NAME = 'logits/BiasAdd'
OUTPUT_UFF_FILE_NAME = 'Inference_engine/UFF.uff'

def UFF_write():
    uff.from_tensorflow_frozen_model(
        frozen_file=FROZEN_GRAPH_FILENAME,
        output_nodes=[OUTPUT_NAME],
        output_filename=OUTPUT_UFF_FILE_NAME,
        text=False,
    )

if __name__ == '__main__':
    UFF_write()
Then a C++ program to transform the UFF into a plan (engine):
#include <iostream>
#include <string>
#include <sstream>
#include <fstream>
#include <NvInfer.h>
#include <NvUffParser.h>
using namespace std;
using namespace nvinfer1;
using namespace nvuffparser;
class Logger : public ILogger
{
    void log(Severity severity, const char *msg) override
    {
        cout << msg << endl;
    }
} gLogger;
int main()
{
    string uffFilename = "/home/nvidia/DL/ASL-sign-language-classifier-/Inference_engine/UFF.uff";
    string planFilename = "../data/plans/engine.plan";
    string inputName = "x";
    int inputHeight = 100;
    int inputWidth = 100;
    string outputName = "logits/BiasAdd";
    int maxBatchSize = 1;
    int maxWorkspaceSize = 0;
    DataType dataType = DataType::kFLOAT;

    // parse the UFF model
    IBuilder *builder = createInferBuilder(gLogger);
    INetworkDefinition *network = builder->createNetwork();
    IUffParser *parser = createUffParser();
    parser->registerInput(inputName.c_str(), DimsCHW(3, inputHeight, inputWidth), UffInputOrder::kNHWC);
    parser->registerOutput(outputName.c_str());
    if (!parser->parse(uffFilename.c_str(), *network, dataType))
    {
        cout << "Failed to parse UFF\n";
        builder->destroy();
        parser->destroy();
        network->destroy();
        return 1;
    }
    // build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(maxWorkspaceSize);
    ICudaEngine *engine = builder->buildCudaEngine(*network);
    if (!engine)
    {
        cout << "Failed to build engine\n";
        builder->destroy();
        parser->destroy();
        network->destroy();
        return 1;
    }

    // serialize the engine and write it to file
    ofstream planFile;
    planFile.open(planFilename);
    IHostMemory *serializedEngine = engine->serialize();
    planFile.write((char *)serializedEngine->data(), serializedEngine->size());
    planFile.close();

    builder->destroy();
    parser->destroy();
    network->destroy();
    engine->destroy();
    serializedEngine->destroy();
    return 0;
}
And finally the C++ code to run inference:
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <NvInfer.h>
#include <opencv2/opencv.hpp>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <opencv2/highgui/highgui.hpp>
using namespace std;
using namespace nvinfer1;
class Logger : public ILogger
{
    void log(Severity severity, const char *msg) override
    {
        if (severity != Severity::kINFO)
            cout << msg << endl;
    }
} gLogger;
// copy the image bytes one by one into the float tensor
// note: this keeps OpenCV's interleaved HWC / uint8 layout, each byte is only cast to float
void cvImageToTensor(const cv::Mat &image, float *tensor, nvinfer1::Dims dimensions)
{
    const size_t channels = dimensions.d[0];
    const size_t height = dimensions.d[1];
    const size_t width = dimensions.d[2];
    for (int i = 0; i < height * width * channels; i++)
    {
        tensor[i] = image.data[i];
    }
}
size_t numTensorElements(nvinfer1::Dims dimensions)
{
    if (dimensions.nbDims == 0)
        return 0;
    size_t size = 1;
    for (int i = 0; i < dimensions.nbDims; i++)
        size *= dimensions.d[i];
    return size;
}
// return the index of the largest element in the tensor
size_t argmax(float *tensor, nvinfer1::Dims dimensions)
{
    size_t max_ind = 0;
    size_t numel = numTensorElements(dimensions);
    for (size_t i = 0; i < numel; i++)
    {
        //cout << i << endl;
        //cout << *(tensor + i) << endl;
        if (*(tensor + i) > *(tensor + max_ind))
            max_ind = i;
    }
    return max_ind;
}
int main()
{
    string imageFilename = "../dataset/B_test.jpg";
    string planFilename = "../data/plans/engine.plan";
    string inputnodeName = "x";
    string outputnodeName = "logits/BiasAdd";
    string classes_names = "../classes_names.txt";

    // getting the classes names
    vector<string> classes;
    ifstream ReadFile;
    ReadFile.open(classes_names);
    string str;
    if (ReadFile.is_open())
    {
        while (!ReadFile.eof())
        {
            getline(ReadFile, str);
            classes.push_back(str);
        }
    }
    //for (int i = 0; i < classes.size(); i++)
    //{
    //    cout << i << endl;
    //    cout << classes[i] << endl;
    //}
    // load the engine
    cout << "Loading The TensorRT engine from plan file" << endl;
    ifstream planFile(planFilename);
    if (!planFile.is_open())
    {
        cout << "Could not open plan file." << endl;
        return 1;
    }
    stringstream planBuffer;
    planBuffer << planFile.rdbuf();
    string plan = planBuffer.str();

    // create a runtime object to deserialize the inference engine
    IRuntime *runtime = createInferRuntime(gLogger);
    cout << "ok" << endl;
    ICudaEngine *engine = runtime->deserializeCudaEngine((void *)plan.data(), plan.size(), nullptr);

    // create space to store intermediate activation values
    IExecutionContext *context = engine->createExecutionContext();

    // get the input / output binding indices and dimensions
    int inputBindingIndex, outputBindingIndex;
    inputBindingIndex = engine->getBindingIndex(inputnodeName.c_str());
    outputBindingIndex = engine->getBindingIndex(outputnodeName.c_str());
    if (inputBindingIndex < 0) cout << "Invalid input name." << endl;
    if (outputBindingIndex < 0) cout << "Invalid output name." << endl;
    Dims inputDims, outputDims;
    inputDims = engine->getBindingDimensions(inputBindingIndex);
    outputDims = engine->getBindingDimensions(outputBindingIndex);
    int inputWidth, inputHeight;
    inputHeight = inputDims.d[1];
    inputWidth = inputDims.d[2];
    // read the image, convert color and resize
    cout << "Preprocessing input ..." << endl;
    cv::Mat image = cv::imread(imageFilename, 1);
    cv::namedWindow("Display window", CV_WINDOW_AUTOSIZE);
    if (image.data == NULL)
    {
        cout << "Could not read image from file." << endl;
        return 1;
    }
    //cv::cvtColor(image, image, cv::COLOR_BGR2RGB, 3);
    cv::resize(image, image, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_CUBIC);
    //image.convertTo(image, CV_32FC3);
    cv::imshow("Display window", image);
    cv::waitKey(0);

    // convert from uint8 + NHWC to float + NCHW
    float *inputDataHost, *outputDataHost;
    size_t numInput, numOutput;
    numInput = numTensorElements(inputDims);
    numOutput = numTensorElements(outputDims);
    inputDataHost = (float *)malloc(numInput * sizeof(float));
    outputDataHost = (float *)malloc(numOutput * sizeof(float));
    //*inputDataHost = image;
    cvImageToTensor(image, inputDataHost, inputDims);
    // transfer the input to the device
    float *inputDataDevice, *outputDataDevice;
    cudaMalloc(&inputDataDevice, numInput * sizeof(float));
    cudaMalloc(&outputDataDevice, numOutput * sizeof(float));
    cudaMemcpy(inputDataDevice, inputDataHost, numInput * sizeof(float), cudaMemcpyHostToDevice);
    void *bindings[2];
    bindings[inputBindingIndex] = (void *)inputDataDevice;
    bindings[outputBindingIndex] = (void *)outputDataDevice;

    // execute the engine
    cout << "Executing inference engine ..." << endl;
    const int kBatchSize = 1;
    context->execute(kBatchSize, bindings);

    // transfer the output back to the host
    cudaMemcpy(outputDataHost, outputDataDevice, numOutput * sizeof(float), cudaMemcpyDeviceToHost);

    /* parse output */
    //vector<size_t> sortedIndices = argsort(outputDataHost, outputDims);
    //cout << "\nThe top-5 indices are: ";
    //for (int i = 0; i < 5; i++)
    //    cout << sortedIndices[i] << " ";

    // read the output
    cout << "The prediction is :" << classes[argmax(outputDataHost, outputDims)] << endl;
    // clean up
    context->destroy();
    engine->destroy();
    runtime->destroy();
    free(inputDataHost);
    free(outputDataHost);
    cudaFree(inputDataDevice);
    cudaFree(outputDataDevice);
    return 0;
}
In the training script I feed images in NHWC format, so I chose the parameter UffInputOrder::kNHWC, but the result doesn't change even if I choose UffInputOrder::kNCHW.
I still can’t figure out why the results are wrong.
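For what it's worth, below is a minimal sketch of how I understand the input buffer would have to be filled if the engine expects planar CHW data (the helper name is hypothetical, and it assumes OpenCV's BGR channel order with no normalization, which may or may not match my training preprocessing; my current cvImageToTensor simply copies the interleaved HWC bytes):

// hypothetical helper: fill a planar CHW float tensor from an OpenCV HWC (BGR, uint8) image
// assumes no mean subtraction or scaling
void cvImageToTensorCHW(const cv::Mat &image, float *tensor, nvinfer1::Dims dims)
{
    const int channels = dims.d[0];
    const int height = dims.d[1];
    const int width = dims.d[2];
    for (int c = 0; c < channels; c++)
        for (int h = 0; h < height; h++)
            for (int w = 0; w < width; w++)
                tensor[c * height * width + h * width + w] =
                    static_cast<float>(image.at<cv::Vec3b>(h, w)[c]);
}

I have not confirmed whether this is what the UFF parser expects when the input is registered with UffInputOrder::kNHWC.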
Can you help, please?
Thank you