Hi all:
I am trying to run Tiny YOLO v2 with TensorRT optimization. I am giving the input image in BGR format with values in the range [0, 1]. I have approximated leaky ReLU with a ReLU + Scale + Eltwise combination. I am taking the output at the second-to-last layer, which is a convolution layer whose output is a 12x12x125 tensor. I have implemented the final detection layer separately in Python.
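For reference, the ReLU + Power + Eltwise(SUM) pattern I use in place of leaky ReLU (it appears once per convolution block in the prototxt below) computes the following per element; a minimal sketch of the equivalent function (the function name is just for illustration):

// Per-element function implemented by the "reluN" + "scaleN" (Power, scale=0.08)
// + "eltwiseN" (SUM) triplet in the prototxt: max(x, 0) + 0.08 * x.
// For x > 0 this gives 1.08 * x, for x <= 0 it gives 0.08 * x.
float approxLeakyRelu(float x)
{
    float relu   = x > 0.0f ? x : 0.0f; // ReLU layer
    float scaled = 0.08f * x;           // Power layer with scale 0.08
    return relu + scaled;               // Eltwise SUM layer
}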
Everything works fine with Caffe, but TensorRT is not giving correct output, or maybe I am interpreting the output wrongly.
Because I take the output at the second-to-last layer, TensorRT gives me a linearized NCHW array of size 1x125x12x12 = 18000.
I take this output from TensorRT, reshape it to 125x12x12, and pass it to my Python detection layer. This does not give correct results, whereas the same detection code on the Caffe output works correctly.
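In case I am reading the buffer wrongly, this is how I index the linearized output before reshaping; a minimal sketch of the CHW indexing I assume (variable and function names are just for illustration):

// Sketch of how I read one batch item of the 1x125x12x12 output returned by
// TensorRT: channel-major (CHW) order, i.e. channel, then row, then column.
const int C = 125, H = 12, W = 12;

inline float outputAt(const float* prob, int c, int h, int w)
{
    return prob[c * H * W + h * W + w];
}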
The prototxt I am using is given below:
layer {
name: "data"
type: "Input"
top: "data"
input_param {
shape {
dim: 1
dim: 3
dim: 416
dim: 416
}
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
convolution_param {
num_output: 16
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv1_bn"
type: "BatchNorm"
bottom: "conv1"
top: "conv1_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv1_scale"
type: "Scale"
bottom: "conv1_bn"
top: "conv1_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1_scale"
top: "relu1"
}
layer {
name: "scale1"
type: "Power"
bottom: "conv1_scale"
top: "scale1"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise1"
type: "Eltwise"
bottom: "relu1"
bottom: "scale1"
top: "layer1"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "layer1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
convolution_param {
num_output: 32
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv2_bn"
type: "BatchNorm"
bottom: "conv2"
top: "conv2_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv2_scale"
type: "Scale"
bottom: "conv2_bn"
top: "conv2_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2_scale"
top: "relu2"
}
layer {
name: "scale2"
type: "Power"
bottom: "conv2_scale"
top: "scale2"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise2"
type: "Eltwise"
bottom: "relu2"
bottom: "scale2"
top: "layer2"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "layer2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
convolution_param {
num_output: 64
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv3_bn"
type: "BatchNorm"
bottom: "conv3"
top: "conv3_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv3_scale"
type: "Scale"
bottom: "conv3_bn"
top: "conv3_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3_scale"
top: "relu3"
}
layer {
name: "scale3"
type: "Power"
bottom: "conv3_scale"
top: "scale3"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise3"
type: "Eltwise"
bottom: "relu3"
bottom: "scale3"
top: "layer3"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool3"
type: "Pooling"
bottom: "layer3"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv4"
type: "Convolution"
bottom: "pool3"
top: "conv4"
convolution_param {
num_output: 128
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv4_bn"
type: "BatchNorm"
bottom: "conv4"
top: "conv4_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv4_scale"
type: "Scale"
bottom: "conv4_bn"
top: "conv4_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4_scale"
top: "relu4"
}
layer {
name: "scale4"
type: "Power"
bottom: "conv4_scale"
top: "scale4"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise4"
type: "Eltwise"
bottom: "relu4"
bottom: "scale4"
top: "layer4"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool4"
type: "Pooling"
bottom: "layer4"
top: "pool4"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv5"
type: "Convolution"
bottom: "pool4"
top: "conv5"
convolution_param {
num_output: 256
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv5_bn"
type: "BatchNorm"
bottom: "conv5"
top: "conv5_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv5_scale"
type: "Scale"
bottom: "conv5_bn"
top: "conv5_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5_scale"
top: "relu5"
}
layer {
name: "scale5"
type: "Power"
bottom: "conv5_scale"
top: "scale5"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise5"
type: "Eltwise"
bottom: "relu5"
bottom: "scale5"
top: "layer5"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool5"
type: "Pooling"
bottom: "layer5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv6"
type: "Convolution"
bottom: "pool5"
top: "conv6"
convolution_param {
num_output: 512
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv6_bn"
type: "BatchNorm"
bottom: "conv6"
top: "conv6_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv6_scale"
type: "Scale"
bottom: "conv6_bn"
top: "conv6_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "conv6_scale"
top: "relu6"
}
layer {
name: "scale6"
type: "Power"
bottom: "conv6_scale"
top: "scale6"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise6"
type: "Eltwise"
bottom: "relu6"
bottom: "scale6"
top: "layer6"
eltwise_param {
operation: SUM
}
}
layer {
name: "pool6"
type: "Pooling"
bottom: "layer6"
top: "pool6"
pooling_param {
pool: MAX
kernel_size: 2
stride: 1
}
}
layer {
name: "conv7"
type: "Convolution"
bottom: "pool6"
top: "conv7"
convolution_param {
num_output: 1024
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv7_bn"
type: "BatchNorm"
bottom: "conv7"
top: "conv7_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv7_scale"
type: "Scale"
bottom: "conv7_bn"
top: "conv7_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "conv7_scale"
top: "relu7"
}
layer {
name: "scale7"
type: "Power"
bottom: "conv7_scale"
top: "scale7"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise7"
type: "Eltwise"
bottom: "relu7"
bottom: "scale7"
top: "layer7"
eltwise_param {
operation: SUM
}
}
layer {
name: "conv8"
type: "Convolution"
bottom: "layer7"
top: "conv8"
convolution_param {
num_output: 1024
bias_term: false
pad: 1
kernel_size: 3
stride: 1
}
}
layer {
name: "conv8_bn"
type: "BatchNorm"
bottom: "conv8"
top: "conv8_bn"
batch_norm_param {
use_global_stats: true
}
}
layer {
name: "conv8_scale"
type: "Scale"
bottom: "conv8_bn"
top: "conv8_scale"
scale_param {
bias_term: true
}
}
layer {
name: "relu8"
type: "ReLU"
bottom: "conv8_scale"
top: "relu8"
}
layer {
name: "scale8"
type: "Power"
bottom: "conv8_scale"
top: "scale8"
power_param {
scale: 0.08
}
}
layer {
name: "eltwise8"
type: "Eltwise"
bottom: "relu8"
bottom: "scale8"
top: "layer8"
eltwise_param {
operation: SUM
}
}
layer {
name: "conv9"
type: "Convolution"
bottom: "layer8"
top: "result"
convolution_param {
num_output: 125
pad: 0
kernel_size: 1
stride: 1
}
}
The code I am using for the TensorRT optimization is:
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/calib3d.hpp>
#include <new>
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include "common.h"
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvCaffeParser.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace std;
using namespace cv;
static const int INPUT_C = 3;
std::stringstream gieFileStream;
static Logger gLogger;
static const int INPUT_H = 416;
static const int INPUT_W = 416;
static const int OUTPUT_SIZE = 3*18000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "result";
// Parse the Caffe model, build the TensorRT engine, and serialize it to disk.
void caffeToGIEModel( const char* deployFile, // path to the Caffe prototxt
const char* modelFile, // path to the caffemodel
const char* output, // name of the network output blob
uint16_t maxBatchSize // batch size - NB: must be at least as large as the batch we want to run with
)
{
printf("%s \n",deployFile);
printf("%s \n",modelFile);
printf("%s \n",output);
std::cout<<maxBatchSize<<"\n";
IHostMemory *gieModelStream(nullptr);
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,
modelFile,
*network,
nvinfer1::DataType::kFLOAT);
std::cout<<"fine till here"<<"\n";
// specify which tensors are outputs
network->markOutput(*blobNameToTensor->find(output));
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(maxBatchSize*16 << 20);
builder->setMinFindIterations(10);
builder->setAverageFindIterations(10);
//builder->setHalf2Mode(true);
builder->setDebugSync(true);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
gieModelStream = engine->serialize();
//.................Storing model to file.................//
gieFileStream.seekg(0, gieFileStream.beg);
gieFileStream.write((const char*)gieModelStream->data(), gieModelStream->size());
std::cout << "printing size of bytes allocated \t" << (gieModelStream->size())<< std::endl;
std::ofstream SaveFile("optimize",std::ios::out|std::ios::binary);
SaveFile.seekp(0,std::ios::beg);
SaveFile << gieFileStream.rdbuf();
SaveFile.close();
gieFileStream.str(std::string());
gieModelStream->destroy();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
void* Load_engine()
{
// deserialize the engine
clock_t Start = clock();
std::ifstream file("optimize",std::ios::in|std::ios::binary);
file.seekg(0);
gieFileStream << file.rdbuf();
gieFileStream.seekg(0, std::ios::end);
const int modelSize = gieFileStream.tellg();
gieFileStream.seekg(0, std::ios::beg);
std::cout << "printing size of read file bytes \t" << (modelSize)<< std::endl;
void* modelMem = malloc(modelSize);
if( !modelMem )
{
printf("failed to allocate %i bytes to deserialize model\n", modelSize);
}
gieFileStream.read((char*)modelMem, modelSize);
file.close();
//printf("Time taken to load engine: %.9fs\n", (double)(clock() - Start)/CLOCKS_PER_SEC);
IRuntime* runtime = createInferRuntime(gLogger);
Start = clock();
ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
//printf("Deserialize Time Time: %.2fs\n", (double)(clock() - Start)/CLOCKS_PER_SEC);
gieFileStream.str(std::string());
runtime->destroy();
shutdownProtobufLibrary();
free(modelMem);
return engine;
}
// Run inference. size1 and size2 are the total element counts of the input and
// output host buffers; they are size_t because the input
// (batchsize * 3 * 416 * 416 floats) does not fit in a uint16_t.
void doInference(void* eng, float* data, size_t size1, float* prob, size_t size2, uint16_t batchsize)
{
ICudaEngine* engine = static_cast<ICudaEngine*>(eng);
IExecutionContext* context = engine->createExecutionContext();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine->getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], size1 * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], size2 * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], data, size1 * sizeof(float), cudaMemcpyHostToDevice, stream));
context->enqueue(batchsize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], size2*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// write code for reporting layer timing
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
context->destroy();
engine->destroy();
}
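For completeness, this is roughly how I call the functions above (a simplified sketch with batch size 1; the file names are placeholders, and the BGR/[0,1] preprocessing plus the Python box decoding are omitted):

#include <vector>

int main()
{
    const uint16_t batchSize = 1;

    // Build and serialize the engine once (file names here are placeholders).
    caffeToGIEModel("tiny_yolo_deploy.prototxt", "tiny_yolo.caffemodel", OUTPUT_BLOB_NAME, batchSize);

    // Deserialize the engine and run inference on one preprocessed image.
    void* engine = Load_engine();
    std::vector<float> input(batchSize * INPUT_C * INPUT_H * INPUT_W); // CHW, BGR, values in [0, 1]
    std::vector<float> output(batchSize * 125 * 12 * 12);              // linearized NCHW result

    doInference(engine, input.data(), input.size(), output.data(), output.size(), batchSize);

    // 'output' is then reshaped to 125x12x12 and passed to the Python detection layer.
    return 0;
}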
Please tell me what I am doing wrong: am I passing the input in the wrong format, or reading the output in the wrong format?
Thanks in advance…