Hi,
I use TensorRT to run my own classification model, but the classification result is wrong. When I run the same model in Caffe, I get the correct result. I have checked my code but found no errors. Here is my code:
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <cstdio>
#include <time.h>
#include <cuda_runtime_api.h>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <algorithm>
#define LOG_GIE "[GIE] "
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace cv;
using namespace std;
// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 227;
static const int INPUT_W = 227;
static const int OUTPUT_SIZE = 57;
const int BATCH_SIZE=1;
bool mEnableFP16=false;
bool mOverride16=false;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
/* Wrap the input layer of the network in separate cv::Mat objects
(one per channel). This way we save one memcpy operation and we
don't need to rely on cudaMemcpy2D. The last preprocessing
operation will write the separate channels directly to the input
layer. */
void WrapInputLayer(std::vector<std::vector<cv::Mat>>& input_channels, float* buffer)
{
float* input_data = buffer;
for (int n = 0; n < 1; ++n) {
input_channels.push_back(std::vector<cv::Mat>());
for (int i = 0; i < 3; ++i) {
cv::Mat channel(INPUT_H, INPUT_W, CV_32FC1, input_data);
input_channels[n].push_back(channel);
input_data += INPUT_H * INPUT_W;
}
}
}
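// --- Added sketch (hypothetical, not part of the original flow) ---
// WrapInputLayer above is never actually called in main() below. Assuming the
// engine's "data" binding expects planar CHW floats, one way to use it is to
// let cv::split write the three BGR planes of the preprocessed image straight
// into a host-side staging buffer, saving an extra memcpy:
static void fillPlanarInput(const cv::Mat& processed, std::vector<float>& hostBuffer)
{
// processed is assumed to be CV_32FC3, INPUT_H x INPUT_W
hostBuffer.resize(BATCH_SIZE * 3 * INPUT_H * INPUT_W);
std::vector<std::vector<cv::Mat>> inputChannels;
WrapInputLayer(inputChannels, hostBuffer.data());
// split never reallocates here because each wrapped Mat already has the
// right size/type, so the planes land directly in hostBuffer
cv::split(processed, inputChannels[0]);
}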
#define CHECK(status) \
{ \
if (status != 0) \
{ \
std::cout << "Cuda failure: " << status; \
abort(); \
} \
}
// Logger for GIE info/warning/errors
class Logger : public ILogger
{
void log(Severity severity, const char* msg) override
{
// suppress info-level messages
if (severity != Severity::kINFO)
std::cout << msg << std::endl;
}
} gLogger;
cv::Mat setMean(/* string mean_binaryproto */)
{
// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto("alex/alexmean.binaryproto");
parser->destroy();
const float* meanData = reinterpret_cast<const float*>(meanBlob->getData());
float* inputData = (float*)meanData;
Dims4 dims = meanBlob->getDimensions(); // blob dimensions (n, c, h, w); this was undefined in the original
vector<Mat> inputChannels;
for (int i = 0; i < dims.c; ++i)
{
cv::Mat channel(dims.h, dims.w, CV_32FC1, inputData);
inputChannels.push_back(channel);
inputData += dims.h * dims.w;
}
Mat mean;
cv::merge(inputChannels, mean);
meanBlob->destroy(); // safe to release here: merge copied the data
resize(mean, mean, Size(INPUT_W, INPUT_H)); // resize the mean mat to the network input size
return mean;
}
void Preprocess(const cv::Mat& img, cv::Mat& processedImage) {
/* Convert the input image to the input image format of the network. */
Mat meanFile = setMean();
cv::Mat sample;
int num_channels_ = meanFile.channels();
if (img.channels() == 3 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
else if (img.channels() == 4 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
else if (img.channels() == 4 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
else if (img.channels() == 1 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
else
sample = img;
cv::Size input_geometry = cv::Size(meanFile.cols, meanFile.rows);
cv::Mat sample_resized;
/*preproc-resample */
if (sample.size() != input_geometry)
cv::resize(sample, sample_resized, input_geometry);
else
sample_resized = sample;
cv::Mat sample_float;
if (num_channels_ == 3)
sample_resized.convertTo(sample_float, CV_32FC3);
else
sample_resized.convertTo(sample_float, CV_32FC1);
/* END */
/* preproc-normalize */
cv::Mat sample_normalized(INPUT_H, INPUT_W, CV_32FC3);
bool _rescaleTo01=false;
if (_rescaleTo01)
sample_float = sample_float / 255.f;
cv::Mat submean_image;
cv::subtract(sample_float, meanFile, submean_image);
submean_image.convertTo(sample_normalized, CV_32FC3); // the extra convertTo from sample_float was dead code
// for (int n = 0; n < BATCH_SIZE; ++n) {
// cv::split(sample_normalized, input_channels[n]);
// }
processedImage = sample_normalized; // the original never wrote to the output parameter
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings();
// getBindingIndex returns the binding index for the named tensor, or -1 if not found.
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), // INPUT_BLOB_NAME: "data"
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // OUTPUT_BLOB_NAME: "prob"
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); // 3 input channels (the channel factor was missing)
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); // 3 input channels
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
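// --- Added sketch (hypothetical) ---
// main() below declares a gieModelStream that is never used. Assuming a
// TensorRT release where ICudaEngine::serialize() returns an IHostMemory*
// (as in TensorRT 2.x), the built engine could be cached to disk so it does
// not have to be re-parsed from the caffemodel on every run:
static void saveEngine(ICudaEngine& engine, const char* path)
{
IHostMemory* serialized = engine.serialize();
std::ofstream out(path, std::ios::binary);
out.write(reinterpret_cast<const char*>(serialized->data()), serialized->size());
serialized->destroy();
}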
int main(int argc, char** argv)
{
cv::Mat frame = cv::imread("/home/wyx/pic/idcard001.jpg", CV_LOAD_IMAGE_UNCHANGED);
Mat processedImg;
Preprocess(frame,processedImg);
float* inputData = (float*)processedImg.data;
std::stringstream gieModelStream;
//caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
const char* prototxt="alex/alexdeploy.prototxt";
const char* caffemodel="alex/alex.caffemodel";
mEnableFP16 = mOverride16 ? false : builder->platformHasFastFp16();
printf(LOG_GIE "platform %s FP16 support.\n", mEnableFP16 ? "has" : "does not have");
printf(LOG_GIE "loading %s %s\n", prototxt, caffemodel);
nvinfer1::DataType modelDataType = mEnableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; // create a 16-bit model if it's natively supported
// parse the caffe model to populate the network, then set the outputs and create an engine
INetworkDefinition* network = builder->createNetwork();
ICaffeParser *parser = createCaffeParser();
const IBlobNameToTensor *blobNameToTensor =
parser->parse(prototxt, // caffe deploy file
caffemodel, // caffe model file
*network, // network definition that the parser will populate
modelDataType);
assert(blobNameToTensor != nullptr);
// the caffe file has no notion of outputs
// so we need to manually say which tensors the engine should generate
network->markOutput(*blobNameToTensor->find(OUTPUT_BLOB_NAME));
// Build the engine
builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize(16 << 20);//WORKSPACE_SIZE);
// set up the network for paired-fp16 format
if(mEnableFP16)
builder->setHalf2Mode(true);
else
std::cout<<"not half2mode"<<std::endl;
// Eliminate the side-effect from the delay of GPU frequency boost
builder->setMinFindIterations(3);
builder->setAverageFindIterations(2);
//build
ICudaEngine *engine = builder->buildCudaEngine(*network);
size_t maxsize = engine->getWorkspaceSize();
cout << "workspacesize = " << maxsize << endl;
IExecutionContext *context = engine->createExecutionContext();
// run inference
float prob[OUTPUT_SIZE];
doInference(*context, inputData, prob, 1);
// destroy the context, engine, network, parser, and builder
context->destroy();
engine->destroy();
network->destroy();
parser->destroy();
builder->destroy();
//get the max index of classifier
unsigned int max_label = 0;
for(unsigned int i = 0; i < OUTPUT_SIZE; i++){
if(prob[i]>prob[max_label])
max_label = i;
}
std::cout << "class label: " << max_label << " value: " << prob[max_label] << std::endl;
// print the output distribution
for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
std::cout << i << ": " << prob[i] << "\n";
std::cout<<"the result is : "<<max_label<<std::endl;
std::cout << "Done." << std::endl;
return 0;
}
Could you tell me where the mistake is?
alexdeploy.prototxt.zip (819 Bytes)
alexmean.binaryproto.zip (35.1 KB)