I use TensorRT to run my own classify model,but my classifier result is wrong.when I use the same model in caffe,I can get the correct result. I have checked my code,but no errors have found. here is my code:
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <assert.h>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include “NvInfer.h”
#include “NvCaffeParser.h”
#define LOG_GIE "[GIE] "
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace cv;
using namespace std;
// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 227;
static const int INPUT_W = 227;
static const int OUTPUT_SIZE =57;
const int BATCH_SIZE=1;
bool mEnableFP16=false;
bool mOverride16=false;
const char* INPUT_BLOB_NAME = “data”;
const char* OUTPUT_BLOB_NAME = “prob”;
/* Wrap the input layer of the network in separate cv::Mat objects
(one per channel). This way we save one memcpy operation and we
don’t need to rely on cudaMemcpy2D. The last preprocessing
operation will write the separate channels directly to the input
layer. /
void WrapInputLayer(std::vector<std::vectorcv::Mat >& input_channels,float buffer) {float* input_data = buffer;
for (int n = 0; n < 1; ++n) {
for (int i = 0; i < 3; ++i) {
cv::Mat channel(INPUT_H, INPUT_W, CV_32FC1, input_data);
input_data += INPUT_H * INPUT_W;
#define CHECK(status)
if (status != 0)
std::cout << "Cuda failure: " << status;
// Logger for GIE info/warning/errors
class Logger : public ILogger
void log(Severity severity, const char* msg) override
// suppress info-level messages
if (severity != Severity::kINFO)
std::cout << msg << std::endl;
} gLogger;
cv::Mat setMean(/string mean_binaryproto/)
// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(“alex/alexmean.binaryproto”);
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());
float* inputData = (float *)meanData;
vector<Mat> inputChannels;
for (int i = 0; i < dims.c; ++i)
cv::Mat channel(dims.h, dims.w, CV_32FC1, inputData);
inputData += dims.h*dims.w;
Mat mean;
resize(mean,mean,Size(INPUT_H,INPUT_H));//resize the mean mat
return mean;
void Preprocess(const cv::Mat& img, cv::Mat& processedImage) {
/* Convert the input image to the input image format of the network. */
Mat meanFile = setMean();
cv::Mat sample;
int num_channels_ = meanFile.channels();
if (img.channels() == 3 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
else if (img.channels() == 4 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
else if (img.channels() == 4 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
else if (img.channels() == 1 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
sample = img;
cv::Size input_geometry = cv::Size(meanFile.cols, meanFile.rows);
cv::Mat sample_resized;
/*preproc-resample */
if (sample.size() != input_geometry)
cv::resize(sample, sample_resized, input_geometry);
sample_resized = sample;
cv::Mat sample_float;
if (num_channels_ == 3)
sample_resized.convertTo(sample_float, CV_32FC3);
sample_resized.convertTo(sample_float, CV_32FC1);
/* END */
/* preproc-normalize */
cv::Mat sample_normalized(INPUT_H, INPUT_W, CV_32FC3);
bool _rescaleTo01=false;
if (_rescaleTo01)
sample_float = sample_float / 255.f;
cv::Mat submean_image;
sample_float.convertTo(sample_normalized, CV_32FC3);
submean_image.convertTo(sample_normalized, CV_32FC3);
// for (int n = 0; n < BATCH_SIZE; ++n) {
// cv::split(sample_normalized, input_channels[n]);
// }
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);//return the binding index for the named tensor,or -1 is not found
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), //INPUT_BLOB_NUME:input
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); //OUTPUT_BLOB_NUME:prob
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
// release the stream and the buffers
int main(int argc, char** argv)
cv::Mat frame=cv::imread(“/home/wyx/pic/idcard001.jpg”,CV_LOAD_IMAGE_UNCHANGED);
Mat processedImg;
float* inputData = (float*)processedImg.data;
std::stringstream gieModelStream;
//caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
const char* prototxt="alex/alexdeploy.prototxt";
const char* caffemodel="alex/alex.caffemodel";
mEnableFP16 = (mOverride16 == true) ? false : builder->platformHasFastFp16();
printf(LOG_GIE "platform %s FP16 support.\n", mEnableFP16 ? "has" : "does not have");
printf(LOG_GIE "loading %s %s\n", prototxt, prototxt);
nvinfer1::DataType modelDataType = mEnableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; // create a 16-bit model if it's natively supported
// parse the caffe model to populate the network, then set the outputs and create an engine
INetworkDefinition* network = builder->createNetwork();
ICaffeParser *parser = createCaffeParser();
const IBlobNameToTensor *blobNameToTensor =
parser->parse(prototxt, // caffe deploy file
caffemodel, // caffe model file
*network, // network definition that the parser will populate
assert(blobNameToTensor != nullptr);
// the caffe file has no notion of outputs
// so we need to manually say which tensors the engine should generate
// Build the engine
builder->setMaxWorkspaceSize(16 << 20);//WORKSPACE_SIZE);
// set up the network for paired-fp16 format
std::cout<<"not half2mode"<<std::endl;
// Eliminate the side-effect from the delay of GPU frequency boost
ICudaEngine *engine = builder->buildCudaEngine(*network);
int maxsize = engine->getWorkspaceSize();
cout<<"workspacesize= "<<maxsize<<endl;
IExecutionContext *context = engine->createExecutionContext();
// run inference
float prob[OUTPUT_SIZE];
doInference(*context,inputData, prob, 1);
// destroy the engine
context->destroy();//new added ptr
//get the max index of classifier
unsigned int max_label = 0;
for(unsigned int i = 0; i < OUTPUT_SIZE; i++){
max_label = i;
std::cout<<"class label: "<<max_label<<" value: "<<prob[max_label]<<endl;
// print the output distribution
for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
std::cout << i << ": " << prob[i] << "\n";
std::cout<<"the result is : "<<max_label<<std::endl;
std::cout << "Done." << std::endl;
return 0;
Could you tell me where there is a mistake?
