I get wrong results when using TensorRT for inference. Am I using it incorrectly?

I modified sampleMNIST to run a CIFAR-10 model; my code is below:

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <chrono>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// Things we know about the network and the Caffe input/output blobs.
// Input image geometry: height, width and channel count.
static const int INPUT_H = 32;
static const int INPUT_W = 32;
static const int INPUT_C = 3;
// Number of floats copied back from the output binding per batch item.
static const int OUTPUT_SIZE = 10;
// Logger instance handed to createInferBuilder / createInferRuntime below.
static Logger gLogger;

// Blob names used to look up the engine binding indices in doInference.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Search directories passed to locateFile.
// NOTE(review): these still point at the MNIST sample folders although main()
// loads cifar10_* files - confirm the CIFAR-10 files actually live there.
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
// Holds one PPM image: the header fields parsed by readPPMFile plus a raw
// byte buffer with a fixed capacity of INPUT_H*INPUT_W*INPUT_C.
// NOTE(review): the opening/closing braces of this struct are missing in this paste.
struct PPM
	// magic: first header token of the file; fileName: the name passed to readPPMFile.
	std::string magic, fileName;
	// Height, width and the third numeric header field, in the order readPPMFile fills them.
	int h,w,max;
	// Raw pixel bytes; main() indexes this as interleaved channels (j*INPUT_C + channel).
	uint8_t buffer[INPUT_H*INPUT_W*INPUT_C];	

// Convenience overload: forwards to the two-argument locateFile using the
// file-level `directories` list. The two-argument version is not defined in
// this excerpt (presumably it comes from common.h - TODO confirm).
// NOTE(review): the function braces are missing in this paste.
std::string locateFile(const std::string& input)
    return locateFile(input, directories);

// Reads a PPM file into `ppm`: records the file name, opens the file located
// via locateFile in binary mode, and parses the header tokens
// (magic, width, height, max) with stream extraction.
// NOTE(review): nothing visible checks that the file opened or that
// ppm.w*ppm.h*3 fits into ppm.buffer (capacity INPUT_H*INPUT_W*INPUT_C).
void readPPMFile(const std::string& filename, PPM& ppm)
	ppm.fileName = filename;
	std::ifstream infile(locateFile(filename), std::ifstream::binary);
	infile >> ppm.magic >> ppm.w >> ppm.h>> ppm.max;
	// Skip one byte after the header before the pixel data.
	// NOTE(review): the rest of this line is garbled in the paste; it looks like the
	// tail of a read of ppm.w*ppm.h*3 bytes into ppm.buffer - restore from the original.
	infile.seekg(1,infile.cur);<char*>(ppm.buffer), ppm.w*ppm.h*3);
// Simple PGM (portable greyscale map) reader.
// Forwards to a four-argument readPGMFile with the file-level INPUT_H and
// INPUT_W; that overload is not defined in this excerpt.
// NOTE(review): nothing in the visible code calls this function; it appears
// to be left over from the MNIST sample. Function braces are missing in this paste.
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
    readPGMFile(fileName, buffer, INPUT_H, INPUT_W);

// Builds an engine from a Caffe prototxt/model pair and returns it serialized
// through `gieModelStream`.
// NOTE(review): this paste is incomplete - the function braces, the tail of the
// parse() call, the body of the outputs loop and the cleanup code are missing.
// NOTE(review): `maxBatchSize` is not used in any visible line; confirm the
// original passes it to the builder.
void caffeToGIEModel(const std::string& deployFile,				// name for caffe prototxt
					 const std::string& modelFile,				// name for model 
					 const std::vector<std::string>& outputs,   // network outputs
					 unsigned int maxBatchSize,					// batch size - NB must be at least as large as the batch we want to run with)
					 IHostMemory *&gieModelStream)    // output buffer for the GIE model
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);

	// parse the caffe model to populate the network, then set the outputs
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser* parser = createCaffeParser();
	// Both files are resolved through locateFile with the file-level `directories`.
	// NOTE(review): the remaining parse() arguments are cut off in this paste.
	const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
															  locateFile(modelFile, directories).c_str(),
	// specify which tensors are outputs
	// NOTE(review): the loop body is missing in this paste.
	for (auto& s : outputs)

	// Build the engine
	// Workspace limit of 1 << 20 (1 MiB if the unit is bytes - TODO confirm).
	builder->setMaxWorkspaceSize(1 << 20);

	ICudaEngine* engine = builder->buildCudaEngine(*network);

	// we don't need the network any more, and we can destroy the parser
	// NOTE(review): the corresponding destroy calls are not visible in this paste.

	// serialize the engine, then close everything down
	gieModelStream = engine->serialize();

// Runs the engine behind `context` on one host input buffer and copies the
// result back into `output`.
//   input  - host buffer of batchSize * INPUT_H * INPUT_W * INPUT_C floats
//   output - host buffer of batchSize * OUTPUT_SIZE floats
// Also prints the average wall-clock time of 10 enqueue() calls.
// NOTE(review): the function braces are missing in this paste, and no stream
// creation, stream synchronization or buffer/stream release is visible.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
	const ICudaEngine& engine = context.getEngine();
	// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
	// of these, but in this case we know that there is exactly one input and one output.
	assert(engine.getNbBindings() == 2);
	void* buffers[2];

	// In order to bind the buffers, we need to know the names of the input and output tensors.
	// note that indices are guaranteed to be less than IEngine::getNbBindings()
	int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), 
		outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

	// create GPU buffers and a stream
	CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
	CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

	// NOTE(review): `stream` is declared here but no call that creates it is
	// visible before it is used below.
	cudaStream_t stream;
	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
	// One enqueue outside the timed region, then 10 timed enqueues.
	context.enqueue(batchSize,buffers, stream, nullptr);
	auto t_start = std::chrono::high_resolution_clock::now();
	// NOTE(review): the timed region contains only enqueue() calls with no visible
	// synchronization, so the figure may reflect launch cost rather than execution time.
	for(int i=0;i<10;++i)
	context.enqueue(batchSize, buffers, stream, nullptr);
	auto t_end = std::chrono::high_resolution_clock::now();
	auto ms = std::chrono::duration<float, std::milli>(t_end - t_start).count();
	// Report the mean over the 10 timed calls.
	std::cout<<"time duration is : "<< ms/10.0 << " ms"<<std::endl;
	CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
	// release the stream and the buffers
	// NOTE(review): the release calls themselves are missing in this paste.

// Entry point: builds and serializes an engine from the CIFAR-10 Caffe model,
// loads the PPM named by argv[1], converts it to planar mean-subtracted floats,
// runs one inference and prints the OUTPUT_SIZE probabilities.
// NOTE(review): the function braces, loop closing braces and cleanup code are
// missing in this paste.
int main(int argc, char** argv)
	// create a GIE model from the caffe model and serialize it to a stream
    IHostMemory *gieModelStream{nullptr};
   	caffeToGIEModel("cifar10_full.prototxt", "cifar10_full_iter_60000.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);

	// NOTE(review): argv[1] is used without checking argc.
	PPM ppms;
	readPPMFile(argv[1], ppms);
	// print an ascii representation
	std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
	// parse the mean file and 	subtract it from the image
	// NOTE(review): meanBlob/meanData are parsed here but never used below; the
	// subtraction uses the hard-coded mean_channels values instead.
    	ICaffeParser* parser = createCaffeParser();
    	IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("cifar10_mean.binaryproto", directories).c_str());

    	const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());
	// Hard-coded per-channel mean values, indexed by output plane c in the loop below.
	float mean_channels[3]={0};
	mean_channels[0] = 125.3069178;
	mean_channels[1] = 122.95039426;	
	mean_channels[2] = 113.86538316;

	// Host input buffer handed to doInference, filled plane by plane below.
    	float data[INPUT_H*INPUT_W*INPUT_C]={0};
	// deserialize the engine 
	IRuntime* runtime = createInferRuntime(gLogger);
	// NOTE(review): gieModelStream is dereferenced here, before the null check on the next line.
	ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
    if (gieModelStream) gieModelStream->destroy();

	IExecutionContext *context = engine->createExecutionContext();
	// Debug dump of the preprocessed input, one value per line.
	std::ofstream fout("/home/nvidia/shujunhua/tensorRT/sampleMNIST.txt");

	// Convert interleaved bytes to planar floats: output plane c takes source
	// channel (2 - c) of each pixel, i.e. the channel order is reversed, and
	// mean_channels[c] is subtracted.
	for(int c = 0;c<INPUT_C;++c){
		for(unsigned j = 0, volChl = INPUT_H * INPUT_W ; j < volChl; ++j){
			data[c*volChl + j] = float(ppms.buffer[j*INPUT_C+2 -c]) - mean_channels[c];
				fout<<data[c*volChl +j]<<std::endl;

	// run inference
	float prob[OUTPUT_SIZE];
	doInference(*context, data, prob, 1);

	// destroy the engine
	// NOTE(review): the destroy calls are missing in this paste.

	// print a histogram of the output distribution
	std::cout << "\n\n";
	// val tracks the largest probability seen so far, idx the class index that holds it.
    float val{0.0f};
    int idx{0};
	// NOTE(review): the loop bound is the literal 10 rather than OUTPUT_SIZE.
	for (unsigned int i = 0; i < 10; i++)
        val = std::max(val, prob[i]);
        if (val == prob[i]) idx = i;
		// One '*' per 0.1 of probability (rounded), followed by the raw value.
		std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << " ||    "<<prob[i]<<"\n";
	std::cout << std::endl;

	// Exit status is success only when class 0 wins with probability above 0.9.
	// NOTE(review): this looks like a leftover from the MNIST sample; any other
	// predicted class makes the process exit with EXIT_FAILURE.
	return (idx == 0 && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;

Then I ran it and got the result below:

nvidia@tegra-ubuntu:/usr/src/tensorrt/bin$ ./sample_mnist 1.ppm


time duration is : 0.300693 ms

0:  ||    0.0087738
1: *** ||    0.345947
2:  ||    0.0164337
3:  ||    0.0255432
4:  ||    0.00112057
5:  ||    0.00267792
6: * ||    0.0835571
7:  ||    0.00227356
8:  ||    0.0361633
9: ***** ||    0.477539

The label of 1.ppm should be 1, but I get 9.

With the Caffe C++ and Python interfaces I get the correct result. Could anybody tell me the reason?

Any help will be appreciated! Thanks!


It looks like you have two sets of mean-subtraction data.
Could you check whether that might be causing an error?

IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("cifar10_mean.binaryproto", directories).c_str());
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

float mean_channels[3]={0};
mean_channels[0] = 125.3069178;
mean_channels[1] = 122.95039426;	
mean_channels[2] = 113.86538316;


Sorry to confuse you. I don't use meanData; I forgot to comment it out.

I wrote the input data to a file:

std::ofstream fout("/home/nvidia/shujunhua/tensorRT/sampleMNIST.txt");

	for(int c = 0;c<INPUT_C;++c){
		for(unsigned j = 0, volChl = INPUT_H * INPUT_W ; j < volChl; ++j){
			data[c*volChl + j] = float(ppms.buffer[j*INPUT_C+2 -c]) - mean_channels[c];
				fout<<data[c*volChl +j]<<std::endl;


Then I compared it to the input data of the Caffe C++ interface:

std::ofstream fout("/home/nvidia/shujunhua/tensorRT/caffe_input.txt");
  for(int i=0;i<3;++i){
        for(int j=0; j<32;++j){
                float *begin = input_channels[i].ptr<float>(j);
                for(int k=0;k<32;++k){

I found that they are exactly the same!

So I am confused: why is the result wrong?

Can anyone help me? Many thanks.

I found another wrong result!
I followed the imagenet-console instructions at

I used the bvlc_reference_caffenet model to test; the command I used is:
nvidia@tegra-ubuntu:~/project/jetson-inference/build/aarch64/bin$ ./imagenet-console /home/nvidia/project/jetson-inference/bvlc_data/cat.jpg /home/nvidia/project/jetson-inference/bvlc_data/cat_output.jpg --prototxt=/home/nvidia/project/jetson-inference/bvlc_data/deploy.prototxt --model=/home/nvidia/project/jetson-inference/bvlc_data/bvlc_reference_caffenet.caffemodel --labels=/home/nvidia/project/jetson-inference/bvlc_data/synset_words.txt --input_blob=data --output_blob=prob

I got the result below:

class 0274 - 0.080797 (dhole, Cuon alpinus)
class 0277 - 0.203834 (red fox, Vulpes vulpes)
class 0278 - 0.089156 (kit fox, Vulpes macrotis)
class 0281 - 0.150755 (tabby, tabby cat)
class 0282 - 0.119096 (tiger cat)
class 0285 - 0.112523 (Egyptian cat)
class 0287 - 0.040729 (lynx, catamount)
class 0330 - 0.040193 (wood rabbit, cottontail, cottontail rabbit)
class 0331 - 0.031648 (hare)
class 0333 - 0.011683 (hamster)
class 0335 - 0.012508 (fox squirrel, eastern fox squirrel, Sciurus niger)
class 0356 - 0.014462 (weasel)
imagenet-console: ‘/home/nvidia/project/jetson-inference/bvlc_data/cat.jpg’ -> 20.38337% class #277 (red fox, Vulpes vulpes)
loaded image fontmapA.png (256 x 512) 2097152 bytes
[cuda] cudaAllocMapped 2097152 bytes, CPU 0x101bf0000 GPU 0x101bf0000
[cuda] cudaAllocMapped 8192 bytes, CPU 0x101742000 GPU 0x101742000
imagenet-console: attempting to save output image to ‘/home/nvidia/project/jetson-inference/bvlc_data/cat_output.jpg’
imagenet-console: completed saving ‘/home/nvidia/project/jetson-inference/bvlc_data/cat_output.jpg’

But the true label should be "tabby, tabby cat"!
Could you help me with this? I would appreciate it!


Are you using your own application or jetson_inference?

If you are using jetson_inference, could you also try the official sample?
This will help us determine whether the issue comes from the sample or from the related libraries.


Thanks for your reply!

I use imagenet-console from jetson_inference. If I use the command

./imagenet-console orange_0.jpg output_0.jpg

I get the right result, but if I deploy bvlc_reference_caffenet with the command

./imagenet-console /home/nvidia/project/jetson-inference/bvlc_data/cat.jpg /home/nvidia/project/jetson-inference/bvlc_data/cat_output.jpg --prototxt=/home/nvidia/project/jetson-inference/bvlc_data/deploy.prototxt --model=/home/nvidia/project/jetson-inference/bvlc_data/bvlc_reference_caffenet.caffemodel --labels=/home/nvidia/project/jetson-inference/bvlc_data/synset_words.txt --input_blob=data --output_blob=prob

I get the wrong result, even though deploying the same model with the Caffe C++ and Python APIs gives the right result.

I also modified sampleMNIST to deploy the CIFAR-10 model, but that gives the wrong answer too.


Please remember to match the image format to the input format used in your training process.

In jetson_inference, the image is RGB in the range [0, 255], and a given mean value is subtracted per channel:


Please help me solve this problem; it has bothered me for a long time.

Suppose my caffemodel's input format is CHW (BGR). I use OpenCV to read a JPG; the code is shown below:

// Preprocessing fragment: load a JPG with OpenCV, resize it, split it into
// channel planes and copy them into a planar float buffer with a per-channel
// mean subtracted.
// NOTE(review): this fragment is incomplete - the closing braces of the three
// loops are missing in this paste.
float mean_channels[3]={0};
mean_channels[0] = 125.3069178;
mean_channels[1] = 122.95039426;	
mean_channels[2] = 113.86538316; // precomputed mean values

Mat img = imread("1.jpg",CV_LOAD_IMAGE_UNCHANGED);
Mat img_resized;
// NOTE(review): the terminating ';' is missing on the next line. cv::Size takes
// (width, height); passing (INPUT_H, INPUT_W) only works because both are equal.
resize(img, img_resized,Size(INPUT_H, INPUT_W))
vector<Mat> channels(3);
split(img_resized, channels);
float data[INPUT_C*INPUT_H*INPUT_W];  // the input of TensorRT
int count=0;
// Planes are written in the order split() returned them (i = 0, 1, 2).
for(int i=0;i<3;++i){
  for(int j=0;j<INPUT_H;++j){
    // NOTE(review): ptr<float> treats the row as 32-bit floats, but no conversion
    // of the image to a float type is visible here; imread of a JPG presumably
    // yields 8-bit data, in which case these reads would be wrong - confirm a
    // convertTo(CV_32F...) happens before split().
    float *begin = channels[i].ptr<float>(j);
    for(int k=0;k<INPUT_W;++k){
       data[count++] = begin[k] - mean_channels[i];

I want to know whether this is correct. I always get the wrong result…

I really need your help. I have spent a long time looking for the cause. Thank you very much for your help.

I have attached all the files; could you try them?
cifar10.tar.gz (345 KB)


We will test this internally and update information with you.

I assume you are using JetPack 3.2; is that correct?

Yes, and the name of each JPG in the attachment is its label.


Your model requires a CIFAR-10 format as input which is:
A 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image. The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue. The image is stored in row-major order, so that the first 32 entries of the array are the red channel values of the first row of the image.

This looks different from what your source code does.
Please make sure you have followed the input requirement.


thanks for your reply!!!

Currently I read the JPG as BGR, but I had also tried reading it as RGB, and that led to a wrong result as well.

I will try RGB again …

I tried reading the JPG as RGB; I modified the code as follows:

for(int i=0;i<3;++i){
  for(int j=0;j<INPUT_H;++j){
    float *begin = channels[2-i].ptr<float>(j);
    for(int k=0;k<INPUT_W;++k){
       data[count++] = begin[k] - mean_channels[2-i];

But it also leads to the wrong result. Could you point out what is wrong with my code? It is almost driving me mad…

thank you very much !


Can you get the correct result with Caffe?
If so, could you also share the source code you use with Caffe?


Thanks for your reply!

I can get the correct result with Caffe, and I think I have found the reason why I always got the wrong result!

The LRN layer that TensorRT supports works only in cross-channel mode, not within-channel. I replaced the LRN layer and then got the right result!

thanks for your help!!