TensorRT get bad classifier results

Im using TensorRT to accelerate a mnist caffe model, but the result is very bad 6%(even worse than guessing). I get to know TensorRT only for several days, I do need some help. The code is as following. I rewrite the official giexec sample and barely change anything. I think there maybe some problem with line 238~267, but I cant find the bugs.

#include <assert.h>
#include
#include
#include
#include
#include <sys/stat.h>
#include
#include <time.h>
#include <cuda_runtime_api.h>
#include
#include
#include <string.h>
#include
#include
#include <sys/time.h>
#include <unistd.h>

#include <opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>

#include “NvInfer.h”
#include “NvCaffeParser.h”

using namespace nvinfer1;
using namespace nvcaffeparser1;

#define CHECK(status)
{
if (status != 0)
{
std::cout << "Cuda failure: " << status;
abort();
}
}

struct Params
{
std::string deployFile, modelFile, engine, calibrationCache;
std::vectorstd::string outputs;
int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 };
bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false };
} gParams;

static inline int volume(DimsCHW dims)
{
return dims.c()*dims.h()*dims.w();
}

std::vectorstd::string gInputs;
std::map<std::string, DimsCHW> gInputDimensions;

// Logger for GIE info/warning/errors
class Logger : public ILogger
{
void log(Severity severity, const char* msg) override
{
// suppress info-level messages
if (severity != Severity::kINFO || gParams.verbose)
std::cout << msg << std::endl;
}
} gLogger;

class RndInt8Calibrator : public IInt8EntropyCalibrator
{
public:
RndInt8Calibrator(int totalSamples = 1)
: mTotalSamples(totalSamples)
, mCurrentSample(0)
{
std::default_random_engine generator;
std::uniform_real_distribution distribution(-1.0F, 1.0F);
for(auto& elem: gInputDimensions)
{
int elemCount = volume(elem.second);

		std::vector<float> rnd_data(elemCount);
		for(auto& val: rnd_data)
			val = distribution(generator);

		void * data;
		CHECK(cudaMalloc(&data, elemCount * sizeof(float)));
		CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice));

		mInputDeviceBuffers.insert(std::make_pair(elem.first, data));
	}
}

~RndInt8Calibrator()
{
	for(auto& elem: mInputDeviceBuffers)
		CHECK(cudaFree(elem.second));
}

int getBatchSize() const override
{
	return 1;
}

bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
	if (mCurrentSample >= mTotalSamples)
		return false;

	for(int i = 0; i < nbBindings; ++i)
		bindings[i] = mInputDeviceBuffers[names[i]];

	++mCurrentSample;
	return true;
}

const void* readCalibrationCache(size_t&) override
{
	return nullptr;
}

virtual void writeCalibrationCache(const void*, size_t) override
{
}

private:
int mTotalSamples;
int mCurrentSample;
std::map<std::string, void*> mInputDeviceBuffers;
};

ICudaEngine* caffeToGIEModel()
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(),
														  gParams.modelFile.c_str(),
														  *network,
														  gParams.half2 ? DataType::kHALF:DataType::kFLOAT);


if (!blobNameToTensor)
	return nullptr;

for (int i = 0, n = network->getNbInputs(); i < n; i++)
{
	DimsCHW dims = static_cast<DimsCHW&&>(network->getInput(i)->getDimensions());
	gInputs.push_back(network->getInput(i)->getName());
	gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims));
	std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl;
}

// specify which tensors are outputs
for (auto& s : gParams.outputs)
{
	if (blobNameToTensor->find(s.c_str()) == nullptr)
	{
		std::cout << "could not find output blob " << s << std::endl;
		return nullptr;
	}
	network->markOutput(*blobNameToTensor->find(s.c_str()));
}

for (int i = 0, n = network->getNbOutputs(); i < n; i++)
{
	DimsCHW dims = static_cast<DimsCHW&&>(network->getOutput(i)->getDimensions());
	std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl;
}

// Build the engine
builder->setMaxBatchSize(gParams.batchSize);
builder->setMaxWorkspaceSize(gParams.workspaceSize<<20);
builder->setHalf2Mode(gParams.half2);

RndInt8Calibrator calibrator;
if (gParams.int8)
{
	builder->setInt8Mode(true);
	builder->setInt8Calibrator(&calibrator);
}

ICudaEngine* engine = builder->buildCudaEngine(*network);
if (engine == nullptr)
	std::cout << "could not build engine" << std::endl;

parser->destroy();
network->destroy();
builder->destroy();
shutdownProtobufLibrary();
return engine;

}

void createMemory(const ICudaEngine& engine, std::vector<void*>& buffers, const std::string& name)
{
size_t bindingIndex = engine.getBindingIndex(name.c_str());
printf(“name=%s, bindingIndex=%d, buffers.size()=%d\n”, name.c_str(), (int)bindingIndex, (int)buffers.size());
assert(bindingIndex < buffers.size());
DimsCHW dimensions = static_cast<DimsCHW&&>(engine.getBindingDimensions((int)bindingIndex));
size_t eltCount = dimensions.c()*dimensions.h()*dimensions.w()*gParams.batchSize, memSize = eltCount * sizeof(float);

void* deviceMem;
CHECK(cudaMalloc(&deviceMem, memSize));
if (deviceMem == nullptr)
{
	std::cerr << "Out of memory" << std::endl;
	exit(1);
}

buffers[bindingIndex] = deviceMem;	

}

void doInference(ICudaEngine& engine,std::string working_file)
{
IExecutionContext *context = engine.createExecutionContext();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.

std::vector<void*> buffers(gInputs.size() + gParams.outputs.size());
for (size_t i = 0; i < gInputs.size(); i++)
	createMemory(engine, buffers, gInputs[i]);

for (size_t i = 0; i < gParams.outputs.size(); i++)
	createMemory(engine, buffers, gParams.outputs[i]);

FILE *fp=fopen(working_file.c_str(),"r");
fseek(fp,0L,SEEK_END);
long fend=ftell(fp);
fseek(fp,0L,SEEK_SET);
long fstart=ftell(fp);
char im_name[1024];
cv::Mat im,im_float;
size_t memSize=1*28*28*sizeof(float);
struct timeval t1,t2;
gettimeofday(&t1,NULL);
float prob[10];
FILE *fout=fopen("rc.txt","w");
cudaStream_t stream;

while(fstart!=fend)
{
	fgets(im_name,1024,fp);
    CHECK(cudaStreamCreate(&stream));
    im_name[strlen(im_name)-1]='\0';
    im=cv::imread(im_name);
    im.convertTo(im_float,CV_32FC1);
	for (int i=0;i<28*28;i++)
	im_float.data[i]/=256.0;
    CHECK(cudaMemcpy(buffers[0], im_float.data, memSize, cudaMemcpyHostToDevice));


	context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
    CHECK(cudaMemcpyAsync(prob,buffers[1],10*sizeof(float),cudaMemcpyDeviceToHost));
	cudaStreamSynchronize(stream);
	int id=0;
	float val=-1.0;
    for (int i=0;i<10;i++)
	{
		if (prob[i]>val)
		{
			val=prob[i];
			id=i;
		}
	}
	fprintf(fout,"%d\n",id);
	fstart=ftell(fp);
	usleep(10);
	cudaStreamDestroy(stream);
}

gettimeofday(&t2,NULL);
printf("%ld %ld\n",t2.tv_sec-t1.tv_sec,t2.tv_usec-t2.tv_usec);
fclose(fp);
fclose(fout);

}

static void printUsage()
{
printf("\n");
printf(“Mandatory params:\n”);
printf(" --deploy= Caffe deploy file\n");
printf(" --output= Output blob name (can be specified multiple times)\n");

printf("\nOptional params:\n");

printf("  --model=<file>       Caffe model file (default = no model, random weights used)\n");
printf("  --batch=N            Set batch size (default = %d)\n", gParams.batchSize);
printf("  --device=N           Set cuda device to N (default = %d)\n", gParams.device);
printf("  --iterations=N       Run N iterations (default = %d)\n", gParams.iterations);
printf("  --avgRuns=N          Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns);
printf("  --workspace=N        Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize);
printf("  --half2              Run in paired fp16 mode (default = false)\n");
printf("  --int8               Run in int8 mode (default = false)\n");
printf("  --verbose            Use verbose logging (default = false)\n");
printf("  --hostTime	       Measure host time rather than GPU time (default = false)\n");
printf("  --engine=<file>      Generate a serialized GIE engine\n");
printf("  --calib=<file>       Read INT8 calibration cache file\n");

fflush(stdout);

}

static ICudaEngine* createEngine()
{
ICudaEngine *engine;

if (!gParams.deployFile.empty()) {
	engine = caffeToGIEModel();
	if (!engine)
	{
		std::cerr << "Engine could not be created" << std::endl;
		return nullptr;
	}


	if (!gParams.engine.empty())
	{
		std::ofstream p(gParams.engine);
		if (!p)
		{
			std::cerr << "could not open plan output file" << std::endl;
			return nullptr;
		}
		IHostMemory *ptr = engine->serialize();
        assert(ptr);
        p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size());
        ptr->destroy();
	}
	return engine;
}

// load directly from serialized engine file if deploy not specified
if (!gParams.engine.empty()) {
	char *gieModelStream{nullptr};
    size_t size{0};
	std::ifstream file(gParams.engine, std::ios::binary);
	if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        gieModelStream = new char;
        assert(gieModelStream);
        file.read(gieModelStream, size);
		file.close();
	}

	IRuntime* infer = createInferRuntime(gLogger);
	engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr);
    if (gieModelStream) delete [] gieModelStream;

	// assume input to be "data" for deserialized engine
	gInputs.push_back("data");
	return engine;
}

// complain about empty deploy file
std::cerr << "Deploy file not specified" << std::endl;
return nullptr;

}

int main(int argc, char** argv)
{
// create a GIE model from the caffe model and serialize it to a stream

gParams.deployFile="model/test.prototxt";
gParams.modelFile="model/mnist.caffemodel";
gParams.calibrationCache="";
gParams.engine="";
gParams.outputs.push_back("fc2");
gParams.device=0;
gParams.batchSize=1;
gParams.workspaceSize=16;

std::string working_file="test.txt";

cudaSetDevice(gParams.device);

if (gParams.outputs.size() == 0)
{
	std::cerr << "At least one network output must be defined" << std::endl;
	return -1;
}

ICudaEngine* engine = createEngine();
if (!engine)
{
	std::cerr << "Engine could not be created" << std::endl;
	return -1;
}

doInference(*engine,working_file);
engine->destroy();

return 0;

}

I should say that:

  1. My working environment is ubuntu 16.04, cuda 8.0, cudnn6, TensorRT 2.1. The make progress seems good.

  2. The caffemodel had been trained and u know it’s only mnist, so there shouldn’t be any problem with the caffe net or model.

Can you post your output logs ?

Bro, I don’t know how to output the logs. Please tell me how can I do it.

I mean the command which you run for output and it’s output.
Also, I would like to know if you are getting correct FP32 accuracy or the result accuracy is bad only in INT8 ?
Maybe your calibration process is not working properly. Did you made batches by yourself or you used the batches already given in the data/mnist folder in TensorRT

Please mention steps which model and weights files are you using and did you made it.(or how you trained the model)