I modified the sampleMNIST sample for a CIFAR-10 model; below is my code:
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <chrono>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;
// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 32;
static const int INPUT_W = 32;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 10;
static Logger gLogger;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
struct PPM
{
std::string magic, fileName;
int h, w, max;
uint8_t buffer[INPUT_H*INPUT_W*INPUT_C];
};
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}
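// simple PPM (P6) reader - assumes a header without comment lines, followed by binary RGB pixel data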
void readPPMFile(const std::string& filename, PPM& ppm)
{
ppm.fileName = filename;
std::ifstream infile(locateFile(filename), std::ifstream::binary);
infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(ppm.buffer), ppm.w*ppm.h*3);
}
// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}
void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kHALF);
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
// Build the engine
builder->setHalf2Mode(true);
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
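// run one enqueue outside the timer, then time 10 more and report the average latency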
context.enqueue(batchSize, buffers, stream, nullptr);
auto t_start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 10; ++i)
{
    context.enqueue(batchSize, buffers, stream, nullptr);
}
auto t_end = std::chrono::high_resolution_clock::now();
auto ms = std::chrono::duration<float, std::milli>(t_end - t_start).count();
std::cout<<"time duration is : "<< ms/10.0 << " ms"<<std::endl;
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv)
{
// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("cifar10_full.prototxt", "cifar10_full_iter_60000.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);
PPM ppms;
readPPMFile(argv[1], ppms);
// print a separator
std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
// parse the mean file and subtract it from the image
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("cifar10_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());
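// NOTE: the per-pixel mean image in meanData is not used below; hard-coded per-channel means are subtracted instead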
float mean_channels[3] = {0};
mean_channels[0] = 125.3069178;
mean_channels[1] = 122.95039426;
mean_channels[2] = 113.86538316;
float data[INPUT_H*INPUT_W*INPUT_C] = {0};
// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();
IExecutionContext *context = engine->createExecutionContext();
std::ofstream fout("/home/nvidia/shujunhua/tensorRT/sampleMNIST.txt");
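// convert the interleaved HWC (RGB) PPM buffer to planar CHW in BGR order, subtract the per-channel means, and dump the values for inspection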
for (int c = 0; c < INPUT_C; ++c) {
    for (unsigned j = 0, volChl = INPUT_H * INPUT_W; j < volChl; ++j) {
        data[c*volChl + j] = float(ppms.buffer[j*INPUT_C + 2 - c]) - mean_channels[c];
        fout << data[c*volChl + j] << std::endl;
    }
}
fout.close();
meanBlob->destroy();
// run inference
float prob[OUTPUT_SIZE];
doInference(*context, data, prob, 1);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
// print a histogram of the output distribution
std::cout << "\n\n";
float val{0.0f};
int idx{0};
for (unsigned int i = 0; i < 10; i++)
{
    val = std::max(val, prob[i]);
    if (val == prob[i]) idx = i;
    std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << " || " << prob[i] << "\n";
}
std::cout << std::endl;
return (idx == 0 && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Then I run it and get the result below:
nvidia@tegra-ubuntu:/usr/src/tensorrt/bin$ ./sample_mnist 1.ppm
---------------------------
time duration is : 0.300693 ms
0: || 0.0087738
1: *** || 0.345947
2: || 0.0164337
3: || 0.0255432
4: || 0.00112057
5: || 0.00267792
6: * || 0.0835571
7: || 0.00227356
8: || 0.0361633
9: ***** || 0.477539
The label of 1.ppm should be 1, but I get 9.
With the Caffe C++ interface and the Python interface I get the correct result, so could anybody tell me the reason?
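One difference I can see between this and my working Caffe C++/Python runs is precision: the engine above is built with DataType::kHALF weights and setHalf2Mode(true). For comparison, this is the FP32 variant of the builder section I would use to rule precision out (just a sketch of the same caffeToGIEModel code with only the precision settings changed; I have not verified whether it changes the result):

const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
                                                          locateFile(modelFile, directories).c_str(),
                                                          *network,
                                                          DataType::kFLOAT); // parse weights as FP32 instead of kHALF
// specify which tensors are outputs
for (auto& s : outputs)
    network->markOutput(*blobNameToTensor->find(s.c_str()));
// builder->setHalf2Mode(true); // half2 mode left disabled for the FP32 build
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);
ICudaEngine* engine = builder->buildCudaEngine(*network);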
Any help will be appreciated! Thanks!