Hi all:
I am facing an (Segmentation fault) error in following code.I Think when this line is executed then this error occurs.
ICudaEngine *engine = builder->buildCudaEngine(*network);
Can anybody guide me how to solve this error?
#include </usr/include/opencv2/core.hpp>
#include </usr/include/opencv2/highgui.hpp>
#include </usr/include/opencv2/calib3d.hpp>
#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <cmath>
#include <time.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <map>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cstring>
#include <ctime>
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace cv;
#define CHECK(status) \
{ \
if (status != 0) \
{ \
std::cout << "Cuda failure: " << status;\
abort(); \
} \
}
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
const int channel=1;
const int BATCH_SIZE=1;
std::stringstream gieModelStream;
bool mEnableFP16=false;
bool mOverride16=false;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "result";
class Logger : public ILogger
{
void log(Severity severity, const char* msg) override
{
// suppress info-level messages
if (severity != Severity::kINFO)
std::cout << msg << std::endl;
}
} gLogger;
int main(int argc, char** argv)
{
int start_s = clock();
std::cout << "Starting execution" << std::endl;
std::cout << "Hello jetson"<< std::endl;
Mat image;
image=imread("seven.bmp",0);
std::vector<uint8_t> array;
if (image.isContinuous()) {
array.assign((uint8_t*)image.datastart, (uint8_t*)image.dataend);
} else {
for (int i = 0; i < image.rows; ++i) {
array.insert(array.end(), image.ptr<uint8_t>(i), image.ptr<uint8_t>(i)+image.cols);
}}
//printing values
for(int j=0;j<28;j++){
for (int i = 0; i < 28; i++)
std::cout << array[i+j*28] << " ";
std::cout<<"\n";
}
void** mInputCPU= (void**)malloc(2*sizeof(void*));
cudaHostAlloc((void**)&mInputCPU[0], INPUT_H*channel*INPUT_W*sizeof(uint8_t), cudaHostAllocDefault);
mInputCPU[0]=&array[0];
//Building and inference code
IHostMemory *serialized{nullptr};
IHostMemory *deserialized{nullptr};
// create the builder
IBuilder* builder = createInferBuilder(gLogger);
const char* prototxt="lenet.prototxt";
const char* caffemodel="lenet.caffemodel";
mEnableFP16 = (mOverride16 == true) ? false : builder->platformHasFastFp16();
printf( "platform %s FP16 support.\n", mEnableFP16 ? "has" : "does not have");
printf( "loading %s %s\n", prototxt, prototxt);
nvinfer1::DataType modelDataType = mEnableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; // create a 16-bit model if it's natively supported
// parse the caffe model to populate the network, then set the outputs and create an engine
INetworkDefinition* network = builder->createNetwork();
ICaffeParser *parser = createCaffeParser();
const IBlobNameToTensor *blobNameToTensor =
parser->parse(prototxt, // caffe deploy file
caffemodel, // caffe model file
*network, // network definition that the parser will populate
modelDataType);
assert(blobNameToTensor != NULL); //abort program if false
// the caffe file has no notion of outputs
// so we need to manually say which tensors the engine should generate
network->markOutput(*blobNameToTensor->find(OUTPUT_BLOB_NAME));
builder->setMaxBatchSize(1);
// maximum GPU temporary memory which the engine can use at execution time
builder->setMaxWorkspaceSize(16 << 20);//WORKSPACE_SIZE); //how to decide?
// set up the network for paired-fp16 format
if(mEnableFP16)
//Half2 mode is a paired image mode that is significantly faster for batch sizes greater than one on platforms with FP16 support
builder->setHalf2Mode(true);
// Eliminate the side-effect from the delay of GPU frequency boost
//this parameter controls the number of iteratios used in minimization
builder->setMinFindIterations(10);
builder->setAverageFindIterations(10); //when timing layers builder minimizes over a set of average times for layer execution
printf("fine till here \n");
//build
ICudaEngine *engine = builder->buildCudaEngine(*network);// an engine for executing inference on a built network
printf("fine till here \n");
assert(engine);
//serialized=engine->serialize();
gieModelStream.seekg(0, gieModelStream.beg);
// destroy
network->destroy();
parser->destroy();
//serialize
std::cout << "serializing" << std::endl;
nvinfer1::IHostMemory* serMem = engine->serialize();
if( !serMem )
{
printf("failed to serialize CUDA engine\n");
return false;
}
gieModelStream.write((const char*)serMem->data(), serMem->size());
std::cout << "printing size of bytes allocated \t" << (serMem->size())<< std::endl;
//........................ saving serialized object to file ........................
std::ofstream SaveFile("optimize",std::ios::out|std::ios::binary);
SaveFile.seekp(0,std::ios::beg);
SaveFile << gieModelStream.rdbuf();
gieModelStream.seekg(0, gieModelStream.beg);
SaveFile.close();
//....................... deserealization of object file......................
std::ifstream file("optimize",std::ios::in|std::ios::binary);
file.seekg(0);
gieModelStream << file.rdbuf();
gieModelStream.seekg(0, std::ios::end);
const int modelSize = gieModelStream.tellg();
gieModelStream.seekg(0, std::ios::beg);
std::cout << "printing size of read file bytes \t" << (modelSize)/2<< std::endl;
void* modelMem = malloc(modelSize);
if( !modelMem )
{
printf("failed to allocate %i bytes to deserialize model\n", modelSize);
return 0;
}
gieModelStream.read((char*)modelMem, modelSize);
//creating inference object
nvinfer1::IRuntime* infer = createInferRuntime(gLogger);
engine = infer->deserializeCudaEngine(modelMem, modelSize, 0);
//create context for execution
IExecutionContext *context = engine->createExecutionContext();
uint8_t prob[OUTPUT_SIZE*BATCH_SIZE];
IEngine::getNbBindings(),
assert(engine->getNbBindings() == 2);
void* buffers[2];
int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t)));
CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE *channel* OUTPUT_SIZE * sizeof(uint8_t)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], mInputCPU[0], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream));
context->enqueue(BATCH_SIZE, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE*sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
file.close();
imshow("frame",image);
waitKey(0);
for(int i=0;i<10;i++)
std::cout<<prob[i]<<" "<<std::endl;
delete &array[0];
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
free(modelMem);
int stop_s = clock();
std::cout << "time: "<<(stop_s-start_s)<<std::endl;
return 0;
}