Segmentation fault (core dumped) during TensorRT optimization of LeNet

Hi all:
I am facing a "Segmentation fault" error in the following code. I think the error occurs when this line is executed:

ICudaEngine *engine = builder->buildCudaEngine(*network);

Can anybody guide me on how to solve this error?

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/calib3d.hpp>

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <map>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cstring>

#include <ctime>


using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace cv;

#define CHECK(status)					\
{							\
    if (status != 0)				\
    {						\
        std::cout << "Cuda failure: " << status;\
		abort();				\
	}						\
}


static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
const int channel=1;
const int BATCH_SIZE=1;
std::stringstream gieModelStream;


bool mEnableFP16=false;
bool mOverride16=false;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "result";

class Logger : public ILogger
{
	void log(Severity severity, const char* msg) override
	{
		// suppress info-level messages
		if (severity != Severity::kINFO)
			std::cout << msg << std::endl;
	}
} gLogger;


int main(int argc, char** argv)
{
	int start_s = clock();
	std::cout << "Starting execution" << std::endl;


   std::cout << "Hello jetson"<< std::endl;
	Mat image;
    image=imread("seven.bmp",0); 
std::vector<uint8_t> array;
if (image.isContinuous()) {
array.assign((uint8_t*)image.datastart, (uint8_t*)image.dataend);
} else {
for (int i = 0; i < image.rows; ++i) {
array.insert(array.end(), image.ptr<uint8_t>(i), image.ptr<uint8_t>(i)+image.cols);
}}
//printing values
for(int j=0;j<28;j++){
for (int i = 0; i < 28; i++)
std::cout << array[i+j*28] << " ";  
std::cout<<"\n";  
    }


	void** mInputCPU= (void**)malloc(2*sizeof(void*));
	cudaHostAlloc((void**)&mInputCPU[0],  INPUT_H*channel*INPUT_W*sizeof(uint8_t), cudaHostAllocDefault);
    mInputCPU[0]=&array[0];
    
//Building and inference code
IHostMemory *serialized{nullptr};
IHostMemory *deserialized{nullptr};


	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);
	const char* prototxt="lenet.prototxt";
	const char* caffemodel="lenet.caffemodel";

	mEnableFP16 = (mOverride16 == true) ? false : builder->platformHasFastFp16();
	printf("platform %s FP16 support.\n", mEnableFP16 ? "has" : "does not have");
	printf("loading %s %s\n", prototxt, caffemodel);

	nvinfer1::DataType modelDataType = mEnableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; // create a 16-bit model if it's natively supported

	// parse the caffe model to populate the network, then set the outputs and create an engine
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser *parser = createCaffeParser();
	
	const IBlobNameToTensor *blobNameToTensor =
				parser->parse(prototxt,		// caffe deploy file
				caffemodel,		// caffe model file
				*network,		// network definition that the parser will populate
				modelDataType);
	

	assert(blobNameToTensor != NULL); // abort program if parsing failed
	// the caffe file has no notion of outputs, so we need to manually say
	// which tensors the engine should generate; note that find() returns
	// nullptr if the blob name does not exist in the prototxt
	ITensor* outputTensor = blobNameToTensor->find(OUTPUT_BLOB_NAME);
	assert(outputTensor != NULL);
	network->markOutput(*outputTensor);
       
	builder->setMaxBatchSize(1);
// maximum GPU temporary memory which the engine can use at execution time
	builder->setMaxWorkspaceSize(16 << 20);//WORKSPACE_SIZE);   //how to decide?

	// set up the network for paired-fp16 format
	if(mEnableFP16)
//Half2 mode is a paired image mode that is significantly faster for batch sizes greater than one on platforms with FP16 support
		builder->setHalf2Mode(true);

	// Eliminate the side effect of the delay in GPU frequency boost:
	// these parameters control the number of timing iterations used in minimization
	builder->setMinFindIterations(10);
	builder->setAverageFindIterations(10); // when timing layers, the builder minimizes over a set of average times for layer execution
	printf("fine till here \n");
	// build an engine for executing inference on the network
	ICudaEngine *engine = builder->buildCudaEngine(*network);
	printf("fine till here \n");
	assert(engine);
	//serialized=engine->serialize();
	gieModelStream.seekg(0, gieModelStream.beg);
	// destroy


	network->destroy();
 	parser->destroy();

    //serialize
	std::cout << "serializing" << std::endl;

    nvinfer1::IHostMemory* serMem = engine->serialize();

	if( !serMem )
	{
		printf("failed to serialize CUDA engine\n");
		return false;
	}

	gieModelStream.write((const char*)serMem->data(), serMem->size());



std::cout << "printing size of bytes allocated \t" << (serMem->size())<< std::endl;

//........................ saving serialized object to file ........................
std::ofstream SaveFile("optimize",std::ios::out|std::ios::binary);
SaveFile.seekp(0,std::ios::beg);
SaveFile << gieModelStream.rdbuf();
gieModelStream.seekg(0, gieModelStream.beg);

SaveFile.close();




	//....................... deserialization of the object file ......................
	std::ifstream file("optimize", std::ios::in | std::ios::binary);
	file.seekg(0);

	// reset the stream first, otherwise the file contents are appended to the
	// serialized data already written above and the size is counted twice
	gieModelStream.str("");
	gieModelStream.clear();
	gieModelStream << file.rdbuf();

	gieModelStream.seekg(0, std::ios::end);
	const int modelSize = gieModelStream.tellg();
	gieModelStream.seekg(0, std::ios::beg);
	std::cout << "printing size of read file bytes \t" << modelSize << std::endl;

	void* modelMem = malloc(modelSize);
	if (!modelMem)
	{
		printf("failed to allocate %i bytes to deserialize model\n", modelSize);
		return 0;
	}

	gieModelStream.read((char*)modelMem, modelSize);
    
    
	// create the runtime object and deserialize the engine
	nvinfer1::IRuntime* infer = createInferRuntime(gLogger);
	engine = infer->deserializeCudaEngine(modelMem, modelSize, 0);

	// create a context for execution
	IExecutionContext *context = engine->createExecutionContext();

	uint8_t prob[OUTPUT_SIZE * BATCH_SIZE];

	// the engine is expected to have exactly one input and one output binding
	assert(engine->getNbBindings() == 2);
	void* buffers[2];

	int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); 
	int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

	CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t)));
	CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE *channel* OUTPUT_SIZE * sizeof(uint8_t)));
	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));
	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], mInputCPU[0], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream));
	context->enqueue(BATCH_SIZE, buffers, stream, nullptr);
	CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE*sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);

	
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
	CHECK(cudaFree(buffers[outputIndex]));
	file.close();

imshow("frame",image);
waitKey(0);
for(int i=0;i<10;i++)
std::cout<<prob[i]<<" "<<std::endl;

   delete &array[0];
	engine->destroy();
	builder->destroy();
	shutdownProtobufLibrary();
	free(modelMem);
	int stop_s = clock();
	std::cout << "time: " << (stop_s - start_s) << std::endl;
return 0;
}

Hi,

Could you modify the workspace size to a value that is actually available on your device?
Ex.

builder->setMaxWorkspaceSize(1 << 20);

We expect the user to choose the maximum workspace size they can afford at runtime.
If TensorRT cannot create a network that runs in that amount of space, the builder will fail.
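For example (this is only an illustrative sketch, not part of the original reply, and chooseWorkspaceSize is a hypothetical helper), you can query the free device memory at runtime with cudaMemGetInfo() and cap the workspace below it:

#include <cuda_runtime_api.h>
#include <algorithm>
#include <cstddef>

// Hypothetical helper: return a workspace size no larger than the requested
// value and no larger than half of the currently free device memory.
static size_t chooseWorkspaceSize(size_t requested)
{
	size_t freeMem = 0, totalMem = 0;
	cudaMemGetInfo(&freeMem, &totalMem); // free/total device memory in bytes
	return std::min(requested, freeMem / 2);
}

// usage:
// builder->setMaxWorkspaceSize(chooseWorkspaceSize(16 << 20));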

You can find more information in our document:

5.2. Choosing The Optimal Workspace Size
http://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#workspacesize

Thanks.

Dear AastaLL:

Thanks for your valuable reply. I tried setting setMaxWorkspaceSize() to its maximum value, but it didn't solve the error. I also tried several other values and got the same error.

builder->setMaxWorkspaceSize(1 << 31);

Then I debugged my program using gdb on Ubuntu and got the following message:

Thread 1 "output" received signal SIGSEGV, Segmentation fault.
0x0000007fb10f9d9c in nvinfer1::Network::validate(nvinfer1::cudnn::HardwareContext const&, bool, bool, int) const ()
   from /usr/lib/aarch64-linux-gnu/libnvinfer.so.4

Now, can you give any suggestion?
Thanks.

Hi,

If you are using a TX2, please lower the workspace size.

builder->setMaxWorkspaceSize(1 << 20);

By the way, do you also see this error with our official samples?
Thanks.

Hi,
Sorry for the late reply. No, I didn't face this issue with the official samples.

I have another problem: I want to store my optimized model on the hard disk for later use, so that I don't have to optimize the model every time I run inference. I tried storing it using C++, but it didn't work.
Can you please guide me in this regard?

Thanks.

Hi,

You can serialize the optimized PLAN directly.
Here is an example in jetson-inference:
[url]https://github.com/dusty-nv/jetson-inference/blob/master/tensorNet.cpp#L244[/url]
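
As a rough sketch of the same idea (the file name and helper functions below are illustrative, not taken from the linked sample), the serialized PLAN can be written to a file and loaded back with IRuntime::deserializeCudaEngine():

#include <fstream>
#include <vector>
#include "NvInfer.h"

// Write the optimized engine (PLAN) to disk so it can be reused later.
bool saveEngine(nvinfer1::ICudaEngine* engine, const char* path)
{
	nvinfer1::IHostMemory* plan = engine->serialize();
	if (!plan)
		return false;
	std::ofstream out(path, std::ios::binary);
	out.write((const char*)plan->data(), plan->size());
	plan->destroy();
	return out.good();
}

// Read the PLAN back and rebuild the engine without running the builder again.
nvinfer1::ICudaEngine* loadEngine(nvinfer1::IRuntime* runtime, const char* path)
{
	std::ifstream in(path, std::ios::binary | std::ios::ate);
	if (!in)
		return nullptr;
	std::vector<char> blob(in.tellg());
	in.seekg(0, std::ios::beg);
	in.read(blob.data(), blob.size());
	return runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
}

With this, the builder and parser are only needed once to produce the PLAN; afterwards createInferRuntime() plus the saved file is enough to run inference.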

Thanks.