Segmentation fault (core dumped) while doing Tensorrt optimization of lenet

Hi all:
I am facing a segmentation fault in the following code. I think the error occurs when this line is executed:

ICudaEngine *engine = builder->buildCudaEngine(*network);

Can anybody guide me how to solve this error?

#include </usr/include/opencv2/core.hpp>
#include </usr/include/opencv2/highgui.hpp>
#include </usr/include/opencv2/calib3d.hpp>

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <cmath>
#include <time.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <map>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cstring>

#include <ctime>

using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace cv;

// Aborts the program if a CUDA runtime call reports failure.
//
// `status` is evaluated exactly once (so CHECK(cudaMalloc(...)) never repeats
// the call), any non-zero value is treated as an error, and the message is
// flushed before abort() because abort() does not flush stream buffers.
// The do/while(0) wrapper makes `CHECK(x);` behave as a single statement,
// including inside an unbraced if/else.
#define CHECK(status)								\
	do									\
	{									\
		const auto check_status_ = (status);				\
		if (check_status_ != 0)						\
		{								\
			std::cout << "Cuda failure: " << check_status_ << std::endl;	\
			abort();						\
		}								\
	} while (0)

static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
const int channel=1;
const int BATCH_SIZE=1;
std::stringstream gieModelStream;

bool mEnableFP16=false;
bool mOverride16=false;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "result";

class Logger : public ILogger
	void log(Severity severity, const char* msg) override
		// suppress info-level messages
		if (severity != Severity::kINFO)
			std::cout << msg << std::endl;
} gLogger;

int main(int argc, char** argv)
	int start_s = clock();
	std::cout << "Starting execution" << std::endl;

   std::cout << "Hello jetson"<< std::endl;
	Mat image;
std::vector<uint8_t> array;
if (image.isContinuous()) {
array.assign((uint8_t*)image.datastart, (uint8_t*)image.dataend);
} else {
for (int i = 0; i < image.rows; ++i) {
array.insert(array.end(), image.ptr<uint8_t>(i), image.ptr<uint8_t>(i)+image.cols);
//printing values
for(int j=0;j<28;j++){
for (int i = 0; i < 28; i++)
std::cout << array[i+j*28] << " ";  

	void** mInputCPU= (void**)malloc(2*sizeof(void*));
	cudaHostAlloc((void**)&mInputCPU[0],  INPUT_H*channel*INPUT_W*sizeof(uint8_t), cudaHostAllocDefault);
//Building and inference code
IHostMemory *serialized{nullptr};
IHostMemory *deserialized{nullptr};

	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);
	const char* prototxt="lenet.prototxt";
	const char* caffemodel="lenet.caffemodel";

	mEnableFP16 = (mOverride16 == true) ? false : builder->platformHasFastFp16();
	printf( "platform %s FP16 support.\n", mEnableFP16 ? "has" : "does not have");
	printf( "loading %s %s\n", prototxt, prototxt);

	nvinfer1::DataType modelDataType = mEnableFP16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; // create a 16-bit model if it's natively supported

	// parse the caffe model to populate the network, then set the outputs and create an engine
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser *parser = createCaffeParser();
	const IBlobNameToTensor *blobNameToTensor =
				parser->parse(prototxt,		// caffe deploy file
				caffemodel,		// caffe model file
				*network,		// network definition that the parser will populate

	assert(blobNameToTensor != NULL); //abort program if false
	// the caffe file has no notion of outputs
	// so we need to manually say which tensors the engine should generate
// maximum GPU temporary memory which the engine can use at execution time
	builder->setMaxWorkspaceSize(16 << 20);//WORKSPACE_SIZE);   //how to decide?

	// set up the network for paired-fp16 format
//Half2 mode is a paired image mode that is significantly faster for batch sizes greater than one on platforms with FP16 support

	// Eliminate the side-effect from the delay of GPU frequency boost
//this parameter controls the number of iteratios used in minimization
	builder->setAverageFindIterations(10); //when timing layers builder minimizes over a set of average times for layer execution
printf("fine till here \n");
ICudaEngine *engine = builder->buildCudaEngine(*network);// an engine for executing inference on a built network
printf("fine till here \n");
    gieModelStream.seekg(0, gieModelStream.beg);
	// destroy


	std::cout << "serializing" << std::endl;

    nvinfer1::IHostMemory* serMem = engine->serialize();

	if( !serMem )
		printf("failed to serialize CUDA engine\n");
		return false;

	gieModelStream.write((const char*)serMem->data(), serMem->size());

std::cout << "printing size of bytes allocated \t" << (serMem->size())<< std::endl;

//........................ saving serialized object to file ........................
std::ofstream SaveFile("optimize",std::ios::out|std::ios::binary);
SaveFile << gieModelStream.rdbuf();
gieModelStream.seekg(0, gieModelStream.beg);


//....................... deserealization of object file......................

std::ifstream file("optimize",std::ios::in|std::ios::binary);



    gieModelStream << file.rdbuf();

    gieModelStream.seekg(0, std::ios::end);
   const int modelSize = gieModelStream.tellg();
	gieModelStream.seekg(0, std::ios::beg);
std::cout << "printing size of read file bytes \t" << (modelSize)/2<< std::endl;
	void* modelMem = malloc(modelSize);

	if( !modelMem )
		printf("failed to allocate %i bytes to deserialize model\n", modelSize);
		return 0;
	}*)modelMem, modelSize);
    //creating inference object
    nvinfer1::IRuntime* infer = createInferRuntime(gLogger);  
    engine = infer->deserializeCudaEngine(modelMem, modelSize, 0); 
	//create context for execution
	IExecutionContext *context = engine->createExecutionContext();

	uint8_t prob[OUTPUT_SIZE*BATCH_SIZE];
	assert(engine->getNbBindings() == 2);
	void* buffers[2];

	int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); 
	int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

	CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t)));
	CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE *channel* OUTPUT_SIZE * sizeof(uint8_t)));
	cudaStream_t stream;
	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], mInputCPU[0], BATCH_SIZE *channel* INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream));
	context->enqueue(BATCH_SIZE, buffers, stream, nullptr);
	CHECK(cudaMemcpyAsync(prob, buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE*sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));


for(int i=0;i<10;i++)
std::cout<<prob[i]<<" "<<std::endl;

   delete &array[0];
        int stop_s = clock();
        std::cout << "time: "<<(stop_s-start_s)<<std::endl;
return 0;


Could you change the workspace size to a value your device can actually provide?

builder->setMaxWorkspaceSize(1 << 20);

We expect the user to choose the largest workspace they can afford at runtime.
If TensorRT cannot create a network that runs in that amount of space, the builder will fail.

You can find more information in our document:

5.2. Choosing The Optimal Workspace Size


Dear AastaLL:

Thanks for your valuable reply. I tried setting setMaxWorkspaceSize() to its maximum value, but it didn’t solve the error. I also tried several other values and got the same error.

builder->setMaxWorkspaceSize(1 << 31);

Then I debugged my program using gdb on Ubuntu and got the following message:

Thread 1 "output" received signal SIGSEGV, Segmentation fault.
0x0000007fb10f9d9c in nvinfer1::Network::validate(nvinfer1::cudnn::HardwareContext const&, bool, bool, int) const ()
   from /usr/lib/aarch64-linux-gnu/

Now, can you give any suggestion?


If you are using a TX2, please lower the workspace size.

builder->setMaxWorkspaceSize(1 << 20);

By the way, do you also hit this error with our official samples?

Sorry for the late reply. No, I didn’t face this issue with the official samples.

I have another problem: I want to store my optimized model on disk for later use, because I don’t want to re-optimize the model every time I run inference. I tried storing it using C++, but it didn’t work.
Can you please guide me in this regard?



You can serialize the optimized PLAN directly.
Here is an example in Jetson_inference: