Cannot deserialize engine: segmentation fault (core dumped)

I am trying to accelerate a detection model with TensorRT on a Jetson TX2, and I added an upsample layer and a leakyReLU layer to the network as plugins. I can successfully generate my engine file, but then I get this error:

Segmentation fault (core dumped)

I later added some print statements to my code and found that the error occurs when deserializing the engine:

ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);

Here are my caffeToGIEModel and doInference functions:

// Builds a TensorRT engine from a Caffe prototxt/caffemodel pair, serializes it into
// gieModelStream, and writes the serialized bytes to engine_file.
// Returns -1 (after printing a message) when engine_file cannot be opened for writing,
// and 0 at the end of the function.
// NOTE(review): this listing has lost its braces and parts of some statements (the
// parser->parse(...) call is cut off and the body of the outputs loop is missing), so the
// exact control flow has to be confirmed against the original file.
int caffeToGIEModel(const std::string& deployFile,					// name for caffe prototxt
		const std::string& modelFile,					// name for model 
		const std::vector<std::string>& outputs,		// network outputs
	        unsigned int maxBatchSize,						// batch size - NB must be at least as large as the batch we want to run with)
		nvcaffeparser1::IPluginFactory* pluginFactory,	// factory for plugin layers
		 IHostMemory *&gieModelStream,					// output stream for the GIE model
                 const std::string& engine_file)                // name for saved engine file
	// create the builder
        std::cout << "start parsing model..." << std::endl;
	IBuilder* builder = createInferBuilder(gLogger);
        std::cout << "start1 parsing model..." << std::endl;
	// parse the caffe model to populate the network, then set the outputs
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser* parser = createCaffeParser();
        std::cout << "start2 parsing model..." << std::endl;
	// Use half precision for the parsed weights when the platform reports fast FP16 support.
	bool fp16 = builder->platformHasFastFp16();
	DataType modelDataType = fp16 ? DataType::kHALF : DataType::kFLOAT;
	// NOTE(review): the remaining arguments of parse(...) are missing from this listing;
	// no use of pluginFactory or maxBatchSize is visible in this function as shown.
	const IBlobNameToTensor* blobNameToTensor = parser->parse( deployFile.c_str(),
        std::cout << "start3 parsing model..." << std::endl;
	// specify which tensors are outputs
	// NOTE(review): the loop body is missing from this listing.
	for (auto& s : outputs)

        std::cout << "start4 parsing model..." << std::endl;
	// Build the engine
	// 1 << 30 bytes = 1 GiB of builder workspace.
	builder->setMaxWorkspaceSize(1 << 30);
	if(fp16) builder->setHalf2Mode(true);

	ICudaEngine* engine = builder->buildCudaEngine(*network);
    std::cout << "start5 parsing model..." << std::endl;
	// we don't need the network any more, and we can destroy the parser
	// NOTE(review): no destroy calls for network, parser, engine or builder are visible in
	// this listing; engine is also dereferenced below without a null check.
        std::cout << "start6 parsing model..." << std::endl;
	// save and serialize the engine, then close everything down
    gieModelStream = engine->serialize();
	// NOTE(review): size() is printed with %d; it presumably returns std::size_t, in which
	// case %zu would be the matching conversion — confirm.
    fprintf(stdout, "allocate memory size: %d bytes\n", gieModelStream->size());
	std::ofstream outfile(engine_file.c_str(), std::ios::out | std::ios::binary);
	if (!outfile.is_open()) {
		fprintf(stderr, "fail to open file to write: %s\n", engine_file.c_str());
	return -1;
	// Write the serialized engine bytes to the engine file.
	unsigned char* p = (unsigned char*)gieModelStream->data();
	outfile.write((char*)p, gieModelStream->size());

	std::cout << "start7 parsing model..." << std::endl;
	// NOTE(review): gieModelStream is a reference to the caller's pointer (see the parameter
	// list). destroy() is called on it here and the pointer is not reset to nullptr, so the
	// caller is presumably left holding a pointer to a released object and must not use it.
    if(gieModelStream) gieModelStream->destroy();

    std::cout << "End parsing model.qq.." << std::endl;
	return 0;
// Runs one inference: allocates device buffers for the input and the three outputs, copies
// `input` to the device, enqueues the execution context on a new CUDA stream, and issues
// asynchronous copies of the three outputs back into output0/output1/output2. Returns 0.
// NOTE(review): this listing has lost its braces and possibly other lines; confirm the
// missing parts against the original file.
int doInference(IExecutionContext& context, float* input, float* output0, float* output1, float* output2, int batchSize)
	const ICudaEngine& engine = context.getEngine();
	// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
	// of these; the assert below expects four bindings (one input and three outputs).
	assert(engine.getNbBindings() == 4);
	void* buffers[4];

	// In order to bind the buffers, we need to know the names of the input and output tensors.
	// note that indices are guaranteed to be less than IEngine::getNbBindings()
	int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), 
	outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
	outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1),
	outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);

    std::cout << "1---------------------" << std::endl;
	// create GPU buffers and a stream
	// NOTE(review): the allocations below are sized for a single sample (factor 1), while the
	// copies further down are sized with batchSize; a batchSize greater than 1 would copy
	// more bytes than were allocated.
	 checkCudaErrors(cudaMalloc(&buffers[outputIndex0],  1 * OUTPUT_SIZE0 * sizeof(float))) ; // bbox_pred
	 checkCudaErrors(cudaMalloc(&buffers[outputIndex1],  1 * OUTPUT_SIZE1 * sizeof(float))) ;  // cls_prob
	 checkCudaErrors(cudaMalloc(&buffers[outputIndex2],  1 * OUTPUT_SIZE2 * sizeof(float))) ;                // rois

	 checkCudaErrors(cudaMalloc(&buffers[inputIndex],    1 * INPUT_C*INPUT_H * INPUT_W * sizeof(float))) ;

         std::cout << "2---------------------" << std::endl;
	 cudaStream_t stream;
	 checkCudaErrors(cudaStreamCreate(&stream)) ;
         std::cout << "3---------------------" << std::endl;

	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	checkCudaErrors(cudaMemcpyAsync(buffers[inputIndex], input, batchSize *INPUT_C* INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)) ;
	context.enqueue(batchSize, buffers, stream, nullptr);

	checkCudaErrors(cudaMemcpyAsync(output0, buffers[outputIndex0], batchSize * OUTPUT_SIZE0 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;
	checkCudaErrors(cudaMemcpyAsync(output1, buffers[outputIndex1], batchSize * OUTPUT_SIZE1 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;
	checkCudaErrors(cudaMemcpyAsync(output2, buffers[outputIndex2], batchSize * OUTPUT_SIZE2 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;


	// release the stream and the buffers
	// NOTE(review): no stream synchronization, cudaStreamDestroy or cudaFree call is visible
	// in this listing; if they are absent in the original too, the outputs may be read before
	// the asynchronous copies finish and the device buffers are never freed.
	return 0;

And this is my main function:

// Program entry point: builds the engine file via caffeToGIEModel, prepares the input
// tensor, reads the serialized engine from engine_file, deserializes it and runs one
// inference through doInference.
// NOTE(review): this listing has lost its braces and parts of several statements; the
// notes below mark the places where the original file needs to be consulted.
int main(int argc, char** argv)
	// create a GIE model from the caffe model and serialize it to a stream
	const std::string engine_file { "models/yolov3.engine" };
	PluginFactory pluginFactory;
	IHostMemory *gieModelStream{ nullptr };
	// NOTE(review): this line is garbled in the listing. Together with the two indented
	// lines below it looks like the remains of "try to open the engine file; if that fails,
	// build it" — the open call and the if condition are missing, so confirm.
	fstream _file;"models/yolov3.engine", ios::in);
		cout<<"engine file not created yet!"<<endl;
		caffeToGIEModel("models/yolov3.prototxt", "models/yolov3.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME0 ,OUTPUT_BLOB_NAME1,OUTPUT_BLOB_NAME2}, 1, &pluginFactory, gieModelStream, engine_file);
    // pluginFactory.destroyPlugin();
    std::cout << "start reading image..." << std::endl;
    cv::Mat img = cv::imread("images/570.jpg", 1);
     std::cout<<"image processing finished!"<<endl;   
    // NOTE(review): imagePPM is read in the loop below, but no code that fills it (for
    // example from img) is visible in this listing.
    PPM  imagePPM[1];


    float* data = new float[1*INPUT_C*INPUT_H*INPUT_W];

    // Convert the interleaved image buffer to planar layout with the channel order reversed
    // (index 2 - c), scaling each value by 1/255.
    for (int i = 0, volImg = INPUT_C*INPUT_H*INPUT_W; i < 1; ++i)
        for (int c = 0; c < INPUT_C; ++c)
            for (unsigned j = 0, volChl = INPUT_H*INPUT_W; j < volChl; ++j)
                data[i*volImg + c*volChl + j] = (float(imagePPM[i].buffer[j*INPUT_C + 2 - c]))*1/255.0;
    std::cout<<"data preparing finished!"<<endl;
	// deserialize the engine 
	std::ifstream in_file(engine_file.c_str(), std::ios::in | std::ios::binary);
	if (!in_file.is_open()) {
		// NOTE(review): the message says "to write" although the file is opened for reading.
		fprintf(stderr, "fail to open file to write: %s\n", engine_file.c_str());
        return -1;
	// Determine the file size from the stream positions at the start and at the end.
	std::streampos begin, end;
    begin = in_file.tellg();
    in_file.seekg(0, std::ios::end);
    end = in_file.tellg();
	std::size_t size = end - begin;
	// NOTE(review): size is a std::size_t but is printed with %d; %zu would match.
	fprintf(stdout, "engine file size: %d bytes\n", size);
	in_file.seekg(0, std::ios::beg);
	// NOTE(review): this line is garbled in the listing; it presumably allocated a buffer of
	// `size` bytes and read the file into it. As shown, `new unsigned char` allocates a
	// single byte — confirm against the original file.
	std::unique_ptr<unsigned char[]> engine_data(new unsigned char);*)engine_data.get(), size);

	IRuntime* runtime = createInferRuntime(gLogger);
	std::cout << "start deserializeCudaEngine model..." << std::endl;
	// NOTE(review): the bytes read from engine_file are held in engine_data / size, but this
	// call passes gieModelStream instead. gieModelStream is initialised to nullptr above and
	// is only assigned inside caffeToGIEModel, which calls destroy() on it before returning,
	// so dereferencing it here is a likely cause of the reported segmentation fault.
	// Presumably engine_data.get() and size were intended — confirm.
	ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
	std::cout << "end deserializeCudaEngine model..." << std::endl;
	// NOTE(review): engine is used without a null check.
	IExecutionContext *context = engine->createExecutionContext();

	// run inference

	// NOTE(review): prob1, prob2 and prob3 are not declared anywhere in this listing.
	doInference(*context, data, prob1,prob2,prob3, 1);
        printf("successfully inferenced!!!\n");
	// destroy the engine
	// NOTE(review): no destroy calls and no delete[] for data are visible in this listing.

	// print a histogram of the output distribution

	return 0;

I'm new to TensorRT, and I have several questions:
1. Why can't I deserialize the engine? Is there anything wrong in my code?
2. I can successfully generate my engine file. Does that demonstrate that my network with IPlugin layers can parse my caffemodel and extract its weights?
3. About "builder->setMaxWorkspaceSize(1 << 20);": how do I decide the size of the workspace? My engine file is about 120 MB; should I set it to "builder->setMaxWorkspaceSize(1 << 120);"?

Any suggestions are appreciated! Thanks for your help, and have a nice day!


1. Could you check whether your plugin implementation can be correctly re-created from the serialized file?
Here is an example for your reference:

2. You can serialize weights for a plugin layer.
We also have a sample to demonstrate this: /usr/src/tensorrt/samples/samplePlugin

3. The workspace size indicates the amount of memory the GPU implementation may use; it is not related to the engine file size.
Please find our document for the suggestion on setting the value: