I aim to accelerate a detection model by using tensorRT on jetson TX2.And I add an upsample layer and a leakyReLU layer plugin into network.However, I can successfully generate my engine file but I have this error:
Segmentation fault (core dumped)
I add some print functions in my code later and I find that this error came out when deserialize engine:
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
here is my caffeToGIE and doInference function:
int caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with)
nvcaffeparser1::IPluginFactory* pluginFactory, // factory for plugin layers
IHostMemory *&gieModelStream, // output stream for the GIE model
const std::string& engine_file) // name for saved engine file
{
// create the builder
std::cout << "start parsing model..." << std::endl;
IBuilder* builder = createInferBuilder(gLogger);
std::cout << "start1 parsing model..." << std::endl;
// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
parser->setPluginFactory(pluginFactory);
std::cout << "start2 parsing model..." << std::endl;
bool fp16 = builder->platformHasFastFp16();
DataType modelDataType = fp16 ? DataType::kHALF : DataType::kFLOAT;
const IBlobNameToTensor* blobNameToTensor = parser->parse( deployFile.c_str(),
modelFile.c_str(),
*network,
modelDataType);
std::cout << "start3 parsing model..." << std::endl;
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));
std::cout << "start4 parsing model..." << std::endl;
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 30);
if(fp16) builder->setHalf2Mode(true);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
std::cout << "start5 parsing model..." << std::endl;
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
std::cout << "start6 parsing model..." << std::endl;
// save and serialize the engine, then close everything down
gieModelStream = engine->serialize();
fprintf(stdout, "allocate memory size: %d bytes\n", gieModelStream->size());
std::ofstream outfile(engine_file.c_str(), std::ios::out | std::ios::binary);
if (!outfile.is_open()) {
fprintf(stderr, "fail to open file to write: %s\n", engine_file.c_str());
return -1;
}
unsigned char* p = (unsigned char*)gieModelStream->data();
outfile.write((char*)p, gieModelStream->size());
outfile.close();
std::cout << "start7 parsing model..." << std::endl;
if(gieModelStream) gieModelStream->destroy();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
std::cout << "End parsing model.qq.." << std::endl;
return 0;
}
int doInference(IExecutionContext& context, float* input, float* output0, float* output1, float* output2, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
std::cout<<engine.getNbBindings()<<std::endl;
assert(engine.getNbBindings() == 4);
void* buffers[4];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1),
outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);
std::cout << "1---------------------" << std::endl;
// create GPU buffers and a stream
checkCudaErrors(cudaMalloc(&buffers[outputIndex0], 1 * OUTPUT_SIZE0 * sizeof(float))) ; // bbox_pred
checkCudaErrors(cudaMalloc(&buffers[outputIndex1], 1 * OUTPUT_SIZE1 * sizeof(float))) ; // cls_prob
checkCudaErrors(cudaMalloc(&buffers[outputIndex2], 1 * OUTPUT_SIZE2 * sizeof(float))) ; // rois
checkCudaErrors(cudaMalloc(&buffers[inputIndex], 1 * INPUT_C*INPUT_H * INPUT_W * sizeof(float))) ;
std::cout << "2---------------------" << std::endl;
cudaStream_t stream;
checkCudaErrors(cudaStreamCreate(&stream)) ;
std::cout << "3---------------------" << std::endl;
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
checkCudaErrors(cudaMemcpyAsync(buffers[inputIndex], input, batchSize *INPUT_C* INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)) ;
context.enqueue(batchSize, buffers, stream, nullptr);
checkCudaErrors(cudaMemcpyAsync(output0, buffers[outputIndex0], batchSize * OUTPUT_SIZE0 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;
checkCudaErrors(cudaMemcpyAsync(output1, buffers[outputIndex1], batchSize * OUTPUT_SIZE1 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;
checkCudaErrors(cudaMemcpyAsync(output2, buffers[outputIndex2], batchSize * OUTPUT_SIZE2 *sizeof(float), cudaMemcpyDeviceToHost, stream)) ;
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
checkCudaErrors(cudaFree(buffers[inputIndex]));
checkCudaErrors(cudaFree(buffers[outputIndex0]));
checkCudaErrors(cudaFree(buffers[outputIndex1]));
checkCudaErrors(cudaFree(buffers[outputIndex2]));
return 0;
}
and this is my main function:
int main(int argc, char** argv)
{
// create a GIE model from the caffe model and serialize it to a stream
const std::string engine_file { "models/yolov3.engine" };
PluginFactory pluginFactory;
IHostMemory *gieModelStream{ nullptr };
fstream _file;
_file.open("models/yolov3.engine", ios::in);
if(!_file)
{
cout<<"engine file not created yet!"<<endl;
caffeToGIEModel("models/yolov3.prototxt", "models/yolov3.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME0 ,OUTPUT_BLOB_NAME1,OUTPUT_BLOB_NAME2}, 1, &pluginFactory, gieModelStream, engine_file);
}
else
{
cout<<"ENGINE FILE ALREADY EXISTS!"<<endl;
}
// pluginFactory.destroyPlugin();
std::cout << "start reading image..." << std::endl;
cv::Mat img = cv::imread("images/570.jpg", 1);
img=Preprocess(img);
std::cout<<"image processing finished!"<<endl;
PPM imagePPM[1];
memcpy(imagePPM[0].buffer,img.data,INPUT_C*INPUT_H*INPUT_W);
float* data = new float[1*INPUT_C*INPUT_H*INPUT_W];
for (int i = 0, volImg = INPUT_C*INPUT_H*INPUT_W; i < 1; ++i)
{
for (int c = 0; c < INPUT_C; ++c)
{
for (unsigned j = 0, volChl = INPUT_H*INPUT_W; j < volChl; ++j)
data[i*volImg + c*volChl + j] = (float(imagePPM[i].buffer[j*INPUT_C + 2 - c]))*1/255.0;
}
}
std::cout<<"data preparing finished!"<<endl;
// deserialize the engine
std::ifstream in_file(engine_file.c_str(), std::ios::in | std::ios::binary);
if (!in_file.is_open()) {
fprintf(stderr, "fail to open file to write: %s\n", engine_file.c_str());
return -1;
}
std::streampos begin, end;
begin = in_file.tellg();
in_file.seekg(0, std::ios::end);
end = in_file.tellg();
std::size_t size = end - begin;
fprintf(stdout, "engine file size: %d bytes\n", size);
in_file.seekg(0, std::ios::beg);
std::unique_ptr<unsigned char[]> engine_data(new unsigned char);
in_file.read((char*)engine_data.get(), size);
in_file.close();
IRuntime* runtime = createInferRuntime(gLogger);
std::cout << "start deserializeCudaEngine model..." << std::endl;
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
std::cout << "end deserializeCudaEngine model..." << std::endl;
IExecutionContext *context = engine->createExecutionContext();
context->setProfiler(&gProfiler);
// run inference
doInference(*context, data, prob1,prob2,prob3, 1);
printf("successfully inferenced!!!\n");
// destroy the engine
context->destroy();
engine->destroy();
gProfiler.printLayerTimes();
runtime->destroy();
pluginFactory.destroyPlugin();
// print a histogram of the output distribution
return 0;
}
I’m new to TensorRT,here I have several questions :
1.Why can’t I der=serialize engine,is there any wrong code?
2.I can successfully generate my engine file,does that demonstrate my network with IPlugin can parse my caffemodel and extract its weights?
3.About"builder->setMaxWorkspaceSize(1 << 20);",how to decide the size of workspace? My engine file is about 120M,should I set it as “builder->setMaxWorkspaceSize(1 << 120);”??
Any suggestions are appreciated!Thanks to your help and wish you a happy day!!!