@NVES
I’m working on a wrapper for TensorRT inference. The environment is: TensorRT-5.0.2.6, a Titan Xp GPU, CUDA 9.0.176, and cuDNN 7.3.1.
First I prepared the engine plan file with the SSD sample. My sampleSSD.cpp is forked from TensorRT-5.0.2.6/samples/sampleSSD, with a few modifications to save the serialized engine to a local file so it can be reused without parsing the network every time.
In the function caffeToTRTModel, the serialized stream is written to a file after the engine is serialized:
// Serialize the engine, then close everything down
(*trtModelStream) = engine->serialize();
nvinfer1::IHostMemory* gieModelStream = engine->serialize();
std::ofstream outfile(engine_file.c_str(), std::ios::out | std::ios::binary);
if (!outfile.is_open()) {
    fprintf(stderr, "fail to open engine file: %s\n", engine_file.c_str());
}
outfile.write(static_cast<const char*>(gieModelStream->data()), gieModelStream->size());
outfile.close();
gieModelStream->destroy();  // release the serialized buffer once it has been written out
engine->destroy();
builder->destroy();
At test time, the function loadGIEEngine is called to deserialize the engine from the file, and the engine is then used to create an IExecutionContext for inference. loadGIEEngine looks like:
nvinfer1::ICudaEngine* loadGIEEngine(const std::string planFilePath) {
    // read the plan file into a stringstream (binary mode, since the plan is binary data)
    std::cout << "Loading TRT Engine: " << planFilePath << std::endl;
    std::stringstream gieModelStream;
    std::ifstream cache(planFilePath, std::ios::binary);
    assert(cache.good());
    gieModelStream << cache.rdbuf();
    cache.close();
    // calculate the model size
    gieModelStream.seekg(0, std::ios::end);
    const int modelSize = gieModelStream.tellg();
    gieModelStream.seekg(0, std::ios::beg);
    void* modelMem = malloc(modelSize);
    gieModelStream.read((char*)modelMem, modelSize);
    // deserialize the engine from the in-memory buffer
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
    free(modelMem);
    runtime->destroy();
    std::cout << "Loading Complete!" << std::endl;
    return engine;
}
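For context, the returned engine is then used roughly like this in my wrapper (just a sketch; the plan file name and the doInference call are placeholders from my own code, not from the sample):
nvinfer1::ICudaEngine* engine = loadGIEEngine("ssd.engine");  // hypothetical plan file name
assert(engine != nullptr);
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
// ... doInference(*context, input, detOutput, keepCount, batchSize) is called per batch ...
context->destroy();
engine->destroy();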
Saving and reloading the engine works fine. I then used the engine plan file in another project. The same loadGIEEngine function is reused, and for the inference part I followed the original code in the SSD sample, except that I changed the type of the input pointer:
void TRTInference::doInference(nvinfer1::IExecutionContext& context, unsigned char* input, float* detOutput, int* keepCount, int batchSize) {
    // Input and output buffer pointers that we pass to the engine - the engine requires exactly
    // IEngine::getNbBindings() of these; in this case we know there is exactly 1 input and 2 outputs.
    const nvinfer1::ICudaEngine& engine = context.getEngine();
    assert(engine.getNbBindings() == 3);
    void* buffers[3];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(m_input_blob_name.c_str()),
        outputIndex0 = engine.getBindingIndex(m_det_output_blob_name.c_str()),
        outputIndex1 = engine.getBindingIndex(m_keep_count_blob_name.c_str());
    // Create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * m_input_c * m_input_h * m_input_w * sizeof(float))); // Data
    CHECK(cudaMalloc(&buffers[outputIndex0], batchSize * m_keep_topk * 7 * sizeof(float)));                 // Detection_out
    CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * sizeof(int)));                                     // KeepCount (boxes kept per batch item)
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA the input to the GPU, execute the batch asynchronously, and DMA the results back.
    // input is an unsigned char* but points to float data produced by cv::dnn::blobFromImages.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * m_input_c * m_input_h * m_input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(detOutput, buffers[outputIndex0], batchSize * m_keep_topk * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex0]));
    CHECK(cudaFree(buffers[outputIndex1]));
}
The unsigned char* input comes from cv::Mat::data of a blob created with cv::dnn::blobFromImages, i.e. it represents a batch of images.
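For reference, the blob is prepared roughly like this (a sketch; the input size and mean values are placeholders, not my exact preprocessing):
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
std::vector<cv::Mat> images = { cv::imread("img0.jpg"), cv::imread("img1.jpg") };
// blobFromImages returns an NCHW CV_32F blob, so blob.data actually points to float values
cv::Mat blob = cv::dnn::blobFromImages(images, 1.0, cv::Size(300, 300), cv::Scalar(104.0, 117.0, 123.0), false, false);
unsigned char* input = blob.data;  // passed to doInference, which copies it as float data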
While testing this project, this warning came up:
WARNING: Using an engine plan file across different models of devices is not recommended and is likely to affect performance or even cause errors.
The engine deserialization part seems OK, but then an error
ERROR: Parameter check failed at: engine.cpp::enqueue::295, condition: bindings[x] != nullptr
came up in the function doInference, at this line:
context.enqueue(batchSize, buffers, stream, nullptr);
I’m not sure what this error message means. What could be the cause, and where should I look?
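For reference, a sanity check I could add before the enqueue call (just a sketch; the index variables are the ones from doInference above) would be to dump the binding table and verify the indices:
// Sketch: print the binding table and make sure no binding index is -1
for (int i = 0; i < engine.getNbBindings(); ++i) {
    std::cout << "binding " << i << ": " << engine.getBindingName(i)
              << (engine.bindingIsInput(i) ? " (input)" : " (output)") << std::endl;
}
// getBindingIndex returns -1 when a name does not match any binding,
// which would leave that buffers[] slot unset (nullptr)
assert(inputIndex >= 0 && outputIndex0 >= 0 && outputIndex1 >= 0);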
Thanks for your reply.