Hi,
Please check this sample:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#define INPUT_BLOB "data"
#define OUTPUT_BLOB "prob"
#define ONE_MBYTE (1024*1024)
// ILogger implementation that echoes messages to std::cout, dropping only
// those whose severity is kINFO. A single global instance, gLogger, is
// defined together with the class.
class Logger : public nvinfer1::ILogger
{
    // Receives one log message; kINFO messages are discarded, every other
    // severity is written to std::cout followed by a newline.
    void log(Severity severity, const char* msg) override
    {
        if (severity == Severity::kINFO)
        {
            return;
        }
        std::cout << msg << std::endl;
    }
} gLogger;
void printMemInfo()
{
size_t free_byte ;
size_t total_byte ;
cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
if ( cudaSuccess != cuda_status ){
printf("Error: cudaMemGetInfo fails, %s\n", cudaGetErrorString(cuda_status));
exit(1);
}
double free_db = (double)free_byte ;
double total_db = (double)total_byte ;
double used_db = total_db - free_db ;
printf(" GPU memory usage: used = %.2f MB, free = %.2f MB, total = %.2f MB\n", used_db/ONE_MBYTE, free_db/ONE_MBYTE, total_db/ONE_MBYTE);
}
int main(int argc, char** argv)
{
if( argc < 5 )
{
std::cout << "Usage: " << argv[0] << " [prototxt] [caffemodel] [PLAN] [image]" << std::endl;
exit(-1);
}
printf("Initial memory:");
printMemInfo();
cudaError_t res = cudaFree(0);
printf("After CUDA initialization:");
printMemInfo();
cublasContext* cublas;
cublasCreate(&cublas);
cublasDestroy(cublas);
printf("After loading cuBLAS library:");
printMemInfo();
cudnnContext* cudnn;
cudnnCreate(&cudnn);
cudnnDestroy(cudnn);
printf("After loading cuDNN library:");
printMemInfo();
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
builder->destroy();
printf("After loading TensorRT library:");
printMemInfo();
printf("Before process:");
printMemInfo();
std::ifstream cache( argv[3] );
std::stringstream modelStream;
modelStream.seekg(0, modelStream.beg);
// caffe -> PLAN
if( !cache )
{
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(argv[1], argv[2], *network, nvinfer1::DataType::kFLOAT);
nvinfer1::ITensor* output = blobNameToTensor->find(OUTPUT_BLOB);
network->markOutput(*output);
builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize((16 << 20));
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
nvinfer1::IHostMemory* serializer = engine->serialize();
modelStream.write((const char*)serializer->data(), serializer->size());
std::ofstream output_obj;
output_obj.open(argv[3]);
output_obj << modelStream.rdbuf();
output_obj.close();
network->destroy();
parser->destroy();
engine->destroy();
builder->destroy();
modelStream.seekg(0, modelStream.beg);
}
else
{
modelStream << cache.rdbuf();
cache.close();
}
// PLAN -> engine
modelStream.seekg(0, std::ios::end);
const int size = modelStream.tellg();
modelStream.seekg(0, std::ios::beg);
void* mem = malloc(size);
modelStream.read((char*)mem, size);
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
nvinfer1::ICudaEngine* engine = infer->deserializeCudaEngine(mem, size, NULL);
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
free(mem);
// buffers
float* input_data;
float* output_data;
nvinfer1::Dims inputDims = engine->getBindingDimensions(engine->getBindingIndex( INPUT_BLOB));
nvinfer1::Dims outputDims = engine->getBindingDimensions(engine->getBindingIndex(OUTPUT_BLOB));
cudaMallocManaged( &input_data, inputDims.d[0]* inputDims.d[1]* inputDims.d[2]*sizeof(float));
cudaMallocManaged(&output_data, outputDims.d[0]*outputDims.d[1]*outputDims.d[2]*sizeof(float));
std::cout << INPUT_BLOB <<": ("<< inputDims.d[0] <<","<< inputDims.d[1] <<","<< inputDims.d[2] <<")"<<std::endl;
std::cout <<OUTPUT_BLOB <<": ("<<outputDims.d[0] <<","<<outputDims.d[1] <<","<<outputDims.d[2] <<")"<<std::endl;
// inference
const clock_t begin_time = clock();
cv::Mat image;
image = cv::imread(argv[4]);
cv::Mat resized;
cv::resize(image, resized, cv::Size(inputDims.d[2], inputDims.d[1]));
// U8+HWC -> float+CHW
size_t plane = inputDims.d[1]*inputDims.d[2];
for( size_t idx=0; idx<inputDims.d[1]*inputDims.d[2]; idx++ )
{
input_data[0*plane+idx] = float(resized.data[3*idx+0])-128;
input_data[1*plane+idx] = float(resized.data[3*idx+1])-128;
input_data[2*plane+idx] = float(resized.data[3*idx+2])-128;
}
cudaDeviceSynchronize();
void* buffers[] = { input_data, output_data };
context->execute(1, buffers);
cudaDeviceSynchronize();
for( size_t i=0; i<outputDims.d[0]; i++ )
{
if( output_data[i] > 0.01f )
std::cout << "index=" << i << ": " << output_data[i] << std::endl;
}
std::cout << "Inference time: " << float( clock()-begin_time)/CLOCKS_PER_SEC/100 << " per image" << std::endl;
nvcaffeparser1::shutdownProtobufLibrary();
//cudaFree(input_data);
//cudaFree(output_data);
cudaFree(buffers[0]);
cudaFree(buffers[1]);
context->destroy();
engine->destroy();
infer->destroy();
printf("after process:");
printMemInfo();
return 0;
}
nvcc -o main -std=c++11 -lnvinfer -lnvparsers -lopencv_core -lopencv_imgproc -lopencv_imgcodecs -lcudnn -lcublas topic_1055977.cpp
Initial memory: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After CUDA initialization: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After loading cuBLAS library: GPU memory usage: used = 2807.63 MB, free = 1148.93 MB, total = 3956.56 MB
After loading cuDNN library: GPU memory usage: used = 3198.71 MB, free = 757.84 MB, total = 3956.56 MB
After loading TensorRT library: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
Before process: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
After process: GPU memory usage: used = 3318.88 MB, free = 637.68 MB, total = 3956.56 MB
Most of the memory is occupied by loading the required libraries, including cuBLAS, cuDNN and TensorRT
(and some is used for image processing).
Thanks.