CUDA memory release

Hi,

Please check this sample:

#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <fstream>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cublas_v2.h>
#include <cudnn.h>

#define INPUT_BLOB "data"
#define OUTPUT_BLOB "prob"
#define ONE_MBYTE (1024*1024)

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if( severity != Severity::kINFO )
            std::cout << msg << std::endl;
    }
} gLogger;

void printMemInfo()
{
    size_t free_byte ;
    size_t total_byte ;
    cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;

    if ( cudaSuccess != cuda_status ){
        printf("Error: cudaMemGetInfo fails, %s\n", cudaGetErrorString(cuda_status));
        exit(1);
    }

    double free_db = (double)free_byte ;
    double total_db = (double)total_byte ;
    double used_db = total_db - free_db ;

    printf(" GPU memory usage: used = %.2f MB, free = %.2f MB, total = %.2f MB\n", used_db/ONE_MBYTE, free_db/ONE_MBYTE, total_db/ONE_MBYTE);
}

int main(int argc, char** argv)
{
    if( argc < 5 )
    {
        std::cout << "Usage: " << argv[0] << " [prototxt] [caffemodel] [PLAN] [image]" << std::endl;
        exit(-1);
    }

    printf("Initial memory:");
    printMemInfo();

    cudaFree(0);    // triggers lazy creation of the CUDA context
    printf("After CUDA initialization:");
    printMemInfo();

    cublasHandle_t cublas;
    cublasCreate(&cublas);
    cublasDestroy(cublas);
    printf("After loading cuBLAS library:");
    printMemInfo();

    cudnnHandle_t cudnn;
    cudnnCreate(&cudnn);
    cudnnDestroy(cudnn);
    printf("After loading cuDNN library:");
    printMemInfo();

    // create and immediately destroy a builder just to pull the TensorRT code into the context
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    builder->destroy();

    printf("After loading TensorRT library:");
    printMemInfo();

    printf("Before process:");
    printMemInfo();

    std::ifstream cache(argv[3], std::ios::binary);    // PLAN files are binary
    std::stringstream modelStream;
    modelStream.seekg(0, modelStream.beg);

    // caffe -> PLAN
    if( !cache )
    {
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        nvinfer1::INetworkDefinition* network = builder->createNetwork();
        nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();

        const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(argv[1], argv[2], *network, nvinfer1::DataType::kFLOAT);
        nvinfer1::ITensor* output = blobNameToTensor->find(OUTPUT_BLOB);
        network->markOutput(*output);

        builder->setMaxBatchSize(1);
        builder->setMaxWorkspaceSize((16 << 20));
        nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);

        nvinfer1::IHostMemory* serializer = engine->serialize();
        modelStream.write((const char*)serializer->data(), serializer->size());
        serializer->destroy();    // release the serialized copy once it has been written out

        std::ofstream output_obj(argv[3], std::ios::binary);
        output_obj << modelStream.rdbuf();
        output_obj.close();

        network->destroy();
        parser->destroy();
        engine->destroy();
        builder->destroy();
        modelStream.seekg(0, modelStream.beg);
    }
    else
    {
        modelStream << cache.rdbuf();
        cache.close();
    }


    // PLAN -> engine
    modelStream.seekg(0, std::ios::end);
    const size_t size = modelStream.tellg();
    modelStream.seekg(0, std::ios::beg);

    void* mem = malloc(size);
    modelStream.read((char*)mem, size);

    nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
    nvinfer1::ICudaEngine* engine = infer->deserializeCudaEngine(mem, size, nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    free(mem);


    // buffers
    float*  input_data;
    float* output_data;
    nvinfer1::Dims inputDims  = engine->getBindingDimensions(engine->getBindingIndex( INPUT_BLOB));
    nvinfer1::Dims outputDims = engine->getBindingDimensions(engine->getBindingIndex(OUTPUT_BLOB));
    cudaMallocManaged( &input_data, inputDims.d[0]* inputDims.d[1]* inputDims.d[2]*sizeof(float));
    cudaMallocManaged(&output_data, outputDims.d[0]*outputDims.d[1]*outputDims.d[2]*sizeof(float));

    std::cout << INPUT_BLOB <<": ("<< inputDims.d[0] <<","<< inputDims.d[1] <<","<< inputDims.d[2] <<")"<<std::endl;
    std::cout <<OUTPUT_BLOB <<": ("<<outputDims.d[0] <<","<<outputDims.d[1] <<","<<outputDims.d[2] <<")"<<std::endl;

    // inference
    const clock_t begin_time = clock();
    cv::Mat image;
    image = cv::imread(argv[4]);

    cv::Mat resized;
    cv::resize(image, resized, cv::Size(inputDims.d[2], inputDims.d[1]));

    // U8+HWC -> float+CHW 
    size_t plane = inputDims.d[1]*inputDims.d[2];
    for( size_t idx=0; idx<plane; idx++ )
    {
        input_data[0*plane+idx] = float(resized.data[3*idx+0])-128;
        input_data[1*plane+idx] = float(resized.data[3*idx+1])-128;
        input_data[2*plane+idx] = float(resized.data[3*idx+2])-128;
    }
    cudaDeviceSynchronize();

    void* buffers[] = { input_data, output_data };
    context->execute(1, buffers);
    cudaDeviceSynchronize();

    for( size_t i=0; i<outputDims.d[0]; i++ )
    {
        if( output_data[i] > 0.01f )
            std::cout << "index=" << i << ": " << output_data[i] << std::endl;
    }

    std::cout << "Inference time: " << float( clock()-begin_time)/CLOCKS_PER_SEC/100 << " per image" << std::endl;

    nvcaffeparser1::shutdownProtobufLibrary();
    cudaFree(input_data);
    cudaFree(output_data);

    context->destroy();
    engine->destroy();
    infer->destroy();

    printf("after process:");
    printMemInfo();

    return 0;
}

Compile with (libraries after the source file, so the linker resolves them correctly):

nvcc -o main -std=c++11 topic_1055977.cpp -lnvinfer -lnvparsers -lopencv_core -lopencv_imgproc -lopencv_imgcodecs -lcudnn -lcublas

Output:

Initial memory: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After CUDA initialization: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After loading cuBLAS library: GPU memory usage: used = 2807.63 MB, free = 1148.93 MB, total = 3956.56 MB
After loading cuDNN library: GPU memory usage: used = 3198.71 MB, free = 757.84 MB, total = 3956.56 MB
After loading TensorRT library: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
Before process: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
After process: GPU memory usage: used = 3318.88 MB, free = 637.68 MB, total = 3956.56 MB

Most of the memory is occupied by loading the required libraries (cuBLAS, cuDNN, and TensorRT), and some by the image processing.
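
If the goal is to actually hand that memory back, the only way I know of is to tear down the whole CUDA context once every handle is destroyed. Below is a minimal sketch under my assumption that the overhead is context-resident library code (the log above is consistent with this but does not prove it). Note that cudaDeviceReset() invalidates all existing handles, streams, and allocations, so nothing CUDA-related from before the reset may be used afterwards:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>

#define ONE_MBYTE (1024*1024)

static void printFreeMem(const char* tag)
{
    size_t free_byte, total_byte;
    cudaMemGetInfo(&free_byte, &total_byte);   // implicitly (re)creates a context if none exists
    printf("%s: free = %.2f MB\n", tag, (double)free_byte/ONE_MBYTE);
}

int main()
{
    cudaFree(0);                     // create the CUDA context
    printFreeMem("after init");

    cublasHandle_t cublas;
    cublasCreate(&cublas);
    cublasDestroy(cublas);           // handle is gone, but the library code stays in the context
    printFreeMem("after cuBLAS create/destroy");

    cudaDeviceReset();               // destroys the context and everything loaded into it
    printFreeMem("after cudaDeviceReset");   // this call builds a fresh, empty context

    return 0;
}

The last readout should be close to the first one. The price is that the context has to be rebuilt and the libraries reloaded before the next inference, so this only pays off when the process is finished with the GPU.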

Thanks.