CUDA memory release

Hi,

Please check this sample:

#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <fstream>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cublas_v2.h>
#include <cudnn.h>

#define INPUT_BLOB "data"
#define OUTPUT_BLOB "prob"
#define ONE_MBYTE (1024*1024)

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if( severity != Severity::kINFO )
            std::cout << msg << std::endl;
    }
} gLogger;

void printMemInfo()
{
    size_t free_byte ;
    size_t total_byte ;
    cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;

    if ( cudaSuccess != cuda_status ){
        printf("Error: cudaMemGetInfo fails, %s\n", cudaGetErrorString(cuda_status));
        exit(1);
    }

    double free_db = (double)free_byte ;
    double total_db = (double)total_byte ;
    double used_db = total_db - free_db ;

    printf(" GPU memory usage: used = %.2f MB, free = %.2f MB, total = %.2f MB\n", used_db/ONE_MBYTE, free_db/ONE_MBYTE, total_db/ONE_MBYTE);
}

int main(int argc, char** argv)
{
    if( argc < 5 )
    {
        std::cout << "Usage: " << argv[0] << " [prototxt] [caffemodel] [PLAN] [image]" << std::endl;
        exit(-1);
    }

    printf("Initial memory:");
    printMemInfo();

    cudaFree(0);    // triggers lazy creation of the CUDA context
    printf("After CUDA initialization:");
    printMemInfo();

    cublasHandle_t cublas;
    cublasCreate(&cublas);
    cublasDestroy(cublas);
    printf("After loading cuBLAS library:");
    printMemInfo();

    cudnnHandle_t cudnn;
    cudnnCreate(&cudnn);
    cudnnDestroy(cudnn);
    printf("After loading cuDNN library:");
    printMemInfo();

    // create and immediately destroy a builder just to pull the TensorRT code into the context
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    builder->destroy();

    printf("After loading TensorRT library:");
    printMemInfo();

    printf("Before process:");
    printMemInfo();

    std::ifstream cache(argv[3], std::ios::binary);    // PLAN files are binary
    std::stringstream modelStream;
    modelStream.seekg(0, modelStream.beg);

    // caffe -> PLAN
    if( !cache )
    {
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        nvinfer1::INetworkDefinition* network = builder->createNetwork();
        nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();

        const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(argv[1], argv[2], *network, nvinfer1::DataType::kFLOAT);
        nvinfer1::ITensor* output = blobNameToTensor->find(OUTPUT_BLOB);
        network->markOutput(*output);

        builder->setMaxBatchSize(1);
        builder->setMaxWorkspaceSize((16 << 20));
        nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);

        nvinfer1::IHostMemory* serializer = engine->serialize();
        modelStream.write((const char*)serializer->data(), serializer->size());
        serializer->destroy();    // release the serialized copy once it has been written out

        std::ofstream output_obj(argv[3], std::ios::binary);
        output_obj << modelStream.rdbuf();
        output_obj.close();

        network->destroy();
        parser->destroy();
        engine->destroy();
        builder->destroy();
        modelStream.seekg(0, modelStream.beg);
    }
    else
    {
        modelStream << cache.rdbuf();
        cache.close();
    }


    // PLAN -> engine
    modelStream.seekg(0, std::ios::end);
    const size_t size = modelStream.tellg();
    modelStream.seekg(0, std::ios::beg);

    void* mem = malloc(size);
    modelStream.read((char*)mem, size);

    nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
    nvinfer1::ICudaEngine* engine = infer->deserializeCudaEngine(mem, size, nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    free(mem);


    // buffers
    float*  input_data;
    float* output_data;
    nvinfer1::Dims inputDims  = engine->getBindingDimensions(engine->getBindingIndex( INPUT_BLOB));
    nvinfer1::Dims outputDims = engine->getBindingDimensions(engine->getBindingIndex(OUTPUT_BLOB));
    cudaMallocManaged( &input_data, inputDims.d[0]* inputDims.d[1]* inputDims.d[2]*sizeof(float));
    cudaMallocManaged(&output_data, outputDims.d[0]*outputDims.d[1]*outputDims.d[2]*sizeof(float));

    std::cout << INPUT_BLOB <<": ("<< inputDims.d[0] <<","<< inputDims.d[1] <<","<< inputDims.d[2] <<")"<<std::endl;
    std::cout <<OUTPUT_BLOB <<": ("<<outputDims.d[0] <<","<<outputDims.d[1] <<","<<outputDims.d[2] <<")"<<std::endl;

    // inference
    const clock_t begin_time = clock();
    cv::Mat image;
    image = cv::imread(argv[4]);

    cv::Mat resized;
    cv::resize(image, resized, cv::Size(inputDims.d[2], inputDims.d[1]));

    // U8+HWC -> float+CHW 
    size_t plane = inputDims.d[1]*inputDims.d[2];
    for( size_t idx=0; idx<plane; idx++ )
    {
        input_data[0*plane+idx] = float(resized.data[3*idx+0])-128;
        input_data[1*plane+idx] = float(resized.data[3*idx+1])-128;
        input_data[2*plane+idx] = float(resized.data[3*idx+2])-128;
    }
    cudaDeviceSynchronize();

    void* buffers[] = { input_data, output_data };
    context->execute(1, buffers);
    cudaDeviceSynchronize();

    for( size_t i=0; i<outputDims.d[0]; i++ )
    {
        if( output_data[i] > 0.01f )
            std::cout << "index=" << i << ": " << output_data[i] << std::endl;
    }

    std::cout << "Inference time: " << float( clock()-begin_time)/CLOCKS_PER_SEC/100 << " per image" << std::endl;

    nvcaffeparser1::shutdownProtobufLibrary();
    cudaFree(input_data);
    cudaFree(output_data);

    context->destroy();
    engine->destroy();
    infer->destroy();

    printf("after process:");
    printMemInfo();

    return 0;
}

Compile with (libraries after the source file, so the linker resolves them correctly):

nvcc -o main -std=c++11 topic_1055977.cpp -lnvinfer -lnvparsers -lopencv_core -lopencv_imgproc -lopencv_imgcodecs -lcudnn -lcublas

Output:

Initial memory: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After CUDA initialization: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After loading cuBLAS library: GPU memory usage: used = 2807.63 MB, free = 1148.93 MB, total = 3956.56 MB
After loading cuDNN library: GPU memory usage: used = 3198.71 MB, free = 757.84 MB, total = 3956.56 MB
After loading TensorRT library: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
Before process: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
After process: GPU memory usage: used = 3318.88 MB, free = 637.68 MB, total = 3956.56 MB

Most of the memory is occupied by loading the required libraries (cuBLAS, cuDNN, and TensorRT), and some by the image processing.
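
If the goal is to actually hand that memory back, the only way I know of is to tear down the whole CUDA context once every handle is destroyed. Below is a minimal sketch under my assumption that the overhead is context-resident library code (the log above is consistent with this but does not prove it). Note that cudaDeviceReset() invalidates all existing handles, streams, and allocations, so nothing CUDA-related from before the reset may be used afterwards:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>

#define ONE_MBYTE (1024*1024)

static void printFreeMem(const char* tag)
{
    size_t free_byte, total_byte;
    cudaMemGetInfo(&free_byte, &total_byte);   // implicitly (re)creates a context if none exists
    printf("%s: free = %.2f MB\n", tag, (double)free_byte/ONE_MBYTE);
}

int main()
{
    cudaFree(0);                     // create the CUDA context
    printFreeMem("after init");

    cublasHandle_t cublas;
    cublasCreate(&cublas);
    cublasDestroy(cublas);           // handle is gone, but the library code stays in the context
    printFreeMem("after cuBLAS create/destroy");

    cudaDeviceReset();               // destroys the context and everything loaded into it
    printFreeMem("after cudaDeviceReset");   // this call builds a fresh, empty context

    return 0;
}

The last readout should be close to the first one. The price is that the context has to be rebuilt and the libraries reloaded before the next inference, so this only pays off when the process is finished with the GPU.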

Thanks.