CUDA memory release

Hi,

When our application is called continuously, we find that the memory resources used by TensorRT are not released. Please help us check whether the CUDA and other resources are properly released in our code. Thanks.

Hi,

Have you released the input/output GPU buffer?
Thanks.

Hi,

I think I have released the buffer and related memory, but they haven’t been released

This issue has blocked us from moving forward. Please check the attachment and help us modify the code.

Thanks


Hi,

It’s hard for us to debug without the source code.
Could you profile your system in each stage and share the information with us?

By the way, TensorRT/cuDNN libraries also occupy some memory.
The libraries are loaded when the first handle is created and released once the application terminates.

Thanks.

Hi,

How much memory do the TensorRT/cuDNN libraries occupy? About 1 GB?

Hi,

It varies from the cuDNN/TensorRT version.

You can test it with this sample directly: topic_1055977.cu

#include <stdio.h>
#include "cuda.h"
#include "cudnn.h"
#define ONE_MBYTE (1024*1024)

// Queries the CUDA runtime for the current device's memory statistics and
// prints the used/free/total amounts in megabytes.
// Terminates the process with exit code 1 if the query fails.
void printMemInfo()
{
    size_t freeBytes = 0;
    size_t totalBytes = 0;
    cudaError_t status = cudaMemGetInfo(&freeBytes, &totalBytes);

    if (status != cudaSuccess) {
        printf("Error: cudaMemGetInfo fails, %s\n", cudaGetErrorString(status));
        exit(1);
    }

    // ONE_MBYTE is a power of two, so each division only rescales the
    // exponent: the printed values match a bytes-first subtraction exactly.
    double freeMb  = (double)freeBytes  / ONE_MBYTE;
    double totalMb = (double)totalBytes / ONE_MBYTE;
    double usedMb  = totalMb - freeMb;

    printf(" GPU memory usage: used = %.2f MB, free = %.2f MB, total = %.2f MB\n", usedMb, freeMb, totalMb);
}

// Demonstrates how much GPU memory loading the cuDNN library costs:
// prints device memory usage before and after creating a cuDNN handle.
//
// Returns 0 on success, 1 if the cuDNN handle cannot be created.
int main(){
    printf("Initial memory:");
    printMemInfo();

    cudnnHandle_t handle_;
    // Fix: check the create status instead of silently ignoring it.
    cudnnStatus_t status = cudnnCreate(&handle_);
    if (status != CUDNN_STATUS_SUCCESS) {
        printf("Error: cudnnCreate fails, status=%d\n", (int)status);
        return 1;
    }
    printf("After cuDNN create:");
    printMemInfo();

    // Fix: the original leaked the handle. Note that destroying the handle
    // does NOT return the library's own allocations to the system — those
    // stay resident until the process terminates, which is the point this
    // sample demonstrates.
    cudnnDestroy(handle_);

    return 0;
}
nvcc topic_1055977.cu -lcudnn -o test
./test

Here is the result of my environment:

Initial memory: GPU memory usage: used = 1314.97 MB, free = 2641.59 MB, total = 3956.56 MB
After cuDNN create: GPU memory usage: used = 2063.25 MB, free = 1893.31 MB, total = 3956.56 MB

The cuDNN library occupies around 750 MB of memory.

Thanks.

Hi,

Thanks very much for your information.

Now the key problems are that:

  1. Seems all of memory are not released if the application is not terminated
  2. cuDNN/TensorRT libraries occupied memory cannot be released once called cuDNN/TensorRT without forcing to release the memory by linux command, which are over 750MB

The above issues mean our application cannot be called continuously, but we need to call it continuously.

So,

  1. could you help check in our code (see the attachments) why the memory is not released? We think we have performed enough operations to release the memory in our code.
  2. how to release the cuDNN/TensorRT libraries occupied memory after one inference is finished

demo.cpp (4.25 KB)

Hi,

Have you checked and tried the demo codes?

Thanks

Hi,

Not yet. Will update information with you later.

Hi,

Please check this sample:

#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <fstream>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include <cublas_v2.h>
#include <cudnn.h>

#define INPUT_BLOB "data"
#define OUTPUT_BLOB "prob"
#define ONE_MBYTE (1024*1024)

// Minimal TensorRT logger: prints every message to standard output except
// INFO-level ones, which are suppressed to keep the console readable.
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        // Guard clause: drop informational chatter, forward everything else.
        if( severity == Severity::kINFO )
            return;
        std::cout << msg << std::endl;
    }
} gLogger;

// Prints the current device's GPU memory usage (used/free/total) in MB,
// as reported by cudaMemGetInfo. Exits the process with code 1 on failure.
void printMemInfo()
{
    size_t free_byte ;
    size_t total_byte ;
    // Query free and total device memory in bytes.
    cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;

    if ( cudaSuccess != cuda_status ){
        printf("Error: cudaMemGetInfo fails, %s\n", cudaGetErrorString(cuda_status));
        exit(1);
    }

    // Convert to double before subtracting so the MB formatting below is exact.
    double free_db = (double)free_byte ;
    double total_db = (double)total_byte ;
    double used_db = total_db - free_db ;

    printf(" GPU memory usage: used = %.2f MB, free = %.2f MB, total = %.2f MB\n", used_db/ONE_MBYTE, free_db/ONE_MBYTE, total_db/ONE_MBYTE);
}

// Measures GPU memory usage at each stage of a TensorRT Caffe-model
// inference pipeline: CUDA context creation, cuBLAS/cuDNN/TensorRT library
// loading, engine build (or PLAN-cache deserialization), and one inference.
//
// Usage: ./main [prototxt] [caffemodel] [PLAN] [image]
// Returns 0 on success; exits with -1 on bad arguments or a missing blob.
int main(int argc, char** argv)
{
    if( argc < 5 )
    {
        std::cout << "Usage: " << argv[0] << " [prototxt] [caffemodel] [PLAN] [image]" << std::endl;
        exit(-1);
    }

    printf("Initial memory:");
    printMemInfo();

    // cudaFree(0) forces lazy CUDA-context creation so its memory cost
    // shows up here. Fix: the status was assigned but never inspected.
    cudaError_t res = cudaFree(0);
    if( res != cudaSuccess )
        printf("Error: cudaFree(0) fails, %s\n", cudaGetErrorString(res));
    printf("After CUDA initialization:");
    printMemInfo();

    // Create-then-destroy each library handle purely to measure how much
    // memory loading the library itself pins on the device.
    cublasContext* cublas;
    cublasCreate(&cublas);
    cublasDestroy(cublas);
    printf("After loading cuBLAS library:");
    printMemInfo();

    cudnnContext* cudnn;
    cudnnCreate(&cudnn);
    cudnnDestroy(cudnn);
    printf("After loading cuDNN library:");
    printMemInfo();

    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    builder->destroy();

    printf("After loading TensorRT library:");
    printMemInfo();

    printf("Before process:");
    printMemInfo();

    std::ifstream cache( argv[3] );
    std::stringstream modelStream;
    modelStream.seekg(0, modelStream.beg);

    // caffe -> PLAN: no cached PLAN yet, so build the engine from the Caffe
    // files and write the serialized PLAN to argv[3] for future runs.
    if( !cache )
    {
        nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
        nvinfer1::INetworkDefinition* network = builder->createNetwork();
        nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();

        const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(argv[1], argv[2], *network, nvinfer1::DataType::kFLOAT);
        nvinfer1::ITensor* output = blobNameToTensor->find(OUTPUT_BLOB);
        // Fix: guard against a missing output blob instead of dereferencing null.
        if( !output )
        {
            std::cout << "Error: output blob not found: " << OUTPUT_BLOB << std::endl;
            exit(-1);
        }
        network->markOutput(*output);

        builder->setMaxBatchSize(1);
        builder->setMaxWorkspaceSize((16 << 20));
        nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);

        nvinfer1::IHostMemory* serializer = engine->serialize();
        modelStream.write((const char*)serializer->data(), serializer->size());
        // Fix: the serialized blob was never released in the original —
        // IHostMemory must be destroy()ed by the caller or it leaks.
        serializer->destroy();

        std::ofstream output_obj;
        output_obj.open(argv[3]);
        output_obj << modelStream.rdbuf();
        output_obj.close();

        network->destroy();
        parser->destroy();
        engine->destroy();
        builder->destroy();
        modelStream.seekg(0, modelStream.beg);
    }
    else
    {
        modelStream << cache.rdbuf();
        cache.close();
    }


    // PLAN -> engine: deserialize the cached (or just-built) PLAN.
    modelStream.seekg(0, std::ios::end);
    const int size = modelStream.tellg();
    modelStream.seekg(0, std::ios::beg);

    void* mem = malloc(size);
    modelStream.read((char*)mem, size);

    nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
    nvinfer1::ICudaEngine* engine = infer->deserializeCudaEngine(mem, size, NULL);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    // The engine keeps its own copy of the PLAN, so the host blob can go now.
    free(mem);


    // buffers: managed memory so the host-side HWC->CHW conversion below can
    // write straight into what the engine reads.
    float*  input_data;
    float* output_data;
    nvinfer1::Dims inputDims  = engine->getBindingDimensions(engine->getBindingIndex( INPUT_BLOB));
    nvinfer1::Dims outputDims = engine->getBindingDimensions(engine->getBindingIndex(OUTPUT_BLOB));
    cudaMallocManaged( &input_data, inputDims.d[0]* inputDims.d[1]* inputDims.d[2]*sizeof(float));
    cudaMallocManaged(&output_data, outputDims.d[0]*outputDims.d[1]*outputDims.d[2]*sizeof(float));

    std::cout << INPUT_BLOB <<": ("<< inputDims.d[0] <<","<< inputDims.d[1] <<","<< inputDims.d[2] <<")"<<std::endl;
    std::cout <<OUTPUT_BLOB <<": ("<<outputDims.d[0] <<","<<outputDims.d[1] <<","<<outputDims.d[2] <<")"<<std::endl;

    // inference (the timing below includes image load/resize and conversion)
    const clock_t begin_time = clock();
    cv::Mat image;
    image = cv::imread(argv[4]);

    cv::Mat resized;
    cv::resize(image, resized, cv::Size(inputDims.d[2], inputDims.d[1]));

    // U8+HWC -> float+CHW, shifting each channel by -128 to center it.
    // NOTE(review): assumes the network expects this mean subtraction and
    // OpenCV's BGR channel order — confirm against the training prototxt.
    size_t plane = inputDims.d[1]*inputDims.d[2];
    for( size_t idx=0; idx<plane; idx++ )
    {
        input_data[0*plane+idx] = float(resized.data[3*idx+0])-128;
        input_data[1*plane+idx] = float(resized.data[3*idx+1])-128;
        input_data[2*plane+idx] = float(resized.data[3*idx+2])-128;
    }
    // Make the CPU writes to managed memory visible to the device.
    cudaDeviceSynchronize();

    void* buffers[] = { input_data, output_data };
    context->execute(1, buffers);
    cudaDeviceSynchronize();

    for( size_t i=0; i<(size_t)outputDims.d[0]; i++ )
    {
        if( output_data[i] > 0.01f )
            std::cout << "index=" << i << ": " << output_data[i] << std::endl;
    }

    // Fix: the original divided by 100 although only one image is processed,
    // under-reporting the per-image time by 100x.
    std::cout << "Inference time: " << float( clock()-begin_time)/CLOCKS_PER_SEC << " per image" << std::endl;

    nvcaffeparser1::shutdownProtobufLibrary();
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);

    context->destroy();
    engine->destroy();
    infer->destroy();

    printf("after process:");
    printMemInfo();

    return 0;
}
nvcc -o main -std=c++11 -lnvinfer -lnvparsers -lopencv_core -lopencv_imgproc -lopencv_imgcodecs -lcudnn -lcublas topic_1055977.cpp
Initial memory: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After CUDA initialization: GPU memory usage: used = 2694.91 MB, free = 1261.64 MB, total = 3956.56 MB
After loading cuBLAS library: GPU memory usage: used = 2807.63 MB, free = 1148.93 MB, total = 3956.56 MB
After loading cuDNN library: GPU memory usage: used = 3198.71 MB, free = 757.84 MB, total = 3956.56 MB
After loading TensorRT library: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
Before process: GPU memory usage: used = 3275.61 MB, free = 680.95 MB, total = 3956.56 MB
After  process: GPU memory usage: used = 3318.88 MB, free = 637.68 MB, total = 3956.56 MB

Most of the memory is occupied by loading the required memory, including cuBLAS, cuDNN and TensorRT.
(and some for image processing)

Thanks.

Hi

I have modified our codes, and the application can be called continuously

Thanks

I got the same problem: I can't allocate memory after detecting a few frames. Could you share how you modified your code?

1 Like