Running CUDA Graphs from multiple CPU threads causes a continuous increase in memory usage

In our system we run multiple CPU threads concurrently. Each thread creates several CUDA graphs and destroys them all at the end. We found that once the number of CPU threads and graphs reaches a certain level, GPU memory usage keeps rising, while with fewer threads and graphs everything is fine. The CPU threads are independent, and each one uses its own per-thread default stream.

Test environment:
GPU : GTX 1070
CPU : Intel(R) Core™ i9-10900X CPU @ 3.70GHz
OS : ubuntu 16.04
CUDA Version: 10.2
Driver Version: 440.64.00

Below is the memory usage:

nvidia-smi dmon -i 0 -s m
    # gpu    fb  bar1
    # Idx    MB    MB
        0    75     5
        0    75     5
        0   274     5   <------ program start
        0   286     5
        0   298     5
        0   310     5
        0   320     5
        0   330     5
        0   340     5
        0   352     5
        0   364     5
        0   376     5
        0   386     5
        0   396     5
        0   408     5
        0   418     5
        0   430     5
        0   442     5
        0   454     5   <------ program end
        0    75     5
        0    75     5
        0    75     5

Below is the simplified code and compile command:
nvcc --default-stream per-thread -gencode arch=compute_61,code=sm_61 main.cu -o main

#include <iostream>
#include <cmath>
#include <cstdint>
#include <cuda_runtime.h>
#include <string>
#include <thread>
#include <memory>
#include <vector>

#define BLOCK_SIZE_256 256

#define RETURN_IF_CUDA_ERROR( expr ) \
    do { \
        cudaError_t result = ( expr ); \
        if ( result != cudaSuccess ) { \
            std::cout << __FILE__ << ": line " << __LINE__ \
                      << ": " << cudaGetErrorString( result ) << "\n"; \
            return false; \
        } \
    } while ( 0 )

/////////////////////////////////////////////////////////

class GraphsLauncher
{
public:
    GraphsLauncher();
    ~GraphsLauncher();

    bool Initialize();
    bool AddKernelFunctionNode( void* func, void** kernelArgs, uint32_t gridSize, uint32_t blockSize, uint32_t sharedMemSize );
    //BoolResult AddMemoryCopyNode();
    //BoolResult AddMemorySetNode();
    bool Instantiate();
    bool Execution();
    bool Release();
private:
    cudaGraph_t m_graph;
    cudaGraphExec_t m_graphExec;

    std::vector<cudaGraphNode_t> m_nodeDependencies;
};

GraphsLauncher::GraphsLauncher()
    : m_graph( nullptr )
    , m_graphExec( nullptr )
    , m_nodeDependencies( 0 )
{
}

GraphsLauncher::~GraphsLauncher()
{
}

bool GraphsLauncher::Initialize()
{
    RETURN_IF_CUDA_ERROR( cudaGraphCreate( &m_graph, 0 ) );
    return true;
}

bool GraphsLauncher::AddKernelFunctionNode( void* func, void** kernelArgs, uint32_t gridSize, uint32_t blockSize, uint32_t sharedMemSize )
{
    cudaKernelNodeParams kernelNodeParams = { 0 };
    kernelNodeParams.func = func;
    kernelNodeParams.gridDim = dim3( gridSize, 1, 1 );
    kernelNodeParams.blockDim = dim3( blockSize, 1, 1 );
    kernelNodeParams.sharedMemBytes = sharedMemSize;
    kernelNodeParams.kernelParams = kernelArgs;
    kernelNodeParams.extra = nullptr;

    cudaGraphNode_t kernelNode;
    RETURN_IF_CUDA_ERROR( cudaGraphAddKernelNode( &kernelNode, m_graph, m_nodeDependencies.data(), m_nodeDependencies.size(), &kernelNodeParams ) );
    // The new node becomes the sole dependency of the next one, chaining the kernels.
    m_nodeDependencies.clear();
    m_nodeDependencies.push_back( kernelNode );
    return true;
}

bool GraphsLauncher::Instantiate()
{
    RETURN_IF_CUDA_ERROR( cudaGraphInstantiate( &m_graphExec, m_graph, nullptr, nullptr, 0 ) );
    return true;
}

bool GraphsLauncher::Execution()
{
    RETURN_IF_CUDA_ERROR( cudaGraphLaunch( m_graphExec, 0 ) );
    RETURN_IF_CUDA_ERROR( cudaStreamSynchronize( 0 ) );
    return true;
}

bool GraphsLauncher::Release()
{
    if ( m_graphExec )
    {
        RETURN_IF_CUDA_ERROR( cudaGraphExecDestroy( m_graphExec ) );
        m_graphExec = nullptr;
    }
    if ( m_graph )
    {
        RETURN_IF_CUDA_ERROR( cudaGraphDestroy( m_graph ) );
        m_graph = nullptr;
    }
    m_nodeDependencies.clear();
    return true;
}

using GraphsLauncherPtr = std::unique_ptr< GraphsLauncher >;

/////////////////////////////////////////////////////////

// Intentionally empty: only the node and launch overhead matter for this repro.
__global__ void TestKernelFunc( float* value1 )
{
}

bool SetKernelNodes( GraphsLauncher* graphPtr, float* value1, uint32_t bufferSize )
{
    uint32_t blockSize = BLOCK_SIZE_256;
    uint32_t gridSize = static_cast<uint32_t>( ceilf( bufferSize / (float) blockSize ) );

    for ( uint32_t i = 0; i < 20; i++ )
    {
        void( *func )( float* ) = TestKernelFunc;
        void *kernelArgs[ 1 ] = { &value1 };
        if ( !graphPtr->AddKernelFunctionNode( ( void* ) func, ( void ** ) kernelArgs, gridSize, blockSize, 0 ) )
            return false;
    }
    return true;
}

struct TestObject
{
    float* value1;
    std::vector<GraphsLauncherPtr> graphList;

    TestObject() : value1( nullptr ), graphList( 0 )
    {

    }

    bool Initialize( int32_t bufferSize )
    {
        RETURN_IF_CUDA_ERROR( cudaMalloc( &value1, bufferSize * sizeof( float ) ) );

        const uint32_t graphCount = 10;
        for ( uint32_t idx = 0; idx < graphCount; idx++ )
        {
            graphList.push_back( std::make_unique< GraphsLauncher >() );
            GraphsLauncher* graphPtr = graphList.back().get();
            if ( !graphPtr->Initialize() )
                return false;
            if ( !SetKernelNodes( graphPtr, value1, bufferSize ) )
                return false;
            if ( !graphPtr->Instantiate() )
                return false;
        }
        return true;
    }

    bool Execution()
    {
        // Get one to execute
        return graphList.back()->Execution();
    }

    bool Release()
    {
        RETURN_IF_CUDA_ERROR( cudaFree( value1 ) );
        value1 = nullptr;

        for ( auto& graph : graphList )
        {
            if ( !graph->Release() )
                return false;
        }
        graphList.clear();
        return true;
    }
};

bool MainTestFunc()
{
    int32_t bufferSize = 131072;
    TestObject testObj1;

    if ( !testObj1.Initialize( bufferSize ) )
        return false;

    for ( uint32_t i = 0; i < 1000; i++ )
    {
        if ( !testObj1.Execution() )
            return false;
    }

    if ( !testObj1.Release() )
        return false;
    return true;
}

/////////////////////////////////////////////////////////

int main()
{
    cudaFree( 0 );   // force CUDA context creation before spawning worker threads
    uint32_t maxTestCount = 64;

    for ( uint32_t testCount = 1; testCount <= maxTestCount; testCount++ )
    {
        uint32_t threadSize = 8;
        // std::vector<bool> packs bits, so concurrent writes to different
        // indices are a data race; use char instead.
        std::vector< char > errorList( threadSize, 0 );
        std::vector< std::thread > threads;
        threads.reserve( threadSize );

        for ( uint32_t i = 0; i < threadSize; ++i )
        {
            threads.emplace_back( [ =, &errorList ]()
            {
                errorList[ i ] = MainTestFunc();
            } );
        }

        for ( auto& t : threads )
            t.join();

        for ( uint32_t i = 0; i < threadSize; ++i )
        {
            if ( !errorList[ i ] )
            {
                std::cout << "error occurred\n";
                cudaDeviceReset();
                return 0;
            }
        }
    }
    return 0;
}

We can observe that all the memory is released at the end of the program. We also used the CUDA-MEMCHECK tool to check for memory leaks, as shown below, but no error was reported.
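
For reference, this is the leak check we ran (the binary is the one built by the compile command above):

$ cuda-memcheck --leak-check full ./main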

The questions are:

  1. Are all resources of a graph released immediately when we call the destroy functions?
  2. How many graphs can be created per process?
  3. Why does the memory usage keep rising?
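
To help answer questions 1 and 3, here is a minimal single-threaded probe (a sketch with error checks omitted; the no-op kernel and sizes mirror the repro above) that prints the free device memory reported by cudaMemGetInfo before instantiation, after instantiation, and after destruction:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void NoopKernel( float* ) {}

static size_t FreeMB()
{
    size_t freeBytes = 0, totalBytes = 0;
    cudaMemGetInfo( &freeBytes, &totalBytes );
    return freeBytes / ( 1024 * 1024 );
}

int main()
{
    cudaFree( 0 );   // create the context before measuring
    printf( "baseline         : %zu MB free\n", FreeMB() );

    float* dummy = nullptr;
    cudaMalloc( &dummy, 131072 * sizeof( float ) );

    cudaGraph_t graph;
    cudaGraphCreate( &graph, 0 );

    // Chain 20 kernel nodes, the same shape as the repro code.
    cudaGraphNode_t prev = nullptr;
    for ( int i = 0; i < 20; ++i )
    {
        void* args[ 1 ] = { &dummy };
        cudaKernelNodeParams p = {};
        p.func = ( void* ) NoopKernel;
        p.gridDim = dim3( 512, 1, 1 );
        p.blockDim = dim3( 256, 1, 1 );
        p.kernelParams = args;
        cudaGraphNode_t node;
        cudaGraphAddKernelNode( &node, graph, prev ? &prev : nullptr, prev ? 1 : 0, &p );
        prev = node;
    }

    cudaGraphExec_t exec;
    cudaGraphInstantiate( &exec, graph, nullptr, nullptr, 0 );
    printf( "after instantiate: %zu MB free\n", FreeMB() );

    cudaGraphExecDestroy( exec );
    cudaGraphDestroy( graph );
    cudaDeviceSynchronize();
    printf( "after destroy    : %zu MB free\n", FreeMB() );

    cudaFree( dummy );
    return 0;
}

Note that cudaMemGetInfo reports free memory at the driver's allocation granularity, so a small per-graph cost may only become visible after many graphs are instantiated.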

My experience with cudaGraph is that it consumes a lot of memory, so it cannot handle very large graphs. The CUDA documentation leaves much white space about these details, though.

Just for your interest, I have written a taskflow library that abstracts CUDA graphs as cudaFlows for CPU-GPU tasking; a sketch of your repro in that style follows.
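
Roughly, the 20-kernel chain above would look like the sketch below. This follows the cudaFlow API as of Taskflow 3.x (the header location and the emplace/kernel signatures have shifted between releases, so check the README for your version):

#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>   // location varies by Taskflow version

__global__ void TestKernelFunc( float* value1 ) {}

int main()
{
    float* value1 = nullptr;
    cudaMalloc( &value1, 131072 * sizeof( float ) );

    tf::Executor executor;
    tf::Taskflow taskflow;

    // The cudaFlow lambda builds (and owns) a CUDA graph under the hood.
    taskflow.emplace( [ = ]( tf::cudaFlow& cf )
    {
        tf::cudaTask prev;
        for ( int i = 0; i < 20; ++i )
        {
            tf::cudaTask k = cf.kernel( 512, 256, 0, TestKernelFunc, value1 );
            if ( i > 0 )
                k.succeed( prev );
            prev = k;
        }
    } );

    executor.run( taskflow ).wait();
    cudaFree( value1 );
    return 0;
}

It needs nvcc with -std=c++17, since Taskflow requires C++17.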

This is what I see running your code on a GTX 960, CUDA 11.1, driver 455.23.05, Fedora 29:

$ nvidia-smi dmon -i 0 -s m
# gpu    fb  bar1
# Idx    MB    MB
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
    0   140     2
    0   140     2
    0   140     2
    0   138     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   137     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0   140     2
    0     0     2
    0     0     2
    0     0     2
    0     0     2
^C    0     0     2
$

You might want to try newer CUDA/driver.

Thank you very much for your reply; we will test CUDA 11 and the latest driver later.

In addition, I saw this in the release notes of CUDA 11.1:

Resolved a memory issue when using cudaGraphInstantiate.

Do you know what kind of memory issue it is?