In our system, we run multiple CPU threads concurrently. Each thread generates multiple CUDA graphs and destroys all of them at the end. We found that once the number of CPU threads and graphs reaches a certain amount, GPU memory usage keeps rising, while a small number of CPU threads and graphs is fine. The CPU threads are independent of one another, and each thread uses its own default stream (compiled with --default-stream per-thread).
Test environment:
GPU : GTX 1070
CPU : Intel(R) Core™ i9-10900X CPU @ 3.70GHz
OS : ubuntu 16.04
CUDA Version: 10.2
Driver Version: 440.64.00
Below is the memory usage:
nvidia-smi dmon -i 0 -s m
# gpu fb bar1
# Idx MB MB
0 75 5
0 75 5
0 274 5 <------ program start
0 286 5
0 298 5
0 310 5
0 320 5
0 330 5
0 340 5
0 352 5
0 364 5
0 376 5
0 386 5
0 396 5
0 408 5
0 418 5
0 430 5
0 442 5
0 454 5 <------ program end
0 75 5
0 75 5
0 75 5
Below is the simplified code and compile command:
nvcc --default-stream per-thread -gencode arch=compute_61,code=sm_61 main.cu -o main
#include <iostream>
#include <cuda_runtime.h>
#include <time.h>
#include <sys/time.h>
#include <string>
#include <thread>
#include <memory>
#include <vector>
#define BLOCK_SIZE_256 256
// Evaluates a CUDA runtime call; on failure, logs the source location plus the
// human-readable error string and makes the enclosing function return false.
// Wrapped in do/while(0) so the macro expands safely inside unbraced if/else;
// the local is named result_ to avoid shadowing identifiers at the call site.
#define RETURN_IF_CUDA_ERROR( expr ) \
do { \
    cudaError_t result_ = ( expr ); \
    if ( result_ != cudaSuccess ) {\
        std::cout << std::string( std::string( __FILE__ ) + ": line " + std::to_string( __LINE__ ) + ": " + cudaGetErrorString( result_ ) + "\n" ); \
        return false; \
    } \
} while ( 0 )
/////////////////////////////////////////////////////////
// Owns one CUDA graph and its instantiated executable form. Intended call
// sequence: Initialize -> AddKernelFunctionNode (repeatedly) -> Instantiate
// -> Execution (repeatedly) -> Release. Not thread-safe; each instance is
// used from a single CPU thread in this reproduction.
class GraphsLauncher
{
public:
    GraphsLauncher();
    ~GraphsLauncher();
    // Creates the (empty) cudaGraph_t. Must be called before adding nodes.
    bool Initialize();
    // Appends a kernel node that depends on the previously added node,
    // building a linear chain of kernels inside the graph.
    bool AddKernelFunctionNode( void* func, void** kernelArgs, uint32_t gridSize, uint32_t blockSize, uint32_t sharedMemSize );
    //BoolResult AddMemoryCopyNode();
    //BoolResult AddMemorySetNode();
    // Builds the executable graph (m_graphExec) from the assembled m_graph.
    bool Instantiate();
    // Launches the executable graph on the default stream and waits for it.
    bool Execution();
    // Destroys the executable graph and the graph, in that order.
    bool Release();
private:
    cudaGraph_t m_graph;                            // graph under construction
    cudaGraphExec_t m_graphExec;                    // instantiated executable graph
    std::vector<cudaGraphNode_t> m_nodeDependencies; // dependency list for the next node (holds at most the last-added node)
};
// Starts with no graph and no executable graph; the dependency list is
// default-constructed empty, so nothing else needs initializing here.
GraphsLauncher::GraphsLauncher()
    : m_graph( nullptr )
    , m_graphExec( nullptr )
{
}
// RAII safety net: if the caller forgot (or skipped, e.g. on an error path)
// an explicit Release(), free the GPU-side graph objects here. Return codes
// are deliberately ignored because a destructor cannot report failure.
GraphsLauncher::~GraphsLauncher()
{
    if ( m_graphExec )
        cudaGraphExecDestroy( m_graphExec );
    if ( m_graph )
        cudaGraphDestroy( m_graph );
}
// Creates an empty graph into m_graph (flags must be 0 per the CUDA API).
// Returns false (after logging) if creation fails.
bool GraphsLauncher::Initialize()
{
    RETURN_IF_CUDA_ERROR( cudaGraphCreate( &m_graph, 0 ) );
    return true;
}
// Adds one kernel node to the graph. Each new node depends on the previously
// added node, so repeated calls build a strictly linear chain. On success the
// dependency list is replaced with just the new node; on failure it is left
// untouched and false is returned.
bool GraphsLauncher::AddKernelFunctionNode( void* func, void** kernelArgs, uint32_t gridSize, uint32_t blockSize, uint32_t sharedMemSize )
{
    // Describe the kernel launch for this node (1-D grid and block).
    cudaKernelNodeParams nodeDesc = { 0 };
    nodeDesc.func = func;
    nodeDesc.gridDim = dim3( gridSize, 1, 1 );
    nodeDesc.blockDim = dim3( blockSize, 1, 1 );
    nodeDesc.sharedMemBytes = sharedMemSize;
    nodeDesc.kernelParams = kernelArgs;
    nodeDesc.extra = nullptr;

    cudaGraphNode_t newNode;
    if ( cudaGraphAddKernelNode( &newNode, m_graph, m_nodeDependencies.data(), m_nodeDependencies.size(), &nodeDesc ) != cudaSuccess )
        return false;

    // The freshly added node becomes the sole dependency of the next one.
    m_nodeDependencies.assign( 1, newNode );
    return true;
}
// Builds the executable graph from the node graph. Must be called after all
// nodes have been added and before Execution(). The nullptr/0 arguments skip
// the optional error-node and log-buffer diagnostics.
bool GraphsLauncher::Instantiate()
{
    RETURN_IF_CUDA_ERROR( cudaGraphInstantiate( &m_graphExec, m_graph, nullptr, nullptr, 0 ) );
    return true;
}
// Launches the instantiated graph on stream 0 and blocks until it finishes.
// NOTE: the program is compiled with --default-stream per-thread, so stream 0
// here is the calling thread's own default stream, not the legacy global one.
bool GraphsLauncher::Execution()
{
    RETURN_IF_CUDA_ERROR( cudaGraphLaunch( m_graphExec, 0 ) );
    RETURN_IF_CUDA_ERROR( cudaStreamSynchronize( 0 ) );
    return true;
}
// Destroys the executable graph first, then the node graph, nulling each
// handle so Release() is safe to call more than once. Returns false (after
// logging) on the first CUDA error; later steps are then skipped.
bool GraphsLauncher::Release()
{
    if ( m_graphExec )
    {
        RETURN_IF_CUDA_ERROR( cudaGraphExecDestroy( m_graphExec ) );
        m_graphExec = nullptr;
    }
    if ( m_graph )
    {
        RETURN_IF_CUDA_ERROR( cudaGraphDestroy( m_graph ) );
        m_graph = nullptr;
    }
    m_nodeDependencies.clear();
    return true;
}
using GraphsLauncherPtr = std::unique_ptr< GraphsLauncher >;
/////////////////////////////////////////////////////////
// Deliberately empty kernel used only to populate graph nodes for this
// memory-growth reproduction; value1 is intentionally unused.
__global__ void TestKernelFunc( float* value1 )
{
}
// Appends 20 chained TestKernelFunc nodes to the given graph, each launched
// over bufferSize elements. Returns false as soon as adding a node fails.
bool SetKernelNodes( GraphsLauncher* graphPtr, float* value1, uint32_t bufferSize )
{
    const uint32_t blockSize = BLOCK_SIZE_256;
    // Exact integer ceiling division. The previous ceilf(bufferSize / (float)
    // blockSize) form loses precision once bufferSize exceeds ~2^24, because
    // float cannot represent all large integers exactly.
    const uint32_t gridSize = ( bufferSize + blockSize - 1 ) / blockSize;
    for ( uint32_t i = 0; i < 20; i++ )
    {
        void( *func )( float* ) = TestKernelFunc;
        // cudaGraphAddKernelNode copies the parameter values at node-add
        // time, so pointing at this local is safe.
        void *kernelArgs[ 1 ] = { &value1 };
        if ( !graphPtr->AddKernelFunctionNode( ( void* ) func, ( void ** ) kernelArgs, gridSize, blockSize, 0 ) )
            return false;
    }
    return true;
}
// Per-thread test fixture: one device buffer plus a list of instantiated
// graphs that all reference it. Initialize/Execution/Release mirror the
// lifecycle each CPU thread drives in MainTestFunc.
struct TestObject
{
    float* value1;                              // device buffer shared by every graph
    std::vector<GraphsLauncherPtr> graphList;   // owns the per-graph launchers
    TestObject() : value1( nullptr ), graphList( 0 )
    {
    }
    // Allocates the device buffer and builds/instantiates graphCount graphs.
    // Returns false on the first failure (Release() must still be called by
    // the owner to free whatever was created before the failure).
    bool Initialize( int32_t bufferSize )
    {
        RETURN_IF_CUDA_ERROR( cudaMalloc( &value1, bufferSize * sizeof( float ) ) );
        const uint32_t graphCount = 10;
        // Start at 0: the original loop started at 1 and therefore built
        // only graphCount - 1 graphs.
        for ( uint32_t idx = 0; idx < graphCount; idx++ )
        {
            graphList.push_back( std::make_unique< GraphsLauncher >() );
            GraphsLauncher* graphPtr = graphList.back().get();
            if ( !graphPtr->Initialize() )
                return false;
            if ( !SetKernelNodes( graphPtr, value1, bufferSize ) )
                return false;
            if ( !graphPtr->Instantiate() )
                return false;
        }
        return true;
    }
    // Launches one representative graph (the last one) and waits for it.
    bool Execution()
    {
        return graphList.back()->Execution();
    }
    // Frees the device buffer, then destroys every graph. Returns false on
    // the first CUDA error.
    bool Release()
    {
        RETURN_IF_CUDA_ERROR( cudaFree( value1 ) );
        value1 = nullptr;
        for ( auto& graph : graphList )
        {
            if ( !graph->Release() )
                return false;
        }
        graphList.clear();
        return true;
    }
};
// One thread's workload: build the fixture, run 1000 graph launches, then
// release everything. Release() is now reached on EVERY path — the original
// returned early on failure without releasing, leaking the device buffer and
// all graphs for that thread, which compounds the observed memory growth.
bool MainTestFunc()
{
    const int32_t bufferSize = 131072;
    TestObject testObj1;
    bool ok = testObj1.Initialize( bufferSize );
    for ( uint32_t i = 0; ok && i < 1000; i++ )
        ok = testObj1.Execution();
    // Always release, even after an Initialize/Execution failure.
    if ( !testObj1.Release() )
        ok = false;
    return ok;
}
/////////////////////////////////////////////////////////
// Repeatedly runs MainTestFunc from 8 concurrent threads, 64 rounds total,
// stopping at the first thread that reports failure.
int main() {
    cudaFree( 0 );  // force CUDA context creation up front
    const uint32_t maxTestCount = 64;
    for ( uint32_t testCount = 1; testCount <= maxTestCount; testCount++ )
    {
        const uint32_t threadSize = 8;
        // std::vector<bool> packs elements into shared bits, so concurrent
        // writes to different indices are a data race; one byte per flag is
        // race-free (each thread touches only its own element).
        std::vector< char > resultList( threadSize, 0 );
        std::vector< std::thread > threads;
        threads.reserve( threadSize );
        for ( uint32_t i = 0; i < threadSize; ++i )
        {
            threads.emplace_back( [ i, &resultList ]()
            {
                resultList[ i ] = MainTestFunc() ? 1 : 0;
            } );
        }
        // Joining also synchronizes the threads' writes to resultList.
        for ( auto& worker : threads )
            worker.join();
        for ( uint32_t i = 0; i < threadSize; ++i )
        {
            if ( !resultList[ i ] )
            {
                std::cout << "error occurred\n";
                cudaDeviceReset();
                return 0;
            }
        }
    }
    return 0;
}
We can observe that all of the memory is released when the program ends. We also checked for leaks with the CUDA-MEMCHECK tool, and no error was reported.
The question is
- Are all of a graph's resources released immediately when we call the graph destroy functions (cudaGraphDestroy / cudaGraphExecDestroy)?
- How many Graphs can be generated per process?
- Why does the memory usage continue to rise?