CUDA Graph Memory Reservations

I have a simple program that uses CUDA graphs:

        system("nvidia-smi");

        checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        for (int ikrnl = 0; ikrnl < 1000; ikrnl++)
        {
                shortKernel<<<1, 1, 0, stream>>>(A);
        }
        checkCudaErrors(cudaStreamEndCapture(stream, &graph));
        checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        checkCudaErrors(cudaStreamSynchronize(stream));

        system("nvidia-smi");

        checkCudaErrors(cudaGraphExecDestroy(instance));
        checkCudaErrors(cudaGraphDestroy(graph));

        system("nvidia-smi");

From the numbers reported by nvidia-smi, I notice that creating the CUDA graph causes GPU memory consumption to go up a bit, and furthermore, destroying the graph data structures (the cudaGraphExecDestroy/cudaGraphDestroy calls at the end) does not reclaim the memory. May I ask (1) why CUDA graphs need to make memory reservations, and (2) whether there is a proper way to reclaim the allocated memory?

Thank you.

I see the issue as well. I’m not aware of any other method to reclaim the memory (other than cudaDeviceReset(), which I’m sure is not what you had in mind).
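For completeness, a minimal sketch of what that heavyweight option would look like; cudaDeviceReset() destroys every allocation, stream, event, and graph the process owns on the current device, so the application would have to rebuild all of its device state afterwards:

        checkCudaErrors(cudaGraphExecDestroy(instance));
        checkCudaErrors(cudaGraphDestroy(graph));
        // tears down the process's entire context on this device;
        // all device state is invalidated, not just the graph pool
        checkCudaErrors(cudaDeviceReset());
        system("nvidia-smi"); // the process no longer holds device memory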

I’ve filed an internal bug at NVIDIA to have this looked at. (3865932)


The development team has concluded that the behavior arises from the nature of CUDA device memory management: when device memory is allocated and then freed, it does not necessarily return to unallocated space but may remain in a reserved pool. That is what happens during stream capture. The capture process itself allocates device memory for graph management, and although this memory is “freed” when the graph is destroyed, it does not return to unallocated device memory; it remains in a pool for future reuse. This is expected behavior. To support this, we can demonstrate that after the destruction step we can capture a new graph, and the consumed memory does not increase, indicating that the pooled memory is reused for subsequent graph activity. This behavior follows from CUDA device memory management and is not modifiable.

example:

# cat t201.cu
#include <cstdlib>
#include <iostream>

#define checkCudaErrors(x) do { e = (x); if (e != cudaSuccess) std::cout << __LINE__ << " " << cudaGetErrorString(e) << std::endl; } while (0)

__global__ void shortKernel(int *A){

   int idx = threadIdx.x;
   A[idx] = 123;
}


int main(int argc, char *argv[]){
        cudaError_t e;
        int loops = 1000;
        if (argc > 1) loops = atoi(argv[1]);
        cudaStream_t stream;
        checkCudaErrors(cudaStreamCreate(&stream));
        int *A;
        checkCudaErrors(cudaMalloc(&A, sizeof(int)));
        shortKernel<<<1, 1, 0, stream>>>(A);
        checkCudaErrors(cudaDeviceSynchronize());
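        // baseline footprint: context created and one warm-up kernel launched,
        // but no graph activity yet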
        system("nvidia-smi |grep MiB");
        cudaGraph_t graph;
        checkCudaErrors(cudaGraphCreate(&graph, 0));
        cudaGraphExec_t instance;
        checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        for (int ikrnl = 0; ikrnl < loops; ikrnl++)
        {
                shortKernel<<<1, 1, 0, stream>>>(A);
        }
        checkCudaErrors(cudaStreamEndCapture(stream, &graph));
        checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        checkCudaErrors(cudaStreamSynchronize(stream));

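        // footprint after the first capture, instantiate, and launch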
        system("nvidia-smi |grep MiB");

        checkCudaErrors(cudaGraphExecDestroy(instance));
        checkCudaErrors(cudaGraphDestroy(graph));
        checkCudaErrors(cudaStreamDestroy(stream));
        system("nvidia-smi |grep MiB");
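        // re-capture a second graph: if the memory is pooled as described,
        // the footprint should not grow here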
        checkCudaErrors(cudaStreamCreate(&stream));
        checkCudaErrors(cudaGraphCreate(&graph, 0));
        checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        for (int ikrnl = 0; ikrnl < loops; ikrnl++)
        {
                shortKernel<<<1, 1, 0, stream>>>(A);
        }
        checkCudaErrors(cudaStreamEndCapture(stream, &graph));
        checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        checkCudaErrors(cudaStreamSynchronize(stream));

        system("nvidia-smi |grep MiB");

        checkCudaErrors(cudaGraphExecDestroy(instance));
        checkCudaErrors(cudaGraphDestroy(graph));
        checkCudaErrors(cudaStreamDestroy(stream));
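        // footprint after destroying the second graph: unchanged,
        // the memory stays in the pool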
        system("nvidia-smi |grep MiB");
}
# nvcc -o t201 t201.cu
# ./t201 1000
| N/A   46C    P0              29W /  72W |    198MiB / 23034MiB |      0%      Default |
|    0   N/A  N/A    222183      C   ./t201                                      186MiB |
| N/A   46C    P0              29W /  72W |    264MiB / 23034MiB |      1%      Default |
|    0   N/A  N/A    222183      C   ./t201                                      252MiB |
| N/A   46C    P0              29W /  72W |    264MiB / 23034MiB |      0%      Default |
|    0   N/A  N/A    222183      C   ./t201                                      252MiB |
| N/A   46C    P0              29W /  72W |    264MiB / 23034MiB |      0%      Default |
|    0   N/A  N/A    222183      C   ./t201                                      252MiB |
| N/A   46C    P0              29W /  72W |    264MiB / 23034MiB |      0%      Default |
|    0   N/A  N/A    222183      C   ./t201                                      252MiB |
#

(FWIW, the cudaDeviceGraphMemTrim API has no effect on the above behavior.)
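For reference, a minimal sketch of how that call is made (cudaDeviceGraphMemTrim takes a device ordinal; it is documented to free unused memory cached for graph memory nodes, i.e., allocations created inside graphs, which is presumably why it does not touch the capture bookkeeping pool shown above):

        int device = 0;
        checkCudaErrors(cudaGetDevice(&device));         // query the active device ordinal
        checkCudaErrors(cudaDeviceGraphMemTrim(device)); // trims graph memory-node reservations only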