The development team has concluded that the issue arises from the nature of CUDA device memory management. Specifically, when device memory is allocated and then freed, it does not necessarily return to unallocated space but may remain in a reserved pool. This occurs during stream capture: the capture process itself allocates device memory for graph management. Although this memory is “freed” at the point of graph destruction, it does not return to unallocated device memory; instead it remains in a pool for future reuse. This is expected behavior. To support this idea, we can demonstrate that after the destruction process we can re-capture a new graph, and the consumed memory does not increase. This indicates that the memory is returned to a pool and is reused for subsequent graph activity. This is expected behavior of CUDA device memory management, and it is not modifiable.
example:
# cat t201.cu
#include <cstdlib>
#include <iostream>
// Report (but do not abort on) CUDA errors, tagging the failing source line.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe inside un-braced if/else at call sites (the original expansion was
// two statements, so only the assignment was guarded by a caller's `if`).
// NOTE: relies on a cudaError_t variable named `e` in the caller's scope.
#define checkCudaErrors(x) do { e = (x); if (e != cudaSuccess) std::cout << __LINE__ << " " << cudaGetErrorString(e) << std::endl; } while (0)
// Trivial kernel: each thread stores the constant 123 at its own index in A.
// In this example it is always launched as <<<1, 1>>>, so only A[0] is written;
// no bounds check is needed for that launch configuration.
__global__ void shortKernel(int *A)
{
  A[threadIdx.x] = 123;
}
// Capture `loops` launches of shortKernel into a graph on a fresh stream,
// instantiate and launch it, print the process memory footprint (nvidia-smi),
// then destroy the exec/graph/stream and print the footprint again.
// A is a device pointer to at least one int.
// Note: cudaStreamEndCapture returns a newly created graph, so no prior
// cudaGraphCreate is needed (the original code's cudaGraphCreate handles
// were overwritten by EndCapture and leaked).
static void runCaptureCycle(int *A, int loops)
{
  cudaError_t e;
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreate(&stream));
  cudaGraph_t graph;
  cudaGraphExec_t instance;
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  for (int ikrnl = 0; ikrnl < loops; ikrnl++)
  {
    shortKernel<<<1, 1, 0, stream>>>(A);
  }
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));
  checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
  checkCudaErrors(cudaGraphLaunch(instance, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  system("nvidia-smi |grep MiB");  // footprint after capture + launch
  checkCudaErrors(cudaGraphExecDestroy(instance));
  checkCudaErrors(cudaGraphDestroy(graph));
  checkCudaErrors(cudaStreamDestroy(stream));
  system("nvidia-smi |grep MiB");  // footprint after destruction (unchanged: pooled)
}

// Demonstrates that device memory consumed by stream capture / graph
// instantiation is returned to a reusable pool rather than to free memory:
// a second identical capture cycle does not grow the process footprint.
// argv[1] (optional): number of kernel launches to capture (default 1000).
int main(int argc, char *argv[])
{
  cudaError_t e;
  int loops = 1000;
  if (argc > 1) loops = atoi(argv[1]);
  int *A;
  checkCudaErrors(cudaMalloc(&A, sizeof(int)));
  // Warm-up launch so the baseline nvidia-smi reading includes the
  // CUDA context / runtime overhead, isolating the graph-related growth.
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreate(&stream));
  shortKernel<<<1, 1, 0, stream>>>(A);
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaStreamDestroy(stream));
  system("nvidia-smi |grep MiB");  // baseline footprint
  runCaptureCycle(A, loops);  // first cycle: footprint grows, then holds after destroy
  runCaptureCycle(A, loops);  // second cycle: no further growth — pool is reused
  checkCudaErrors(cudaFree(A));  // fix: A was never freed in the original
  return 0;
}
# nvcc -o t201 t201.cu
# ./t201 1000
| N/A 46C P0 29W / 72W | 198MiB / 23034MiB | 0% Default |
| 0 N/A N/A 222183 C ./t201 186MiB |
| N/A 46C P0 29W / 72W | 264MiB / 23034MiB | 1% Default |
| 0 N/A N/A 222183 C ./t201 252MiB |
| N/A 46C P0 29W / 72W | 264MiB / 23034MiB | 0% Default |
| 0 N/A N/A 222183 C ./t201 252MiB |
| N/A 46C P0 29W / 72W | 264MiB / 23034MiB | 0% Default |
| 0 N/A N/A 222183 C ./t201 252MiB |
| N/A 46C P0 29W / 72W | 264MiB / 23034MiB | 0% Default |
| 0 N/A N/A 222183 C ./t201 252MiB |
#
(For what it's worth, the cudaDeviceGraphMemTrim API has no effect on the behavior above.)