Memory leak with cudagraph?

I’m trying to understand the memory (de)allocation behavior of stream-ordered allocation combined with CUDA graphs. From the blog post, my understanding is that cudaMallocAsync requests memory from a pool, cudaFreeAsync returns memory to the pool, and the pool releases unused memory back to the OS when the stream is synchronized.
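
To make that concrete, here is a minimal sketch of the lifecycle I expect (my own toy example, assuming the default pool on device 0 with the release threshold set to 0; error checking omitted for brevity):

#include <cuda_runtime.h>
#include <cstdint>

int main() {
    cudaMemPool_t pool;
    cudaDeviceGetDefaultMemPool(&pool, 0);
    uint64_t threshold = 0;                  // let the pool release everything it isn't using
    cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    void *p;
    cudaMallocAsync(&p, 1 << 20, stream);    // memory comes from the pool
    // ... enqueue work that uses p on the stream ...
    cudaFreeAsync(p, stream);                // memory goes back to the pool
    cudaStreamSynchronize(stream);           // pool may now return it to the OS

    cudaStreamDestroy(stream);
    return 0;
}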

However, when using a CUDA graph, the memory seems to still be held by the graph, and it is even retained after I destroy the graph. It looks like the memory has been leaked.

Here is my code:

#include <cuda_runtime.h>
#include <cstdint>  // uint64_t
#include <cstdio>   // fprintf
#include <iostream>

// Error checking macro and function
#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void addKernel(int *a, int *b, int *c, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

void logMemoryStatus(const char* message) {
    size_t free_mem, total_mem;
    cudaCheck(cudaMemGetInfo(&free_mem, &total_mem));
    float free_gb = free_mem / (float)(1 << 30);  // Convert bytes to gigabytes
    float total_gb = total_mem / (float)(1 << 30);
    std::cout << message << " - Free Memory: " << free_gb << " GB, Total Memory: " << total_gb << " GB\n";
}

int main() {
    cudaMemPool_t mempool;
    cudaCheck(cudaDeviceGetDefaultMemPool(&mempool, 0));
    uint64_t threshold = 0; // UINT64_MAX;
    cudaCheck(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));

    const int N = 1024 * 1024 * 256;
    const int bytes = N * sizeof(int);
    int *a, *b, *c, *h_c;

    // Allocate device memory for a and b
    cudaCheck(cudaMalloc(&a, bytes));
    cudaCheck(cudaMalloc(&b, bytes));

    // Initialize a and b on the host
    int *h_a = new int[N];
    int *h_b = new int[N];
    for (int i = 0; i < N; ++i) {
        h_a[i] = i;
        h_b[i] = i;
    }

    // Copy data from host to device
    cudaCheck(cudaMemcpy(a, h_a, bytes, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(b, h_b, bytes, cudaMemcpyHostToDevice));

    // Allocate host memory for the result
    h_c = new int[N];

    // Create a stream
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));

    logMemoryStatus("before capture");

    // Begin graph capture
    cudaGraph_t graph;
    cudaCheck(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    // Allocate memory for c during graph capture using cudaMallocAsync
    cudaCheck(cudaMallocAsync(&c, bytes, stream));

    logMemoryStatus("inside capture, after cudaMallocAsync");

    // Launch the add kernel
    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    addKernel<<<grid, block, 0, stream>>>(a, b, c, N);

    // Copy the output to CPU using cudaMemcpyAsync
    cudaCheck(cudaMemcpyAsync(h_c, c, bytes, cudaMemcpyDeviceToHost, stream));

    // Free c using cudaFreeAsync within graph capture
    cudaCheck(cudaFreeAsync(c, stream));

    logMemoryStatus("inside capture, after cudaFreeAsync");

    // End graph capture
    cudaCheck(cudaStreamEndCapture(stream, &graph));

    // Instantiate the captured graph into an executable graph
    cudaGraphExec_t graphExec;
    cudaCheck(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

    logMemoryStatus("after capture, before execution");
    cudaCheck(cudaGraphLaunch(graphExec, stream));

    // Wait for stream to complete
    cudaCheck(cudaStreamSynchronize(stream));

    logMemoryStatus("after capture, after execution");

    // Output the graph to a .dot file
    cudaCheck(cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose));

    // Check result
    bool correct = true;
    for (int i = 0; i < N; ++i) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            correct = false;
            break;
        }
    }
    if (correct) {
        std::cout << "Results are correct!" << std::endl;
    } else {
        std::cout << "Results are incorrect!" << std::endl;
    }

    // Cleanup

    cudaCheck(cudaGraphDestroy(graph));
    cudaCheck(cudaGraphExecDestroy(graphExec));
    cudaCheck(cudaDeviceSynchronize());
    logMemoryStatus("after delete graph");

    cudaCheck(cudaFree(a));
    cudaCheck(cudaFree(b));
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;

    cudaCheck(cudaStreamDestroy(stream));

    return 0;
}

And the output:

before capture - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
inside capture, after cudaMallocAsync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
inside capture, after cudaFreeAsync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
after capture, before execution - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
after capture, after execution - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB
Results are correct!
after delete graph - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB

Note that I even set cudaMemPoolAttrReleaseThreshold to 0, which should ideally release all unused memory back to the OS.

I’m using CUDA 12.4.

I can confirm the problem is related to the CUDA graph. When I remove the graph capture, the memory allocation behavior is as expected:

#include <cuda_runtime.h>
#include <cstdint>  // uint64_t
#include <cstdio>   // fprintf
#include <iostream>

// Error checking macro and function
#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void addKernel(int *a, int *b, int *c, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

void logMemoryStatus(const char* message) {
    size_t free_mem, total_mem;
    cudaCheck(cudaMemGetInfo(&free_mem, &total_mem));
    float free_gb = free_mem / (float)(1 << 30);  // Convert bytes to gigabytes
    float total_gb = total_mem / (float)(1 << 30);
    std::cout << message << " - Free Memory: " << free_gb << " GB, Total Memory: " << total_gb << " GB\n";
}

int main() {
    cudaMemPool_t mempool;
    cudaCheck(cudaDeviceGetDefaultMemPool(&mempool, 0));
    uint64_t threshold = 0; // UINT64_MAX;
    cudaCheck(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));

    const int N = 1024 * 1024 * 256;
    const int bytes = N * sizeof(int);
    int *a, *b, *c, *h_c;

    // Allocate device memory for a and b
    cudaCheck(cudaMalloc(&a, bytes));
    cudaCheck(cudaMalloc(&b, bytes));

    // Initialize a and b on the host
    int *h_a = new int[N];
    int *h_b = new int[N];
    for (int i = 0; i < N; ++i) {
        h_a[i] = i;
        h_b[i] = i;
    }

    // Copy data from host to device
    cudaCheck(cudaMemcpy(a, h_a, bytes, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(b, h_b, bytes, cudaMemcpyHostToDevice));

    // Allocate host memory for the result
    h_c = new int[N];

    // Create a stream
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));

    logMemoryStatus("before cudaMallocAsync");

    // Allocate memory for c using cudaMallocAsync
    cudaCheck(cudaMallocAsync(&c, bytes, stream));

    logMemoryStatus("after cudaMallocAsync");

    // Launch the add kernel
    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    addKernel<<<grid, block, 0, stream>>>(a, b, c, N);

    // Copy the output to CPU using cudaMemcpyAsync
    cudaCheck(cudaMemcpyAsync(h_c, c, bytes, cudaMemcpyDeviceToHost, stream));

    // Free c using cudaFreeAsync
    cudaCheck(cudaFreeAsync(c, stream));

    logMemoryStatus("after cudaFreeAsync");

    // Wait for stream to complete
    cudaCheck(cudaStreamSynchronize(stream));

    logMemoryStatus("after stream sync");

    // Check result
    bool correct = true;
    for (int i = 0; i < N; ++i) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            correct = false;
            break;
        }
    }
    if (correct) {
        std::cout << "Results are correct!" << std::endl;
    } else {
        std::cout << "Results are incorrect!" << std::endl;
    }

    // Cleanup

    cudaCheck(cudaFree(a));
    cudaCheck(cudaFree(b));
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;

    cudaCheck(cudaStreamDestroy(stream));

    return 0;
}

Output:

before cudaMallocAsync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
after cudaMallocAsync - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB
after cudaFreeAsync - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB
after stream sync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
Results are correct!

I get the memory back (free memory goes back to 29.4 GB) after the stream synchronization.

See Section 12.6. Physical Memory Footprint in the programming guide.

The pool-management behavior of asynchronous allocation means that destroying a graph which contains memory nodes (even if their allocations are free) will not immediately return physical memory to the OS for use by other processes. To explicitly release memory back to the OS, an application should use the cudaDeviceGraphMemTrim API.
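
For example, something along these lines in your cleanup path should hand the retained physical memory back (just a sketch against the code you posted, assuming device 0 is the device the graph ran on):

    // After the executable graph and the graph are destroyed, explicitly trim
    // the physical memory the driver keeps reserved for graph memory nodes.
    cudaCheck(cudaGraphExecDestroy(graphExec));
    cudaCheck(cudaGraphDestroy(graph));
    cudaCheck(cudaDeviceGraphMemTrim(0));  // release unused graph pool memory back to the OS
    logMemoryStatus("after cudaDeviceGraphMemTrim");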

@striker159 thank you very much! That solves my problem. Indeed, the missing piece was cudaDeviceGraphMemTrim.
