I’m trying to understand the memory (de)allocation behavior of stream-ordered allocation combined with CUDA graphs. From the blog, cudaMallocAsync
requests memory from a pool, cudaFreeAsync
returns memory to the pool, and the pool releases unused memory back to the OS at the next synchronization point (subject to the release threshold).
However, when a CUDA graph is involved, the memory still seems to be held by the graph, and it is retained even after I destroy the graph. It looks like the memory has leaked.
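For reference, here is a minimal sketch of the non-graph behavior I expected, based on my reading of the blog (error checking omitted; the 1 GiB size is just an example):

#include <cuda_runtime.h>
#include <cstdint>

int main() {
    // Set the default pool's release threshold to 0, so the pool should
    // keep no unused memory across synchronization points
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, 0);
    uint64_t threshold = 0;
    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Stream-ordered allocation: memory is requested from the pool...
    int *p;
    cudaMallocAsync(&p, size_t(1) << 30, stream);
    // ...and returned to the pool by cudaFreeAsync
    cudaFreeAsync(p, stream);

    // With a threshold of 0, I expect the pool to release the unused
    // 1 GiB back to the OS at this synchronization point
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    return 0;
}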
Here is my actual code with graph capture:
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Error checking macro and function
#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void addKernel(int *a, int *b, int *c, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

void logMemoryStatus(const char* message) {
    size_t free_mem, total_mem;
    cudaCheck(cudaMemGetInfo(&free_mem, &total_mem));
    float free_gb = free_mem / (float)(1 << 30); // Convert bytes to gigabytes
    float total_gb = total_mem / (float)(1 << 30);
    std::cout << message << " - Free Memory: " << free_gb << " GB, Total Memory: " << total_gb << " GB\n";
}

int main() {
    // Set the default pool's release threshold to 0 so that all unused
    // memory should be returned to the OS at the next synchronization
    cudaMemPool_t mempool;
    cudaCheck(cudaDeviceGetDefaultMemPool(&mempool, 0));
    uint64_t threshold = 0; // UINT64_MAX;
    cudaCheck(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));

    const int N = 1024 * 1024 * 256;
    const size_t bytes = N * sizeof(int); // 1 GiB

    int *a, *b, *c, *h_c;

    // Allocate device memory for a and b
    cudaCheck(cudaMalloc(&a, bytes));
    cudaCheck(cudaMalloc(&b, bytes));

    // Initialize a and b on the host
    int *h_a = new int[N];
    int *h_b = new int[N];
    for (int i = 0; i < N; ++i) {
        h_a[i] = i;
        h_b[i] = i;
    }

    // Copy data from host to device
    cudaCheck(cudaMemcpy(a, h_a, bytes, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(b, h_b, bytes, cudaMemcpyHostToDevice));

    // Allocate host memory for the result
    h_c = new int[N];

    // Create a stream
    cudaStream_t stream;
    cudaCheck(cudaStreamCreate(&stream));

    logMemoryStatus("before capture");

    // Begin graph capture
    cudaGraph_t graph;
    cudaCheck(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    // Allocate memory for c during graph capture using cudaMallocAsync
    cudaCheck(cudaMallocAsync(&c, bytes, stream));
    logMemoryStatus("inside capture, after cudaMallocAsync");

    // Launch the add kernel
    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    addKernel<<<grid, block, 0, stream>>>(a, b, c, N);

    // Copy the output to the CPU using cudaMemcpyAsync
    cudaCheck(cudaMemcpyAsync(h_c, c, bytes, cudaMemcpyDeviceToHost, stream));

    // Free c using cudaFreeAsync within graph capture
    cudaCheck(cudaFreeAsync(c, stream));
    logMemoryStatus("inside capture, after cudaFreeAsync");

    // End graph capture
    cudaCheck(cudaStreamEndCapture(stream, &graph));

    // Instantiate and launch the graph (CUDA 12 signature: exec, graph, flags)
    cudaGraphExec_t graphExec;
    cudaCheck(cudaGraphInstantiate(&graphExec, graph, 0));
    logMemoryStatus("after capture, before execution");
    cudaCheck(cudaGraphLaunch(graphExec, stream));

    // Wait for the stream to complete
    cudaCheck(cudaStreamSynchronize(stream));
    logMemoryStatus("after capture, after execution");

    // Output the graph to a .dot file
    cudaCheck(cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose));

    // Check result
    bool correct = true;
    for (int i = 0; i < N; ++i) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            correct = false;
            break;
        }
    }
    if (correct) {
        std::cout << "Results are correct!" << std::endl;
    } else {
        std::cout << "Results are incorrect!" << std::endl;
    }

    // Cleanup
    cudaCheck(cudaGraphDestroy(graph));
    cudaCheck(cudaGraphExecDestroy(graphExec));
    cudaCheck(cudaDeviceSynchronize());
    logMemoryStatus("after delete graph");

    cudaCheck(cudaFree(a));
    cudaCheck(cudaFree(b));
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    cudaCheck(cudaStreamDestroy(stream));
    return 0;
}
And the output:
before capture - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
inside capture, after cudaMallocAsync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
inside capture, after cudaFreeAsync - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
after capture, before execution - Free Memory: 29.4301 GB, Total Memory: 31.7325 GB
after capture, after execution - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB
Results are correct!
after delete graph - Free Memory: 28.4301 GB, Total Memory: 31.7325 GB
Note that I even set cudaMemPoolAttrReleaseThreshold
to 0; ideally this should release all unused memory back to the OS.
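If it helps with the diagnosis, I suppose the pool’s own counters could also be queried with cudaMemPoolGetAttribute; a sketch of a helper (not wired into the program above):

#include <cuda_runtime.h>
#include <cstdint>
#include <iostream>

// Print how many bytes the default pool currently has reserved from the OS,
// and how many of those are backing live allocations
void logPoolStatus(const char* message) {
    cudaMemPool_t mempool;
    cudaDeviceGetDefaultMemPool(&mempool, 0);
    uint64_t reserved = 0, used = 0;
    cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrReservedMemCurrent, &reserved);
    cudaMemPoolGetAttribute(mempool, cudaMemPoolAttrUsedMemCurrent, &used);
    std::cout << message << " - pool reserved: " << reserved
              << " bytes, used: " << used << " bytes\n";
}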
I’m using CUDA 12.4.