I found a bug in CUDA-memcheck and Compute Sanitizer: neither tool detects memory leaks when cuMemAlloc is used together with OpenACC.
compiler: nvhpc/22.5
MPI: OpenMPI/4.1.4
Source code:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <mpi.h>
#include <openacc.h>
/* MPI rank of this process; written once in main() via MPI_Comm_rank. */
int rank;
/* Compile-time sizes used by the reproducer:
 *   N  - element count of the small in/out host arrays,
 *   GB - 2^30 (1024*1024*1024); used both as a byte divisor when printing
 *        memory sizes and as an element count for the large buffers. */
enum {
    N  = 10,
    GB = 1 << 30
};
// Wrap a CUDA Driver API call: on any result other than CUDA_SUCCESS, print
// the numeric code together with the driver's error name and description,
// plus the call site, then terminate the process with exit(-1).
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// `static inline` (rather than plain `inline`) so this translation unit always
// has a definition to link against, even when the call is not inlined (-O0).
//   err  - result code returned by the driver API call
//   file - source file of the call site (supplied by the macro)
//   line - source line of the call site (supplied by the macro)
static inline void __checkCudaErrors( CUresult err, const char *file, const int line )
{
    if (CUDA_SUCCESS == err) {
        return;
    }

    // Look up the symbolic name and human-readable description; fall back to
    // placeholders when the driver does not recognise the code.
    const char *err_name = NULL;
    const char *err_text = NULL;
    if (cuGetErrorName(err, &err_name) != CUDA_SUCCESS || err_name == NULL) {
        err_name = "UNKNOWN";
    }
    if (cuGetErrorString(err, &err_text) != CUDA_SUCCESS || err_text == NULL) {
        err_text = "no description available";
    }

    fprintf(stderr,
            "CUDA Driver API error = %04d (%s: %s) from file <%s>, line %i.\n",
            (int)err, err_name, err_text, file, line );
    exit(-1);
}
void print_mem_device() {
// cudaMemGetInfo ( size_t* free, size_t* total )
size_t free_bytes, total_bytes;
cuMemGetInfo( &free_bytes, &total_bytes );
printf("free: %zu GB, total: %zu GB.\n", free_bytes / GB, total_bytes / GB);
}
// Leak reproducer: mixes OpenACC data regions with raw CUDA Driver API
// allocations. Every cuMemAlloc below is DELIBERATELY never freed so that a
// leak checker has something to report. Device memory usage is printed after
// each step via print_mem_device().
//
// Host buffers a..e are checked after allocation and released before
// returning; only the device allocations are leaked.
void testCudaDriverAPIOpenACC() {
    // sizeof(int) * GB is evaluated in size_t (4 GiB when int is 4 bytes).
    const size_t buf_bytes = sizeof(int) * GB;
    int *a = malloc(buf_bytes);
    int *b = malloc(buf_bytes);
    int *c = malloc(buf_bytes);
    int *d = malloc(buf_bytes);
    int *e = malloc(buf_bytes);
    // The buffers are handed to OpenACC data clauses below; bail out early
    // instead of passing a NULL host pointer to the runtime.
    if (a == NULL || b == NULL || c == NULL || d == NULL || e == NULL) {
        fprintf(stderr, "host allocation of 5 x %zu bytes failed.\n", buf_bytes);
        free(a);
        free(b);
        free(c);
        free(d);
        free(e);
        exit(-1);
    }

    int in[N];
    int out[N];
    for (int i = 0; i < N; ++i) {
        in[i] = -1;
        out[i] = 1;
    }

    CUdeviceptr d_leak[5];   // one intentionally leaked allocation per step
    int test = 0;            // index of the current step / d_leak slot
    print_mem_device();

    #pragma acc data copy(a[0:GB])
    {
        // memory leak, d_data, d_leak is not explicitly freed.
        CUdeviceptr d_data;
        checkCudaErrors( cuMemAlloc(&d_leak[test], sizeof(int) * GB) );
        checkCudaErrors( cuMemAlloc(&d_data, sizeof(int) * N) );
        // Round-trip the small array through the device: in -> d_data -> out.
        checkCudaErrors( cuMemcpyHtoD(d_data, in, sizeof(int) * N) );
        checkCudaErrors( cuMemcpyDtoH(out, d_data, sizeof(int) * N) );
    }
    print_mem_device();
    test++;

    #pragma acc data copyin(b[0:GB])
    {
        // memory leak, d_leak is not explicitly freed.
        checkCudaErrors( cuMemAlloc(&d_leak[test], sizeof(int) * GB) );
    }
    print_mem_device();
    test++;

    #pragma acc data copyout(c[0:GB])
    {
        // memory leak, d_leak is not explicitly freed.
        checkCudaErrors( cuMemAlloc(&d_leak[test], sizeof(int) * GB) );
    }
    print_mem_device();
    test++;

    #pragma acc data create(d[0:GB])
    {
        // memory leak, d_leak is not explicitly freed.
        checkCudaErrors( cuMemAlloc(&d_leak[test], sizeof(int) * GB) );
    }
    print_mem_device();
    test++;

    #pragma acc data copy(e[0:GB])
    {
        ;
    }
    // allocate GPU memory outside openacc directives
    // (also intentionally leaked)
    checkCudaErrors( cuMemAlloc(&d_leak[test], sizeof(int) * GB) );
    print_mem_device();

    // All data regions have ended; release the host-side buffers. The device
    // allocations in d_leak / d_data are left alone on purpose.
    free(a);
    free(b);
    free(c);
    free(d);
    free(e);
}
// Entry point: bring up MPI, report the communicator size (rank 0 only) and
// this process's rank, run the leak reproducer once, then shut MPI down.
int main() {
    int world_size;

    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (0 == rank) {
        printf("size: %d.\n", world_size);
    }
    printf("rank: %d.\n", rank);

    // Reported behaviour: Compute Sanitizer crashed with a segmentation fault
    // here, and CUDA-memcheck reported no leak.
    // NOTE(review): the leak checker is documented to need the CUDA context
    // destroyed before exit in order to report leaks - confirm whether that
    // happens in this program.
    testCudaDriverAPIOpenACC();

    MPI_Finalize();
    return 0;
}
The compilation script is:
# Compile flags: unoptimised debug build, CUDA headers, OpenACC offload for
# compute capability 8.0 (cc80) without relocatable device code, plus
# compiler feedback (-Minfo).
CFLAGS="-O0 -g -I${CUDA_ROOT}/include -acc=gpu -gpu=cc80,nordc,debug -Minfo"
# Link flags: NVTX and the CUDA driver library, same OpenACC target options.
LDFLAGS="-g -lnvToolsExt -lcuda -acc=gpu -gpu=cc80,nordc,debug"
# Use the variables defined above instead of repeating the flags by hand;
# libraries are placed after the source file on the command line.
mpicc ${CFLAGS} -o mem_test mem_test.c ${LDFLAGS}
CUDA-memcheck results:
$ srun -n 1 cuda-memcheck --tool memcheck --leak-check full mem_test
$ size: 1
rank: 0
free: 0 GB, total: 0 GB.
free: 70 GB, total: 79 GB.
free: 66 GB, total: 79 GB.
free: 62 GB, total: 79 GB.
free: 58 GB, total: 79 GB.
free: 54 GB, total: 79 GB.
========== LEAK SUMMARY: 0 bytes leaked in 0 allocations
The Compute Sanitizer results are very similar.
The memory leaks held in d_leak were not detected.