Memory leak using cuFFTMp

I am attempting to get a sense of how quickly the cuFFTMp library can perform the FFTs I need for a simulation program, using the following code:

#include <mpi.h>
#include <cuda_runtime.h>
#include <cufftMp.h>
#include <iostream>
#include <random>
#include <vector>

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define CUFFT_CHECK(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true) {
    if (code != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFTassert: %d %s %d\n", code, file, line);
        if (abort) exit(code);
    }
}

// Function to generate random data
void generate_random(std::vector<float>& data, int seed, int rank, MPI_Comm comm) {
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> dist(-1, 1);
    for (size_t i = 0; i < data.size(); ++i) {
        data[i] = dist(gen);
        // Periodically print a value; the barrier matches across ranks
        // because every rank generates the same number of elements
        if (i % 100000 == 0) {
            MPI_Barrier(comm);
            printf("Rank %d: data[%zu] = %f\n", rank, i, data[i]);
        }
    }
}

void run_cuda_transform(size_t nx, size_t ny, float* cpu_data, MPI_Comm comm, const int rank, const int size, double& r2c_time, double& c2r_time) {
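    // Bind this rank to a GPU, round-robin over the visible devices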
    int ndevices;
    CUDA_CHECK(cudaGetDeviceCount(&ndevices));
    CUDA_CHECK(cudaSetDevice(rank % ndevices));
    printf("Hello from rank %d/%d using GPU %d\n", rank, size, rank % ndevices);

    cufftHandle plan_r2c = 0;
    cufftHandle plan_c2r = 0;
    cudaStream_t stream = nullptr;
    cudaLibXtDesc *desc = nullptr;

    CUDA_CHECK(cudaStreamCreate(&stream));
    CUFFT_CHECK(cufftCreate(&plan_r2c));
    CUFFT_CHECK(cufftCreate(&plan_c2r));

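    // Attach the MPI communicator so cuFFTMp can coordinate the distributed transform across ranks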
    CUFFT_CHECK(cufftMpAttachComm(plan_r2c, CUFFT_COMM_MPI, &comm));
    CUFFT_CHECK(cufftMpAttachComm(plan_c2r, CUFFT_COMM_MPI, &comm));

    CUFFT_CHECK(cufftSetStream(plan_r2c, stream));
    CUFFT_CHECK(cufftSetStream(plan_c2r, stream));

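    // Create the distributed 2D plans; workspace receives the required scratch size (unused afterwards)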
    size_t workspace;
    CUFFT_CHECK(cufftMakePlan2d(plan_r2c, nx, ny, CUFFT_R2C, &workspace));
    CUFFT_CHECK(cufftMakePlan2d(plan_c2r, nx, ny, CUFFT_C2R, &workspace));

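    // Allocate the distributed descriptor that holds this rank's slab of the (padded) in-place data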
    CUFFT_CHECK(cufftXtMalloc(plan_r2c, &desc, CUFFT_XT_FORMAT_INPLACE));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUFFT_CHECK(cufftXtMemcpy(plan_r2c, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Time the forward (R2C) transform; synchronize the stream before reading
    // the clock, since the exec call runs asynchronously on the stream
    double start_time = MPI_Wtime();
    CUFFT_CHECK(cufftXtExecDescriptor(plan_r2c, desc, desc, CUFFT_FORWARD));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    r2c_time = MPI_Wtime() - start_time;

    // Time the inverse (C2R) transform the same way
    start_time = MPI_Wtime();
    CUFFT_CHECK(cufftXtExecDescriptor(plan_c2r, desc, desc, CUFFT_INVERSE));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    c2r_time = MPI_Wtime() - start_time;
    CUFFT_CHECK(cufftXtMemcpy(plan_c2r, (void*)cpu_data, (void*)desc, CUFFT_COPY_DEVICE_TO_HOST));

    CUFFT_CHECK(cufftXtFree(desc));
    CUFFT_CHECK(cufftDestroy(plan_r2c));
    CUFFT_CHECK(cufftDestroy(plan_c2r));
    CUDA_CHECK(cudaStreamDestroy(stream));

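    // cudaFree(0) is effectively a no-op; this is the call later swapped for cudaDeviceReset(), as described below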
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(0));
}

void run_iterations(size_t nx, size_t ny, const int rank, const int size, int iterations) {
    MPI_Comm comm = MPI_COMM_WORLD;

    for (int i = 0; i < iterations; ++i) {
        // Each rank owns nx / size rows; the row length is padded from ny
        // to ny + 2 reals for the in-place R2C layout
        std::vector<float> data((ny + 2) * (nx / size), 1.0f);
        MPI_Barrier(comm);
        generate_random(data, rank, rank, comm);  // seed each rank with its own rank

        double r2c_time, c2r_time;
        double start_time = MPI_Wtime();
        run_cuda_transform(nx, ny, data.data(), comm, rank, size, r2c_time, c2r_time);
        double total_time = MPI_Wtime() - start_time;

        if (rank == 0) {
            printf("Iteration %d: R2C time: %f seconds, C2R time: %f seconds, Total time: %f seconds\n", i + 1, r2c_time, c2r_time, total_time);
        }

        printf("Rank %d: Memory freed after iteration %d\n", rank, i + 1);
    }
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    size_t nx = 2048;
    size_t ny = 4096;

    int iterations = 10;

    run_iterations(nx, ny, rank, size, iterations);

    MPI_Finalize();

    return 0;
}

When I run this code on 4 GPUs, one MPI task per GPU, the first 7 iterations complete successfully, but on the 8th NVSHMEM is unable to allocate memory. Specifically, I get:

src/mem/mem.cpp:248: non-zero status: 2 cuMemAddressReserve failed
src/init/init.cu:677: non-zero status: 7 nvshmem setup local heap failed

When I check nvidia-smi, I see that the memory usage increases after every iteration; that said, I have been unable to pin down where the leak is coming from. I also tried replacing CUDA_CHECK(cudaFree(0)) with CUDA_CHECK(cudaDeviceReset());. That does appear to fix the memory leak as reported by nvidia-smi, but it causes the program to segfault while initializing the data array on the second iteration. Using print statements, I have verified that the segfault sometimes occurs after initializing the 300,000th element and other times after only the 100,000th. Any insight into what is causing either of these problems would be greatly appreciated.
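For reference, the failing variant differs from the listing above only in the last two lines of run_cuda_transform (a minimal sketch of the swap described above; everything else is identical):

    // Original teardown:
    //   CUDA_CHECK(cudaDeviceSynchronize());
    //   CUDA_CHECK(cudaFree(0));
    // Variant that fixes the leak per nvidia-smi but segfaults on iteration 2:
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaDeviceReset());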