Memory leak using cuFFTMp

I am attempting to get a sense of how quickly the cuFFTMp library can perform the FFTs I need for a simulation program, using the following code:

#include <mpi.h>
#include <cuda_runtime.h>
#include <cufftMp.h>
#include <iostream>
#include <random>
#include <vector>

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define CUFFT_CHECK(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true) {
    if (code != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFTassert: %d %s %d\n", code, file, line);
        if (abort) exit(code);
    }
}

// Function to generate random data
void generate_random(std::vector<float>& data, int seed, int rank, MPI_Comm comm) {
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> dist(-1, 1);
    for (size_t i = 0; i < data.size(); ++i) {
        data[i] = dist(gen);
        // Periodically print a value; the barrier matches across ranks
        // because every rank generates the same number of elements
        if (i % 100000 == 0) {
            MPI_Barrier(comm);
            printf("Rank %d: data[%zu] = %f\n", rank, i, data[i]);
        }
    }
}

void run_cuda_transform(size_t nx, size_t ny, float* cpu_data, MPI_Comm comm, const int rank, const int size, double& r2c_time, double& c2r_time) {
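    // Bind this rank to a GPU, round-robin over the visible devices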
    int ndevices;
    CUDA_CHECK(cudaGetDeviceCount(&ndevices));
    CUDA_CHECK(cudaSetDevice(rank % ndevices));
    printf("Hello from rank %d/%d using GPU %d\n", rank, size, rank % ndevices);

    cufftHandle plan_r2c = 0;
    cufftHandle plan_c2r = 0;
    cudaStream_t stream = nullptr;
    cudaLibXtDesc *desc = nullptr;

    CUDA_CHECK(cudaStreamCreate(&stream));
    CUFFT_CHECK(cufftCreate(&plan_r2c));
    CUFFT_CHECK(cufftCreate(&plan_c2r));

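    // Attach the MPI communicator so cuFFTMp can coordinate the distributed transform across ranks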
    CUFFT_CHECK(cufftMpAttachComm(plan_r2c, CUFFT_COMM_MPI, &comm));
    CUFFT_CHECK(cufftMpAttachComm(plan_c2r, CUFFT_COMM_MPI, &comm));

    CUFFT_CHECK(cufftSetStream(plan_r2c, stream));
    CUFFT_CHECK(cufftSetStream(plan_c2r, stream));

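    // Create the distributed 2D plans; workspace receives the required scratch size (unused afterwards)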
    size_t workspace;
    CUFFT_CHECK(cufftMakePlan2d(plan_r2c, nx, ny, CUFFT_R2C, &workspace));
    CUFFT_CHECK(cufftMakePlan2d(plan_c2r, nx, ny, CUFFT_C2R, &workspace));

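    // Allocate the distributed descriptor that holds this rank's slab of the (padded) in-place data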
    CUFFT_CHECK(cufftXtMalloc(plan_r2c, &desc, CUFFT_XT_FORMAT_INPLACE));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUFFT_CHECK(cufftXtMemcpy(plan_r2c, (void*)desc, (void*)cpu_data, CUFFT_COPY_HOST_TO_DEVICE));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Time the forward (R2C) transform; synchronize the stream before reading
    // the clock, since the exec call runs asynchronously on the stream
    double start_time = MPI_Wtime();
    CUFFT_CHECK(cufftXtExecDescriptor(plan_r2c, desc, desc, CUFFT_FORWARD));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    r2c_time = MPI_Wtime() - start_time;

    // Time the inverse (C2R) transform the same way
    start_time = MPI_Wtime();
    CUFFT_CHECK(cufftXtExecDescriptor(plan_c2r, desc, desc, CUFFT_INVERSE));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    c2r_time = MPI_Wtime() - start_time;
    CUFFT_CHECK(cufftXtMemcpy(plan_c2r, (void*)cpu_data, (void*)desc, CUFFT_COPY_DEVICE_TO_HOST));

    CUFFT_CHECK(cufftXtFree(desc));
    CUFFT_CHECK(cufftDestroy(plan_r2c));
    CUFFT_CHECK(cufftDestroy(plan_c2r));
    CUDA_CHECK(cudaStreamDestroy(stream));

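    // cudaFree(0) is effectively a no-op; this is the call later swapped for cudaDeviceReset(), as described below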
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(0));
}

void run_iterations(size_t nx, size_t ny, const int rank, const int size, int iterations) {
    MPI_Comm comm = MPI_COMM_WORLD;

    for (int i = 0; i < iterations; ++i) {
        // Each rank owns nx / size rows; the row length is padded from ny
        // to ny + 2 reals for the in-place R2C layout
        std::vector<float> data((ny + 2) * (nx / size), 1.0f);
        MPI_Barrier(comm);
        generate_random(data, rank, rank, comm);  // seed each rank with its own rank

        double r2c_time, c2r_time;
        double start_time = MPI_Wtime();
        run_cuda_transform(nx, ny, data.data(), comm, rank, size, r2c_time, c2r_time);
        double total_time = MPI_Wtime() - start_time;

        if (rank == 0) {
            printf("Iteration %d: R2C time: %f seconds, C2R time: %f seconds, Total time: %f seconds\n", i + 1, r2c_time, c2r_time, total_time);
        }

        printf("Rank %d: Memory freed after iteration %d\n", rank, i + 1);
    }
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    size_t nx = 2048;
    size_t ny = 4096;

    int iterations = 10;

    run_iterations(nx, ny, rank, size, iterations);

    MPI_Finalize();

    return 0;
}

When I run this code on 4 GPUs, one MPI task per GPU, the first 7 iterations complete successfully, but on the 8th NVSHMEM is unable to allocate memory. Specifically, I get:

src/mem/mem.cpp:248: non-zero status: 2 cuMemAddressReserve failed
src/init/init.cu:677: non-zero status: 7 nvshmem setup local heap failed

When I check nvidia-smi, I see that the memory usage increases after every iteration; that said, I have been unable to pin down where the leak is coming from. I also tried replacing CUDA_CHECK(cudaFree(0)) with CUDA_CHECK(cudaDeviceReset());. That does appear to fix the memory leak as reported by nvidia-smi, but it causes the program to segfault while initializing the data array on the second iteration. Using print statements, I have verified that the segfault sometimes occurs after initializing the 300,000th element and other times after only the 100,000th. Any insight into what is causing either of these problems would be greatly appreciated.
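For reference, the failing variant differs from the listing above only in the last two lines of run_cuda_transform (a minimal sketch of the swap described above; everything else is identical):

    // Original teardown:
    //   CUDA_CHECK(cudaDeviceSynchronize());
    //   CUDA_CHECK(cudaFree(0));
    // Variant that fixes the leak per nvidia-smi but segfaults on iteration 2:
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaDeviceReset());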