CUDA API error for certain sizes of FFT (seems to be a bug?)

In CUDA 11.7 I am getting the following error for some FFT sizes when running under cuda-gdb:

Cuda API error detected: cuModuleGetFunction returned (0x1f4)

(0x1f4 is 500 decimal, i.e. CUDA_ERROR_NOT_FOUND.) The cuda-gdb output is below, followed by the smallest test case I could reduce it to, with comments on my runtime environment included. Please note that this occurs in both static and shared builds, with either icpc or g++.

 /usr/local/cuda/bin/cuda-gdb -ex "set cuda api_failure stop" ~/software/test_gdb_shared 
NVIDIA (R) CUDA Debugger
11.7 release

[SOME TEXT REMOVED FOR CLARITY]

Reading symbols from /home/himesb/software/test_gdb_shared...
(No debugging symbols found in /home/himesb/software/test_gdb_shared)
(cuda-gdb) r
Starting program: /home/himesb/software/test_gdb_shared 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Trying size: 512
[Detaching after fork from child process 530600]
[New Thread 0x7fffe77df000 (LWP 530604)]
[New Thread 0x7fffe6fde000 (LWP 530605)]
[New Thread 0x7fffe3fff000 (LWP 530606)]
Success for size!512
Trying size: 192
Success for size!192
Trying size: 384
Cuda API error detected: cuModuleGetFunction returned (0x1f4)
(cuda-gdb) bt
#0  0x00007fffeda3ffb0 in cudbgReportDriverApiError () from /lib/x86_64-linux-gnu/libcuda.so.1
#1  0x00007fffedd85d4a in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#2  0x00007fffedcb828c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#3  0x00007fffedb498ea in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#4  0x00007fffef790283 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#5  0x00007fffef78bb11 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#6  0x00007fffef7fd8d0 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#7  0x00007fffef792702 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#8  0x00007fffef7935c5 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#9  0x00007fffef5a5771 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#10 0x00007fffef5a624e in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#11 0x00007fffef5a655c in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#12 0x00007fffef59eb8f in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#13 0x00007fffef59b153 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#14 0x00007fffef5b1220 in cufftXtMakePlanMany () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#15 0x0000000000402581 in main ()
(cuda-gdb) 

Here is a minimal sample:

#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <cuda.h>
#include <cufft.h>
#include <cufftXt.h>

// clang-format off

// Error checking with synchronization
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if (status != cudaSuccess) { std::cerr << cudaGetErrorString(status) << " :-> "; } }
#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); };
#define precheck { cudaErr(cudaGetLastError()); }
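
// Caveat on the macros above: cufftResult codes (CUFFT_*) are a different enum
// than cudaError_t, so the cast only reliably distinguishes success from failure
// for cuFFT calls; the string from cudaGetErrorString() can be misleading when
// the error actually comes from cuFFT.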

// compile line for static and shared resp. (same behavior when g++ is used)

// /usr/local/cuda/bin/nvcc --ccbin icpc -G --cudart static --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math  -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft_static -lculibos -lcudart_static -lrt -c test_gdb.cu -o test_gdb
// /usr/local/cuda/bin/nvcc -ccbin icpc -G --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math  -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft -lculibos -lcudart -lrt test_gdb.cu -o test_gdb_shared

int main( ) {


    // Things die for 384 (similarly 768 among other sizes)
    std::vector<int> sizes = {512,192,384};

    for (auto nxy : sizes) {

        std::cout << "Trying size: " << nxy << std::endl;

        int3 dims = make_int3(nxy,nxy,1);

        cufftReal* real_values_gpu;
        cufftComplex* complex_values_gpu;

        // NOTE: an in-place R2C transform needs FFTW-style padding, i.e. room for
        // 2*(dims.x/2 + 1) floats per row (see the updated case below); the plan
        // creation error occurs either way.
        cudaErr(cudaMalloc(&real_values_gpu, sizeof(cufftReal) * 2 * (dims.x / 2 + 1) * dims.y * dims.z));
        complex_values_gpu = (cufftComplex*)real_values_gpu;

        cufftHandle cuda_plan_forward;
        size_t      cuda_plan_worksize_forward;

        int            rank;
        long long int* fftDims;

        cudaErr(cufftCreate(&cuda_plan_forward));
        cudaErr(cufftSetStream(cuda_plan_forward, cudaStreamPerThread));

        rank    = 2;
        fftDims = new long long int[rank];

        fftDims[0] = dims.y;
        fftDims[1] = dims.x;

        int iBatch = 1;
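
        // Passing NULL for inembed/onembed below selects the basic (packed) data
        // layout; per the cuFFT docs, the stride and distance arguments are then
        // ignored.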

        cudaErr(cufftXtMakePlanMany(cuda_plan_forward, rank, fftDims,
                                    NULL, NULL, NULL, CUDA_R_32F,
                                    NULL, NULL, NULL, CUDA_C_32F, iBatch, &cuda_plan_worksize_forward, CUDA_C_32F));


        // Get any error prior to the call to cufftExecR2C; postcheck then syncs on cudaStreamPerThread and checks for errors.
        precheck
        cudaErr(cufftExecR2C(cuda_plan_forward, (cufftReal*)real_values_gpu, (cufftComplex*)complex_values_gpu));
        postcheck

        cudaErr(cudaFree(real_values_gpu));
        cudaErr(cufftDestroy(cuda_plan_forward));
        delete[] fftDims; // plug the leak from the new[] above

        std::cout << "Success for size!" << nxy << std::endl;
    }


    return 0;
}

**And some extra details about the setup:**

icpc --version
icpc (ICC) 2021.4.0 20210910
Copyright (C) 1985-2021 Intel Corporation. All rights reserved.

g++ (Ubuntu 11.1.0-1ubuntu1~20.04) 11.1.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

/usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_May__3_18:49:52_PDT_2022
Cuda compilation tools, release 11.7, V11.7.64
Build cuda_11.7.r11.7/compiler.31294372_0

NVIDIA-SMI 515.43.04 Driver Version: 515.43.04 CUDA Version: 11.7

Distributor ID: Ubuntu
Description: Ubuntu 20.04.4 LTS
Release: 20.04
Codename: focal

I added a necessary-but-not-sufficient correctness test to see whether the problem lies with the FFT itself or somehow with cuda-gdb (or other debugging info).

The test:
The starting array is constant (1), so after a forward FFT the DC component should be (N×N) + 0i and all other entries should be 0 + 0i.
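
For reference, this is just the unnormalized 2D DFT of an all-ones N×N array evaluated at the origin:

X[0,0] = \sum_{x=0}^{N-1} \sum_{y=0}^{N-1} 1 \cdot e^{0} = N \cdot N = N^2

while every other bin sums a complete set of N-th roots of unity and cancels to 0 + 0i.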

The result:
The forward FFT appears to give the correct answer even for size 384, while the API error reported in the initial post is still present for that size, among others.

Updated reproducible case with correctness check:

  • Note that I also fixed two errors in the suggested compile line (--ccbin → -ccbin, and removed the -c flag so we get a binary).

#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <cuda.h>
#include <cufft.h>
#include <cufftXt.h>

// clang-format off

// Error checking with synchronization
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if (status != cudaSuccess) { std::cerr << cudaGetErrorString(status) << " :-> ";  std::cerr << __LINE__ << std::endl;} }
#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); };
#define precheck { cudaErr(cudaGetLastError()); }

// compile line for static and shared resp. (same behavior when g++ is used)

// /usr/local/cuda/bin/nvcc -ccbin icpc --cudart static --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math  -Xptxas --generate-line-info,--warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft_static -lculibos -lcudart_static -lrt test_gdb.cu -o test_gdb
// /usr/local/cuda/bin/nvcc -ccbin icpc --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math  -Xptxas --generate-line-info,--warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft -lculibos -lcudart -lrt test_gdb.cu -o test_gdb_shared

int main( ) {


    // Things die for 384 (similarly 768 among other sizes)
    std::vector<int> sizes = {512,192,384};

    for (auto nxy : sizes) {

        std::cout << "Trying size: " << nxy << std::endl;

        int3 dims = make_int3(nxy,nxy,1);

        cufftReal* real_values;
        cufftReal* real_values_gpu;
        cufftComplex* complex_values_gpu;

        // Allocating with FFTW-style padding for an in-place transform
        int n_elements = 2*(dims.x/2+1) * dims.y * dims.z;
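        // e.g. for nxy = 384: n_elements = 2*(384/2 + 1) * 384 * 1 = 386 * 384 = 148224
        // floats, versus 384*384 = 147456 without the padding elements.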

        real_values = new float[n_elements];
        cudaErr(cudaMalloc(&real_values_gpu, sizeof(cufftReal) * n_elements));
        complex_values_gpu = (cufftComplex*)real_values_gpu;

        // Necessary but not sufficient correctness test:
        // Set everything (including the padding) to 1.0f so that after the forward
        // transform, complex_values_gpu[0] should be nxy*nxy and all other values
        // should be zero.
        for (int i = 0; i < n_elements; i++) {
            real_values[i] = 1.0f;
        }

        cudaErr(cudaMemcpyAsync(real_values_gpu,real_values,sizeof(cufftReal) * n_elements,cudaMemcpyHostToDevice, cudaStreamPerThread));

        cufftHandle cuda_plan_forward;
        size_t      cuda_plan_worksize_forward;

        int            rank;
        long long int* fftDims;

        cudaErr(cufftCreate(&cuda_plan_forward));
        cudaErr(cufftSetStream(cuda_plan_forward, cudaStreamPerThread));

        rank    = 2;
        fftDims = new long long int[rank];

        fftDims[0] = dims.y;
        fftDims[1] = dims.x;

        int iBatch = 1;

        cudaErr(cufftXtMakePlanMany(cuda_plan_forward, rank, fftDims,
                                    NULL, NULL, NULL, CUDA_R_32F,
                                    NULL, NULL, NULL, CUDA_C_32F, iBatch, &cuda_plan_worksize_forward, CUDA_C_32F));


        // Get any error prior to the call to cufftExecR2C; postcheck then syncs on cudaStreamPerThread and checks for errors.
        precheck
        cudaErr(cufftExecR2C(cuda_plan_forward, (cufftReal*)real_values_gpu, (cufftComplex*)complex_values_gpu));
        postcheck

        // Copy the result back in the same stream as the FFT, then wait on it so
        // the host buffer is valid before the device memory is freed.
        cudaErr(cudaMemcpyAsync(real_values, real_values_gpu, sizeof(cufftReal) * n_elements, cudaMemcpyDeviceToHost, cudaStreamPerThread));
        cudaErr(cudaStreamSynchronize(cudaStreamPerThread));

        cudaErr(cudaFree(real_values_gpu));
        cudaErr(cufftDestroy(cuda_plan_forward));

        // Interpreted as complex data there is no row padding, so the first nxy floats are all valid output.
        bool passed = true;
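        // (Exact float comparison on the DC bin works in practice for these sizes;
        // a small tolerance like the one used for the other bins would be more
        // robust in general.)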
        for (int i = 0; i < nxy; i++) {
            if (i == 0) {
                if (real_values[0] != nxy*nxy || fabsf(real_values[1]) > 1e-6) {
                    std::cerr << "The DC component is not correct" << std::endl;
                    std::cerr << "Error: " << real_values[0] << " " << real_values[1] << std::endl;
                    passed = false;
                }
            }
            else {
                if (fabsf(real_values[i]) > 1e-6) {
                    std::cerr << "The non-DC values should all be zero" << std::endl;
                    std::cerr << "Error: " << real_values[i] << std::endl;   
                    passed = false;           
                }
            }

        }


        if (passed) {
            std::cout << "Success for this size, with DC component as expected: " << real_values[0] << "," << real_values[1] << std::endl;
        }

        delete [] fftDims;
        delete [] real_values;

    }


    return 0;
}
