In CUDA 11.7 I am getting the following error for some sizes of FFT when running in cuda-gdb:
Cuda API error detected: cuModuleGetFunction returned (0x1f4)
The output from cuda-gdb is below, and the smallest test case I could get, with comments on my runtime environment included. Please note, this occurs in both static and shared builds, with either icpc or g++.
/usr/local/cuda/bin/cuda-gdb -ex "set cuda api_failure stop" ~/software/test_gdb_shared
NVIDIA (R) CUDA Debugger
11.7 release
[SOME TEXT REMOVED FOR CLARITY]
Reading symbols from /home/himesb/software/test_gdb_shared...
(No debugging symbols found in /home/himesb/software/test_gdb_shared)
(cuda-gdb) r
Starting program: /home/himesb/software/test_gdb_shared
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Trying size: 512
[Detaching after fork from child process 530600]
[New Thread 0x7fffe77df000 (LWP 530604)]
[New Thread 0x7fffe6fde000 (LWP 530605)]
[New Thread 0x7fffe3fff000 (LWP 530606)]
Success for size!512
Trying size: 192
Success for size!192
Trying size: 384
Cuda API error detected: cuModuleGetFunction returned (0x1f4)
(cuda-gdb) bt
#0 0x00007fffeda3ffb0 in cudbgReportDriverApiError () from /lib/x86_64-linux-gnu/libcuda.so.1
#1 0x00007fffedd85d4a in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#2 0x00007fffedcb828c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007fffedb498ea in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007fffef790283 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#5 0x00007fffef78bb11 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#6 0x00007fffef7fd8d0 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#7 0x00007fffef792702 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#8 0x00007fffef7935c5 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#9 0x00007fffef5a5771 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#10 0x00007fffef5a624e in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#11 0x00007fffef5a655c in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#12 0x00007fffef59eb8f in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#13 0x00007fffef59b153 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#14 0x00007fffef5b1220 in cufftXtMakePlanMany () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#15 0x0000000000402581 in main ()
(cuda-gdb)
Here is a minimal sample that reproduces the error:
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cufft.h>
#include <cufftXt.h>
// clang-format off
// Error checking with synchronization.
// NOTE: cudaErr static_casts its argument to cudaError_t, so cufftResult values
// passed to it (as done in main below) are reinterpreted as CUDA runtime error
// codes and cudaGetErrorString will print an unrelated message for them.
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if (status != cudaSuccess) { std::cerr << cudaGetErrorString(status) << " :-> "; } }
// postcheck: surface any pending launch error, then synchronize the per-thread
// stream and check the result of the sync as well.
#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); };
// precheck: read-and-clear any error left over from earlier calls so it is not
// misattributed to the next one. Both macros expand to bare blocks and are
// invoked without a trailing semicolon (see main).
#define precheck { cudaErr(cudaGetLastError()); }
// compile line for static and shared resp. (same behavior when g++ is used)
// /usr/local/cuda/bin/nvcc --ccbin icpc -G --cudart static --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft_static -lculibos -lcudart_static -lrt -c test_gdb.cu -o test_gdb
// /usr/local/cuda/bin/nvcc -ccbin icpc -G --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft -lculibos -lcudart -lrt test_gdb.cu -o test_gdb_shared
// Minimal reproducer: cufftXtMakePlanMany triggers a cuModuleGetFunction
// failure (0x1f4 == 500 == CUDA_ERROR_NOT_FOUND) for some sizes (e.g. 384,
// 768) when run under cuda-gdb with "set cuda api_failure stop".
// Returns 0; errors are reported to stderr by the checkers but do not abort.
int main( ) {
    // cuFFT calls return cufftResult, not cudaError_t; check them with their
    // own helper instead of funneling them through cudaErr, which would cast
    // the status and print an unrelated CUDA runtime error string.
    auto cufftErr = [](cufftResult status, const char* what) {
        if (status != CUFFT_SUCCESS) {
            std::cerr << what << " failed with cufftResult " << status << " :-> ";
        }
    };

    // Things die for 384 (similarly 768 among other sizes)
    std::vector<int> sizes = {512, 192, 384};
    for (auto nxy : sizes) {
        std::cout << "Trying size: " << nxy << std::endl;
        int3 dims = make_int3(nxy, nxy, 1);

        // This is an IN-PLACE R2C transform (the complex pointer aliases the
        // real pointer), so each row of the real array must be padded to
        // 2 * (nx/2 + 1) reals so the (nx/2 + 1) complex outputs fit in the
        // same buffer. The unpadded allocation in the original overflowed.
        const size_t padded_x = 2 * (static_cast<size_t>(dims.x) / 2 + 1);

        cufftReal*    real_values_gpu;
        cufftComplex* complex_values_gpu;
        cudaErr(cudaMalloc(&real_values_gpu,
                           sizeof(cufftReal) * padded_x * dims.y * dims.z));
        complex_values_gpu = reinterpret_cast<cufftComplex*>(real_values_gpu);

        cufftHandle cuda_plan_forward;
        size_t      cuda_plan_worksize_forward;
        cufftErr(cufftCreate(&cuda_plan_forward), "cufftCreate");
        cufftErr(cufftSetStream(cuda_plan_forward, cudaStreamPerThread), "cufftSetStream");

        // Logical FFT dimensions, slowest-varying first (y then x). A stack
        // array replaces the original per-iteration `new[]` that was never
        // delete[]d (leaked every loop pass).
        constexpr int rank = 2;
        long long int fftDims[rank] = {dims.y, dims.x};

        int iBatch = 1;
        cufftErr(cufftXtMakePlanMany(cuda_plan_forward, rank, fftDims,
                                     NULL, NULL, NULL, CUDA_R_32F,
                                     NULL, NULL, NULL, CUDA_C_32F, iBatch,
                                     &cuda_plan_worksize_forward, CUDA_C_32F),
                 "cufftXtMakePlanMany");

        // Get any error prior to the call to cufftExecR2C, then postcheck
        // syncs on cudaStreamPerThread and checks for errors.
        precheck
        cufftErr(cufftExecR2C(cuda_plan_forward, real_values_gpu, complex_values_gpu),
                 "cufftExecR2C");
        postcheck

        cudaErr(cudaFree(real_values_gpu));
        cufftErr(cufftDestroy(cuda_plan_forward), "cufftDestroy");
        std::cout << "Success for size!" << nxy << std::endl;
    }
    return 0;
}
**And some extra details about the setup:**
Some extra details:
icpc --version
icpc (ICC) 2021.4.0 20210910
Copyright (C) 1985-2021 Intel Corporation. All rights reserved.
g++ (Ubuntu 11.1.0-1ubuntu1~20.04) 11.1.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
/usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_May__3_18:49:52_PDT_2022
Cuda compilation tools, release 11.7, V11.7.64
Build cuda_11.7.r11.7/compiler.31294372_0
NVIDIA-SMI 515.43.04 Driver Version: 515.43.04 CUDA Version: 11.7
Distributor ID: Ubuntu
Description: Ubuntu 20.04.4 LTS
Release: 20.04
Codename: focal