In CUDA 11.7 I am getting the following error for some sizes of FFT when running in cuda-gdb:
Cuda API error detected: cuModuleGetFunction returned (0x1f4)
The output from cuda-gdb is below, and the smallest test case I could get, with comments on my runtime environment included. Please note, this occurs in both static and shared builds, with either icpc or g++.
/usr/local/cuda/bin/cuda-gdb -ex "set cuda api_failure stop" ~/software/test_gdb_shared
NVIDIA (R) CUDA Debugger
11.7 release
[SOME TEXT REMOVED FOR CLARITY]
Reading symbols from /home/himesb/software/test_gdb_shared...
(No debugging symbols found in /home/himesb/software/test_gdb_shared)
(cuda-gdb) r
Starting program: /home/himesb/software/test_gdb_shared
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Trying size: 512
[Detaching after fork from child process 530600]
[New Thread 0x7fffe77df000 (LWP 530604)]
[New Thread 0x7fffe6fde000 (LWP 530605)]
[New Thread 0x7fffe3fff000 (LWP 530606)]
Success for size!512
Trying size: 192
Success for size!192
Trying size: 384
Cuda API error detected: cuModuleGetFunction returned (0x1f4)
(cuda-gdb) bt
#0 0x00007fffeda3ffb0 in cudbgReportDriverApiError () from /lib/x86_64-linux-gnu/libcuda.so.1
#1 0x00007fffedd85d4a in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#2 0x00007fffedcb828c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007fffedb498ea in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007fffef790283 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#5 0x00007fffef78bb11 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#6 0x00007fffef7fd8d0 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#7 0x00007fffef792702 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#8 0x00007fffef7935c5 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#9 0x00007fffef5a5771 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#10 0x00007fffef5a624e in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#11 0x00007fffef5a655c in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#12 0x00007fffef59eb8f in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#13 0x00007fffef59b153 in ?? () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#14 0x00007fffef5b1220 in cufftXtMakePlanMany () from /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufft.so.10
#15 0x0000000000402581 in main ()
(cuda-gdb)
Here is a minimal sample that reproduces the error:
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cufft.h>
#include <cufftXt.h>
// clang-format off
// Error checking with synchronization.
// NOTE: cudaErr static_casts its argument to cudaError_t, so cufftResult values
// passed to it (as done in main below) are reinterpreted as CUDA runtime error
// codes and cudaGetErrorString will print an unrelated message for them.
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if (status != cudaSuccess) { std::cerr << cudaGetErrorString(status) << " :-> "; } }
// postcheck: surface any pending launch error, then synchronize the per-thread
// stream and check the result of the sync as well.
#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); };
// precheck: read-and-clear any error left over from earlier calls so it is not
// misattributed to the next one. Both macros expand to bare blocks and are
// invoked without a trailing semicolon (see main).
#define precheck { cudaErr(cudaGetLastError()); }
// compile line for static and shared resp. (same behavior when g++ is used)
// /usr/local/cuda/bin/nvcc --ccbin icpc -G --cudart static --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft_static -lculibos -lcudart_static -lrt -c test_gdb.cu -o test_gdb
// /usr/local/cuda/bin/nvcc -ccbin icpc -G --extra-device-vectorization -std=c++17 --expt-relaxed-constexpr -t8 --default-stream per-thread -m64 -O2 --use_fast_math -Xptxas --warn-on-local-memory-usage,--warn-on-spills, -I/usr/local/cuda/include -L/usr/local/cuda/$libdir -lcufft -lculibos -lcudart -lrt test_gdb.cu -o test_gdb_shared
// Minimal reproducer: cufftXtMakePlanMany triggers a cuModuleGetFunction
// failure (0x1f4 == 500 == CUDA_ERROR_NOT_FOUND) for some sizes (e.g. 384,
// 768) when run under cuda-gdb with "set cuda api_failure stop".
// Returns 0; errors are reported to stderr by the checkers but do not abort.
int main( ) {
    // cuFFT calls return cufftResult, not cudaError_t; check them with their
    // own helper instead of funneling them through cudaErr, which would cast
    // the status and print an unrelated CUDA runtime error string.
    auto cufftErr = [](cufftResult status, const char* what) {
        if (status != CUFFT_SUCCESS) {
            std::cerr << what << " failed with cufftResult " << status << " :-> ";
        }
    };

    // Things die for 384 (similarly 768 among other sizes)
    std::vector<int> sizes = {512, 192, 384};
    for (auto nxy : sizes) {
        std::cout << "Trying size: " << nxy << std::endl;
        int3 dims = make_int3(nxy, nxy, 1);

        // This is an IN-PLACE R2C transform (the complex pointer aliases the
        // real pointer), so each row of the real array must be padded to
        // 2 * (nx/2 + 1) reals so the (nx/2 + 1) complex outputs fit in the
        // same buffer. The unpadded allocation in the original overflowed.
        const size_t padded_x = 2 * (static_cast<size_t>(dims.x) / 2 + 1);

        cufftReal*    real_values_gpu;
        cufftComplex* complex_values_gpu;
        cudaErr(cudaMalloc(&real_values_gpu,
                           sizeof(cufftReal) * padded_x * dims.y * dims.z));
        complex_values_gpu = reinterpret_cast<cufftComplex*>(real_values_gpu);

        cufftHandle cuda_plan_forward;
        size_t      cuda_plan_worksize_forward;
        cufftErr(cufftCreate(&cuda_plan_forward), "cufftCreate");
        cufftErr(cufftSetStream(cuda_plan_forward, cudaStreamPerThread), "cufftSetStream");

        // Logical FFT dimensions, slowest-varying first (y then x). A stack
        // array replaces the original per-iteration `new[]` that was never
        // delete[]d (leaked every loop pass).
        constexpr int rank = 2;
        long long int fftDims[rank] = {dims.y, dims.x};

        int iBatch = 1;
        cufftErr(cufftXtMakePlanMany(cuda_plan_forward, rank, fftDims,
                                     NULL, NULL, NULL, CUDA_R_32F,
                                     NULL, NULL, NULL, CUDA_C_32F, iBatch,
                                     &cuda_plan_worksize_forward, CUDA_C_32F),
                 "cufftXtMakePlanMany");

        // Get any error prior to the call to cufftExecR2C, then postcheck
        // syncs on cudaStreamPerThread and checks for errors.
        precheck
        cufftErr(cufftExecR2C(cuda_plan_forward, real_values_gpu, complex_values_gpu),
                 "cufftExecR2C");
        postcheck

        cudaErr(cudaFree(real_values_gpu));
        cufftErr(cufftDestroy(cuda_plan_forward), "cufftDestroy");
        std::cout << "Success for size!" << nxy << std::endl;
    }
    return 0;
}
**And some extra details about the setup:**
Some extra details:
icpc --version
icpc (ICC) 2021.4.0 20210910
Copyright (C) 1985-2021 Intel Corporation. All rights reserved.
g++ (Ubuntu 11.1.0-1ubuntu1~20.04) 11.1.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
/usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_May__3_18:49:52_PDT_2022
Cuda compilation tools, release 11.7, V11.7.64
Build cuda_11.7.r11.7/compiler.31294372_0
NVIDIA-SMI 515.43.04 Driver Version: 515.43.04 CUDA Version: 11.7
Distributor ID: Ubuntu
Description: Ubuntu 20.04.4 LTS
Release: 20.04
Codename: focal