I’ve identified a bug in CUDA. Basically if you try to run an FFT after you launch a kernel with 0 threads the FFT fails.
Please observe the following fft.cu code example. This is for cuda 2.1 by the way.
[codebox]
#include <stdio.h>
#include <math_functions.h>
#include “cublas.h”
#include “cufft.h”
#include “cutil.h”
global void vec_set(float *v, unsigned off)
{
unsigned j = threadIdx.x + blockIdx.x * blockDim.x + off ;
unsigned j2 = j << 1 ;
v[j2] = 1.0 ;
v[j2+1] = 0.0 ;
}
int main(int argc, char *argv)
{
char ch ;
cublasInit() ;
#define NX 256
#define BATCH 1
dim3 block ;
dim3 grid ;
dim3 residual ;
block.x = NX ;
block.y = BATCH ;
grid.x = 1 ;
residual.x = 0 ;
residual.y = block.y ;
cufftHandle plan ;
cufftComplex *data ;
cudaMalloc((void**)&data, sizeof(cufftComplex)NXBATCH);
/* Create a 1D FFT plan. */
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
/* run a kernel */
vec_set<<<grid,block>>>((float *)data, 0) ;
/* a kernel with 0 threads. The next statement causes the FFT to fail */
vec_set<<<1,residual>>>((float *)data, block.x * grid.x) ;
/* Use the CUFFT plan to transform the signal in place. */
cufftExecC2C(plan, data, data, CUFFT_FORWARD);
/* Inverse transform the signal in place. */
cufftExecC2C(plan, data, data, CUFFT_INVERSE);
cublasShutdown() ;
printf(“Goodbye Cruel World\n”) ;
ch = getchar() ;
}
[/codebox]
The kernel launched with the residual dim3 box that has the zero dimension appears to be the culprit.
Execution fails with these errors:
cufft: ERROR: D:/Bld/rel/gpgpu/toolkit/r2.1/cufft/src/execute.cu, line 1070
cufft: ERROR: CUFFT_EXEC_FAILED
cufft: ERROR: D:/Bld/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 151
cufft: ERROR: CUFFT_EXEC_FAILED
Goodbye Cruel World
By the way the 0 threads instantiation has worked in all other contexts with nary a problem. So far it only kills the FFT call.