CUFFT with size 1: I guess this is a bug

It seems my thread in the Linux programming forum didn’t get much attention there.

I’m currently integrating CUFFT into a big software package. The package came with a test program for FFT. CUFFT failed because the test program passes it an input array of size 1. Yes, I know it doesn’t really make sense to calculate the FFT of an array of size 1, but I still expect it to give the correct answer (even if it is trivial) instead of returning the CUFFT_INVALID_PLAN error.

So is this considered a bug? From what I can see, it could be addressed with a simple IF case, with no need to go through the kernel code when the input size is 1. I don’t think programmers should be expected to be aware of such a limitation of CUFFT. I’ve also tested this with FFTW, and FFTW does give the correct answer for an FFT of size 1.

This was tested on Fedora Core 8 (with CUDA 1.1 installed) as well as on a Mac (with CUDA 2.0 installed).

My compilation command:

nvcc -g -I ${MYSDK}/common/inc -L ${MYSDK}/lib -L ${MYSDK}/common/lib -lcudart -lcufft -lcutil -lfftw3f -o compareFFT compareFFT.cu

The code looks like this (slightly modified before being posted here; let me know if it doesn’t compile):

// includes, project

#include <errno.h>

#include <limits.h>

#include <stdio.h>

#include <stdlib.h>

#include <cufft.h>

#include <cutil.h>

#include <fftw3.h>

// Number of complex data points in each transform; main() copies it into `num`

#define	NUM 1

// Default number of repetitions for the timing loops, used when no
// command-line argument is given

#define TIMES	1	

// Executing a forward complex-to-complex CUFFT on `num` points of `data`:
// the input is copied to the device, transformed in place there, and the
// result is copied into `dataOut`, which is also the return value

cufftComplex *runfft(cufftComplex *data, cufftComplex *dataOut, int num);

// Executing fftw with single precision: forward 1-D transform of `num`
// points from `in` into `out`

void runFFTWf(int num, fftwf_complex *in, fftwf_complex *out);

// Program used to calculate the FFT of a simple array with both CUFFT and FFTW.

/*
 * Runs the same forward complex-to-complex FFT of NUM points through CUFFT
 * and through FFTW, times `times` repetitions of each, and prints both
 * results so they can be compared.
 *
 * argv[1] (optional): number of repetitions, a non-negative decimal integer.
 *                     Defaults to TIMES when absent.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on a malformed argument or a
 * failed host allocation.
 */
int main( int argc, char **argv)
{
    /* All locals are declared up front so that `goto cleanup` never jumps
     * over an initialisation (this file is compiled as C++ by nvcc). */
    unsigned int timer = 0;
    int i;
    int num = NUM;
    int times = TIMES;
    int status = EXIT_FAILURE;
    fftwf_complex *in = NULL;
    fftwf_complex *out = NULL;
    cufftComplex *data = NULL;
    cufftComplex *dataOut = NULL;

    if( argc > 1 )
    {
        char *end;
        long requested;

        /* strtol instead of atoi: reject garbage, negatives and overflow
         * rather than silently running zero (or a wrapped number of) times. */
        errno = 0;
        requested = strtol(argv[1], &end, 10);
        if( end == argv[1] || *end != '\0' || errno == ERANGE
            || requested < 0 || requested > INT_MAX )
        {
            fprintf( stderr, "usage: %s [repetitions >= 0]\n", argv[0] );
            return EXIT_FAILURE;
        }
        times = (int)requested;
    }

    // Initialising the input data
    in = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * num);
    out = (fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex) * num);
    data = (cufftComplex *)malloc(sizeof(cufftComplex) * num);
    dataOut = (cufftComplex *)malloc(sizeof(cufftComplex) * num);
    if( in == NULL || out == NULL || data == NULL || dataOut == NULL )
    {
        fprintf( stderr, "out of host memory for %d data points\n", num );
        goto cleanup;
    }

    for( i = 0; i < num; i++ )
    {
        data[i].x = (float)i+1;
        data[i].y = (float)i+1;
        in[i][0] = (float)i+1;
        in[i][1] = (float)i+1;

        /* Give the outputs a defined value so the result printout below is
         * well-defined even when `times` is 0 or a transform fails. */
        dataOut[i].x = 0.0f;
        dataOut[i].y = 0.0f;
        out[i][0] = 0.0f;
        out[i][1] = 0.0f;
    }

    // Print the input data, comment out when the data set is large
    printf( "%d data points\n\n", num );
    for( i = 0; i < num; i++ )
    {
        printf( "%d -- %f + %fi\n", i+1, in[i][0], in[i][1] );
    }

    // Timing the cuda fft (includes device allocation and transfers)
    CUT_SAFE_CALL( cutCreateTimer(&timer) );
    CUT_SAFE_CALL( cutStartTimer(timer) );
    for( i = 0; i < times; i++ )
    {
        /* runfft fills dataOut in place and returns that same pointer, so
         * ownership of the buffer stays here. */
        runfft(data, dataOut, num);
    }
    CUT_SAFE_CALL( cutStopTimer(timer) );
    printf( "Time for doing %d times CUFFT: \t%f (ms)\n", times, cutGetTimerValue(timer) );
    CUT_SAFE_CALL(cutDeleteTimer(timer) );

    // Timing the fftw (includes plan creation)
    CUT_SAFE_CALL( cutCreateTimer(&timer) );
    CUT_SAFE_CALL( cutStartTimer(timer) );
    for( i = 0; i < times; i++ )
    {
        runFFTWf(num, in, out);
    }
    CUT_SAFE_CALL( cutStopTimer(timer) );
    printf( "Time for doing %d times FFTW: \t%f (ms)\n", times, cutGetTimerValue(timer) );
    CUT_SAFE_CALL(cutDeleteTimer(timer) );

    // Print the result fft
    printf( "FFTW results:\n" );
    for( i = 0; i < num; i++ )
    {
        printf( "%d -- %f + %fi\n", i+1, out[i][0], out[i][1] );
    }

    printf( "CUFFT results: \n" );
    for( i = 0; i < num; i++ )
    {
        printf( "%d -- %f + %fi\n", i+1, dataOut[i].x, dataOut[i].y );
    }

    status = EXIT_SUCCESS;

cleanup:
    /* Release every host buffer, including dataOut (previously leaked). */
    free(data);
    free(dataOut);
    if( in != NULL )
    {
        fftwf_free(in);
    }
    if( out != NULL )
    {
        fftwf_free(out);
    }

    return status;
}

/*
 * Computes the forward 1-D complex-to-complex FFT of `data` with CUFFT.
 *
 * data:    host input array holding `num` complex values.
 * dataOut: host output array with room for `num` complex values.
 * num:     number of complex points in the transform.
 *
 * Returns dataOut in every case, so existing callers keep working.
 *
 * A 1-point DFT is the identity (X[0] = x[0]), so that case is answered on
 * the host without creating a plan; the CUFFT releases shipped with CUDA 1.1
 * and 2.0 were reported to fail with CUFFT_INVALID_PLAN for it.
 *
 * Every CUDA/CUFFT call is checked. On failure a diagnostic is written to
 * stderr, device resources are released, and the contents of dataOut are
 * left as they were on entry.
 */
cufftComplex *runfft(cufftComplex *data, cufftComplex *dataOut, int num)
{
    /* Declared and initialised before any goto so no jump crosses an
     * initialisation (nvcc compiles this file as C++). */
    cufftComplex *d_data = NULL;
    cufftHandle plan;
    int havePlan = 0;
    size_t bytes = 0;
    cudaError_t cudaStatus;
    cufftResult fftStatus;

    if( num <= 0 )
    {
        fprintf( stderr, "runfft: invalid transform size %d\n", num );
        return dataOut;
    }

    // Trivial transform: the output equals the input
    if( num == 1 )
    {
        dataOut[0] = data[0];
        return dataOut;
    }

    bytes = sizeof(cufftComplex) * (size_t)num;

    // Allocate device memory for data
    cudaStatus = cudaMalloc( (void **)&d_data, bytes );
    if( cudaStatus != cudaSuccess )
    {
        fprintf( stderr, "runfft: cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus) );
        d_data = NULL;
        goto cleanup;
    }

    // Copy host memory to device
    cudaStatus = cudaMemcpy(d_data, data, bytes, cudaMemcpyHostToDevice);
    if( cudaStatus != cudaSuccess )
    {
        fprintf( stderr, "runfft: host-to-device copy failed: %s\n", cudaGetErrorString(cudaStatus) );
        goto cleanup;
    }

    // CUFFT plan
    fftStatus = cufftPlan1d(&plan, num, CUFFT_C2C, 1);
    if( fftStatus != CUFFT_SUCCESS )
    {
        fprintf( stderr, "runfft: cufftPlan1d failed for %d points (cufftResult %d)\n", num, (int)fftStatus );
        goto cleanup;
    }
    havePlan = 1;

    // FFT execution (in place on the device buffer)
    fftStatus = cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
    if( fftStatus != CUFFT_SUCCESS )
    {
        fprintf( stderr, "runfft: cufftExecC2C failed (cufftResult %d)\n", (int)fftStatus );
        goto cleanup;
    }

    // Copy result to host
    cudaStatus = cudaMemcpy(dataOut, d_data, bytes, cudaMemcpyDeviceToHost);
    if( cudaStatus != cudaSuccess )
    {
        fprintf( stderr, "runfft: device-to-host copy failed: %s\n", cudaGetErrorString(cudaStatus) );
    }

cleanup:
    // Clear device memory; only destroy a plan that was actually created
    if( havePlan )
    {
        cufftDestroy(plan);
    }
    if( d_data != NULL )
    {
        cudaFree(d_data);
    }

    return dataOut;
}

/*
 * Computes the forward 1-D complex-to-complex FFT of `in` into `out` with
 * single-precision FFTW.
 *
 * num: number of complex points in the transform.
 * in:  input array of `num` complex values.
 * out: output array with room for `num` complex values.
 *
 * A plan is created, executed once and destroyed on every call. If FFTW
 * cannot create the plan, a diagnostic is written to stderr and `out` is
 * left untouched instead of passing a NULL plan to fftwf_execute.
 */
void runFFTWf(int num, fftwf_complex *in, fftwf_complex *out)
{
    fftwf_plan plan = fftwf_plan_dft_1d(num, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

    if( plan == NULL )
    {
        fprintf( stderr, "runFFTWf: could not create an FFTW plan for %d points\n", num );
        return;
    }

    fftwf_execute(plan);

    fftwf_destroy_plan(plan);
}