I have a problem with a project I am working on that requires me to perform 540 FFTs of 12288 elements apiece. I think the CUFFT documentation is quite straightforward, but I can't seem to get anywhere near the results that I am expecting. All of my data is real, so I am performing an in-place real-to-complex transform. If you have a card and Linux, and wouldn't mind running this code and pasting your output, I would be much obliged.
I run it for a size of 1 x 12288 and a size of 540 x 12288, so that I can compare a batch size of 1 against a batch size of 540 and see the speedup.
Also, if you see that I am doing something foolish, I would really like to hear about that too.
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <cufft.h>
#define NX 12288
#define NUM_FFT_RUNS 6
#define PRINT_EACH_RUN 0
/*
 * Benchmark for batched, in-place, real-to-complex 1D CUFFT transforms.
 *
 * Prints the properties of every CUDA device, then for batch sizes 1 and 540
 * runs NUM_FFT_RUNS transforms of NX real points each and reports the average
 * wall-clock time of the cufftExecR2C call alone (host<->device copies are
 * kept outside the timed region).
 *
 * Returns 0 on success and 1 if a host allocation fails. CUDA, CUFFT and
 * timer calls are wrapped in the *_SAFE_CALL macros from cutil.h
 * (NOTE(review): confirm these macros are active in the build configuration
 * you use; if they are compiled out, failures will go unnoticed).
 */
int main(int argc, char ** argv)
{
    // Get and print device info
    int deviceCount;
    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf(" Major revision number: %d\n",
               deviceProp.major);
        printf(" Minor revision number: %d\n",
               deviceProp.minor);
        // totalGlobalMem is a size_t, so it must not be printed with %d.
        printf(" Total amount of global memory: %lu bytes\n",
               (unsigned long)deviceProp.totalGlobalMem);
        printf(" Clock rate: %d kilohertz\n",
               deviceProp.clockRate);
    }
    printf("Time for CUFFT represents only the Kernel execution\n");

    // An in-place R2C transform writes NX/2+1 complex values per signal, so
    // every signal needs 2*(NX/2+1) floats of storage: NX real samples
    // followed by padding. Consecutive signals in a batch start that many
    // floats apart.
    const size_t complexPerSignal = NX / 2 + 1;
    const size_t floatsPerSignal  = 2 * complexPerSignal;

    int batchSize;
    // Run both batch of size 1 and batch of size 540
    for (batchSize = 1; batchSize < 541; batchSize *= 540)
    {
        printf("\nBatch Size is %i | ", batchSize);
        printf("NX size is %i\n", NX);

        // Size in bytes of the padded buffer; identical on host and device.
        const size_t bufferBytes =
            sizeof(cufftComplex) * complexPerSignal * (size_t)batchSize;

        // Initialize CUFFT PLAN
        cufftHandle plan;
        CUFFT_SAFE_CALL(cufftPlan1d(&plan, NX, CUFFT_R2C, batchSize));

        // Host buffers: data_host receives the spectrum, in holds the padded
        // real input (calloc zero-fills the padding floats).
        cufftComplex *data_host = (cufftComplex*)malloc(bufferBytes);
        float *in = (float*)calloc(floatsPerSignal * (size_t)batchSize,
                                   sizeof(float));
        if (data_host == NULL || in == NULL)
        {
            fprintf(stderr, "Host allocation failed for batch size %i\n",
                    batchSize);
            free(in);
            free(data_host);
            cufftDestroy(plan);
            return 1;
        }

        // Create non-trivial input: NX random samples at the start of each
        // padded signal slot.
        for (int b = 0; b < batchSize; b++)
        {
            float *signal = in + (size_t)b * floatsPerSignal;
            for (int i = 0; i < NX; i++)
            {
                signal[i] = (float)rand()/(RAND_MAX);
            }
        }

        cufftComplex *data;
        CUDA_SAFE_CALL(cudaMalloc((void**)&data, bufferBytes));

        unsigned int timer;
        CUT_SAFE_CALL(cutCreateTimer(&timer));
        float averagecufft = 0;
        int x;
        for (x = 0; x < NUM_FFT_RUNS; x++)
        {
            // The transform is in place, so the input must be re-uploaded
            // before every run.
            CUDA_SAFE_CALL(cudaMemcpy(data, in, bufferBytes,
                                      cudaMemcpyHostToDevice));

            // TIME CUFFT: synchronize before starting and before stopping the
            // host timer so that only the transform is measured.
            CUDA_SAFE_CALL(cudaThreadSynchronize());
            CUT_SAFE_CALL(cutStartTimer(timer));
            CUFFT_SAFE_CALL(cufftExecR2C(plan, (cufftReal*)data, data));
            CUDA_SAFE_CALL(cudaThreadSynchronize());
            CUT_SAFE_CALL(cutStopTimer(timer));

            CUDA_SAFE_CALL(cudaMemcpy(data_host, data, bufferBytes,
                                      cudaMemcpyDeviceToHost));
            averagecufft += cutGetTimerValue(timer);
#if PRINT_EACH_RUN
            printf("CUFFT run %i: %i x %i %f\n",x, batchSize, NX, cutGetTimerValue(timer) );
#endif
            CUT_SAFE_CALL(cutResetTimer(timer));
        }
        printf("Average CUFFT over %i executions of %i x %i : %fms\n", NUM_FFT_RUNS,
               batchSize, NX, averagecufft/NUM_FFT_RUNS);

        // Release everything acquired for this batch size.
        CUT_SAFE_CALL(cutDeleteTimer(timer));
        CUDA_SAFE_CALL(cudaFree(data));
        CUFFT_SAFE_CALL(cufftDestroy(plan));
        free(in);
        free(data_host);
    }
    return 0;
}
This is the result that I have been getting, when I run it.
Device 0: "GeForce 8800 GTX"
Major revision number: 1
Minor revision number: 0
Total amount of global memory: 804585472 bytes
Clock rate: 1350000 kilohertz
Time for CUFFT represents only the Kernel execution
Batch Size is 1 | NX size is 12288
Segmentation fault
[mczapar2@tipquad02 release]$ ./cufftonly
Device 0: "GeForce 8800 GTX"
Major revision number: 1
Minor revision number: 0
Total amount of global memory: 804585472 bytes
Clock rate: 1350000 kilohertz
Time for CUFFT represents only the Kernel execution
Batch Size is 1 | NX size is 12288
Average CUFFT over 6 executions of 1 x 12288 : 0.144667ms
Batch Size is 540 | NX size is 12288
Average CUFFT over 6 executions of 540 x 12288 : 77.663498ms