Hello
I wrote the following code using the CUFFT library to compute the FFT of 256 numbers in 10 batches. I have the following questions in this regard:
1- Am I calculating the elapsed time for cufftExecC2C correctly (basically, I am using CUDA events)?
2- To calculate the execution time on the CPU, I am simply running the program in emulation mode (-deviceemu). Based on the timing method I am using, I observe no benefit from performing the FFT on the GPU. I checked my results by varying the number of batches from 10 to 6000, and the benefit I observe is almost negligible. Am I doing something wrong when calculating the timing?
Thanks in advance.
#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cuda.h>
/* Length of each 1D transform; passed to cufftPlan1d as the FFT size. */
#define NX 256
/* Number of transforms handled by one plan; passed to cufftPlan1d as the batch count. */
#define BATCH 10
/*
 * Evaluate a CUDA runtime call; on failure print the runtime's error string
 * to stderr and return 1 from the enclosing function.  Only usable inside a
 * function returning int (here: main).
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaStatus_ = (call);                                   \
        if (cudaStatus_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,        \
                    cudaGetErrorString(cudaStatus_));                       \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Same as CUDA_CHECK but for CUFFT calls, which report a cufftResult.
 * The numeric result code is printed.
 */
#define CUFFT_CHECK(call)                                                   \
    do {                                                                    \
        cufftResult cufftStatus_ = (call);                                  \
        if (cufftStatus_ != CUFFT_SUCCESS) {                                \
            fprintf(stderr, "CUFFT error at line %d: code %d\n", __LINE__,  \
                    (int)cufftStatus_);                                     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Runs BATCH in-place forward complex-to-complex FFTs of length NX on the
 * GPU and prints the time spent in cufftExecC2C as measured by CUDA events.
 *
 * Returns 0 on success, 1 if any CUDA runtime or CUFFT call fails (the
 * failing call is reported on stderr; resources are left for process
 * teardown to reclaim in that case).
 */
int main()
{
    /* Static storage keeps the host buffer off the stack, so raising BATCH
       to several thousand does not overflow it. */
    static cufftComplex data[NX * BATCH];
    const size_t bytes = sizeof(cufftComplex) * NX * BATCH;

    cufftHandle plan;
    cufftComplex *devPtr = NULL;
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    int i;

    /* source data creation */
    for (i = 0; i < NX * BATCH; i++) {
        data[i].x = 1.0f;
        data[i].y = 1.0f;
    }

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    /* GPU memory allocation */
    CUDA_CHECK(cudaMalloc((void**)&devPtr, bytes));

    /* transfer to GPU memory */
    CUDA_CHECK(cudaMemcpy(devPtr, data, bytes, cudaMemcpyHostToDevice));

    /* creates 1D FFT plan */
    CUFFT_CHECK(cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH));

    /* Timing: only the transform itself lies between the two events; the
       host<->device copies and plan creation are outside the interval. */
    CUDA_CHECK(cudaEventRecord(start, 0));

    /* executes FFT processes (in place) */
    CUFFT_CHECK(cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD));

    /* The stop event is recorded directly after the FFT is issued; waiting
       on that event is what blocks the host until the FFT has finished.  No
       separate device-wide synchronize is placed before the record, because
       its host-side wait would be counted in the measured interval. */
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Processing time=%f(ms)\n", elapsedTime);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    /* transfer results from GPU memory */
    CUDA_CHECK(cudaMemcpy(data, devPtr, bytes, cudaMemcpyDeviceToHost));

    /* deletes CUFFT plan */
    CUFFT_CHECK(cufftDestroy(plan));

    /* frees GPU memory */
    CUDA_CHECK(cudaFree(devPtr));

    /*for(i = 0; i < NX*BATCH; i++){
    printf("data[%d] %f %f\n", i, data[i].x, data[i].y);
    }*/
    return 0;
}