Hi, I have this function to benchmark my FFT:

```
// Benchmark driver: runs a 1-D complex-to-complex forward cuFFT of size `len`
// over the samples in `src`, repeated 100000 times on the device.
//
// src : host array of `len` integer samples; each sample becomes the real
//       part of one complex input value (the imaginary part is set to 0).
// len : number of samples, also used as the FFT size. A null `src` or a
//       non-positive `len` makes the function return without doing any work.
//
// The transform output stays on the device and is discarded: the magnitude
// kernel and the copy back into `src` are currently commented out.
extern "C" void cuFFT(int *src,int len){
    CUT_DEVICE_INIT();

    // Nothing to transform; also avoids planning a zero/negative-size FFT.
    if(!src || len<=0){
        return;
    }

    const size_t complexBytes = (size_t)len*sizeof(cufftComplex);
    const int iterations = 100000;

    // Page-locked host staging buffer. The allocation is checked so a failure
    // cannot lead to writes through an uninitialized pointer below.
    cufftComplex *srcHost;
    CUDA_SAFE_CALL(cudaMallocHost((void **)&srcHost, complexBytes));
    for(int i=0;i<len;i++){
        srcHost[i].x=src[i];
        srcHost[i].y=0;
    }

    cufftHandle plan;
    CUFFT_SAFE_CALL(cufftPlan1d(&plan, len, CUFFT_C2C, 1));

    cufftComplex *srcDevice;
    CUDA_SAFE_CALL(cudaMalloc((void**)&srcDevice, complexBytes));
    cufftComplex *dstD;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dstD, complexBytes));
    // Output buffer for the (currently disabled) magnitude kernel.
    int *dstDevice;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dstDevice, len*sizeof(int)));

    CUDA_SAFE_CALL(cudaMemcpy(srcDevice, srcHost, complexBytes, cudaMemcpyHostToDevice));

    // Launch configuration for the disabled cuMag kernel below.
    // NOTE(review): len/N_THREADS truncates, so a len that is not a multiple
    // of N_THREADS would leave the tail uncovered -- confirm how cuMag handles
    // this before re-enabling the launch.
    dim3 dimBlock(N_THREADS);
    dim3 dimGrid(len/N_THREADS);

    for(int i=0;i<iterations;i++){
        CUFFT_SAFE_CALL(cufftExecC2C(plan, srcDevice, dstD,CUFFT_FORWARD));
        //cuMag<<<dimGrid,dimBlock>>>(dstD,dstDevice,len);
    }
    // Wait for every queued transform so an execution error is reported here
    // rather than surfacing during teardown.
    CUDA_SAFE_CALL(cudaThreadSynchronize());

    //CUDA_SAFE_CALL(cudaMemcpy(src, dstDevice, len*sizeof(int), cudaMemcpyDeviceToHost));

    CUFFT_SAFE_CALL(cufftDestroy(plan));
    CUDA_SAFE_CALL(cudaFree(dstDevice));
    CUDA_SAFE_CALL(cudaFree(dstD));
    CUDA_SAFE_CALL(cudaFree(srcDevice));
    CUDA_SAFE_CALL(cudaFreeHost(srcHost));
}
```

This takes more or less the same amount of time as the Intel IPP FFT function, also run in a 100000-iteration loop (39 seconds). If I also have it calculate the magnitude, it takes 3 or 4 seconds more, which is even worse, considering the IPP function is already calculating the magnitude…

Is it normal for the cuFFT function to take as much time as IPP? If not, what am I doing wrong? Is there anything I can optimize? The loop contains only the FFT exec call, so I'm lost as to what to optimize…

I still have to optimize the magnitude function, so I'd be much happier if the FFT time were better than IPP's; otherwise my study results would be disappointing :(