Hello!
When I apply in-place 2D real-to-complex FFT I get wrong results. Out-of-place version of the same routine gives the same results as FFTW. Is that a bug?
I use the following code:
void CuFFTDirect(cufftComplex *m, cufftComplex *out, int size1, int size2)
{
CUT_DEVICE_INIT();
unsigned int mem_size_in=sizeof(float)*size1*size2;
unsigned int mem_size_out=sizeof(cufftComplex)*size1*(size2/2 + 1);
cufftComplex* d_fft;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_fft, mem_size_out));
CUDA_SAFE_CALL(cudaMemcpy(d_fft, m, mem_size_in, cudaMemcpyHostToDevice));
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan2d(&plan, size1, size2, CUFFT_R2C));
CUFFT_SAFE_CALL(cufftExecR2C(plan, (float*)d_fft, d_fft));
CUDA_SAFE_CALL(cudaMemcpy(out, d_fft, mem_size_out, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_fft));
cufftDestroy(plan);
}