Buggy CUFFT inplace 2D R2C?

Hello!

When I apply in-place 2D real-to-complex FFT I get wrong results. Out-of-place version of the same routine gives the same results as FFTW. Is that a bug?

I use the following code:

void CuFFTDirect(cufftComplex *m, cufftComplex *out, int size1, int size2)

{

	CUT_DEVICE_INIT();

	unsigned int mem_size_in=sizeof(float)*size1*size2;

	unsigned int mem_size_out=sizeof(cufftComplex)*size1*(size2/2 + 1);

	cufftComplex* d_fft;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_fft, mem_size_out));

	CUDA_SAFE_CALL(cudaMemcpy(d_fft, m, mem_size_in, cudaMemcpyHostToDevice));

	cufftHandle plan;

	CUFFT_SAFE_CALL(cufftPlan2d(&plan, size1, size2, CUFFT_R2C));

	CUFFT_SAFE_CALL(cufftExecR2C(plan, (float*)d_fft, d_fft));

	CUDA_SAFE_CALL(cudaMemcpy(out, d_fft, mem_size_out, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_fft));

	cufftDestroy(plan);

}