2D CUFFT wrong result

You’re getting tripped up by CUFFT symmetry.

CUFFT R2C and C2R transforms exploit (complex conjugate, i.e. hermitian) symmetry (not the same as a hermitian matrix) in the complex data to reduce the amount of data required/produced. Although you don’t show your print function, it’s evident from your printout that you’re not taking this into account.

A simpler alternative is to use CUFFT C2C transforms instead.

If you wish to use R2C and/or C2R, you’ll need to take into account the symmetry patterns that CUFFT uses. For a 1D case, it is pretty straightforward and is described here:

http://docs.nvidia.com/cuda/cufft/index.html#data-layout

For a 2D case, the 2nd dimension (only) is reduced due to symmetry. The reduction is to a size of N/2 + 1, so 2nd dimension of 4 will only have 3 columns of output:

http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional

note that the symmetry in this case is in each row.

The following modification of your code demonstrates proper handling of data in either the C2C case or the R2C case, just add -DUSE_C2C to your compile command line to see the C2C behavior:

$ cat t734.cu
#include <cufft.h>
#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

void Print2DComplex(int rows, int cols, cufftComplex *data, bool cufft_symmetry = false){

        int sym_cols = cols;
        if (cufft_symmetry) sym_cols = cols/2 + 1;
        printf("Real Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", data[i*sym_cols+(cols-j)].x);
            else
              printf("%f ", data[i*sym_cols+j].x);
          printf("\n");}
        printf("Imag Part: \n");
        for (int i = 0; i < rows; i++){
          for (int j = 0; j < cols; j++)
            if (j>=sym_cols)
              printf("%f ", -data[i*sym_cols+(cols-j)].y); // complex (hermitian) symmetry
            else
              printf("%f ", data[i*sym_cols+j].y);
          printf("\n");}
}

int main(){

        const int row = 4;
        const int col = 4;
        float A[row][col] =
        {{ 1, 2, 3, 4},
         { 5, 6, 7, 8},
         { 9,10,11,12},
         {13,14,15,16}};
        cufftHandle plan;
        cufftReal *d_idata;
        cufftComplex *h_odata, *d_odata;
        cudaMalloc((void**)&d_idata, sizeof(cufftComplex)*row*col);
        cudaMalloc((void**)&d_odata, sizeof(cufftComplex)*row*col);
        cudaMemset(d_idata, 0, sizeof(cufftComplex)*row*col);
        cudaMemset(d_odata, 0, sizeof(cufftComplex)*row*col);
        cudaMallocHost((void**)&h_odata, sizeof(cufftComplex)*row*col);
        bool symmetric_data = false;
#ifdef USE_C2C
        cudaMemcpy2D(d_idata, sizeof(cufftComplex), A, sizeof(float), sizeof(float), row*col, cudaMemcpyHostToDevice);
        if ((cufftPlan2d(&plan, row,col, CUFFT_C2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); return -1;}
        if ((cufftExecC2C(plan, (cufftComplex*)d_idata, (cufftComplex*)d_odata, CUFFT_FORWARD))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); return -1 ;}
#else
        cudaMemcpy(d_idata,A,sizeof(cufftReal)*row*col,cudaMemcpyHostToDevice);
        if ((cufftPlan2d(&plan, row,col, CUFFT_R2C))!= CUFFT_SUCCESS) {printf("cufft plan error\n"); return -1;}
        if ((cufftExecR2C(plan, (cufftReal*)d_idata, (cufftComplex*)d_odata))!=CUFFT_SUCCESS) {printf("cufft exec error\n"); return -1 ;}
        symmetric_data = true;
#endif
        cudaMemcpy(h_odata,d_odata,sizeof(cufftComplex)*row*col,cudaMemcpyDeviceToHost);
        cudaCheckErrors("some error");
        Print2DComplex(row,col,h_odata, symmetric_data);
        return 0;
}
$ nvcc -o t734 t734.cu -lcufft
$ ./t734
Real Part:
136.000000 -8.000005 -8.000000 -8.000005
-32.000000 0.000000 -0.000001 0.000000
-32.000000 -0.000000 0.000000 -0.000000
-32.000000 0.000000 -0.000001 0.000000
Imag Part:
-0.000001 8.000001 0.000001 -8.000001
32.000000 -0.000000 0.000001 0.000000
0.000000 0.000000 -0.000000 -0.000000
-32.000000 0.000000 -0.000001 -0.000000
$ nvcc -DUSE_C2C -o t734 t734.cu -lcufft
$ ./t734
Real Part:
136.000000 -8.000000 -8.000000 -8.000000
-32.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
Imag Part:
0.000000 8.000000 0.000000 -8.000000
32.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
-32.000000 0.000000 0.000000 0.000000
$

If you don’t like the “complexity” associated with R2C, I suggest you just use C2C transforms.
However, in many cases, from a cufft execution standpoint, the R2C transform will be faster than the corresponding (real-to-complex) C2C transform.