Wrong results in CUFFT!

Hi.

I’m tring to use CUFFT to compute the complex fourier transform of some data, but the results are wrong.

Can anyone see a problem on the following code?

void k_CUDA_FFT(float **data_pointers, int* args){

int N = args[0];

// allocate space on device

    float2* d_InputSignal;

    cudaMalloc((void**)&d_InputSignal, N*sizeof(float2));

// copy data from host to device

    float2* h_InputSignal = new float2[N];

    for(int i=0; i< N; i++){

        h_InputSignal[i].x=data_pointers[0][i];

        h_InputSignal[i].y=data_pointers[1][i];

    }

    cudaMemcpy(d_InputSignal, h_InputSignal,N*sizeof(float2), cudaMemcpyHostToDevice );

// CUFFT plan

    cufftHandle plan;

    int size = sizeof(float2) * N;

    cufftPlan1d(&plan, size, CUFFT_C2C, 1);

/* executes FFT processes */

    if(args[1]==1)

        cufftExecC2C(plan, (cufftComplex *)d_InputSignal, (cufftComplex *)d_InputSignal, CUFFT_FORWARD);

    //else

    //    cufftExecC2C(plan, (cufftComplex *)d_InputSignal, (cufftComplex *)d_InputSignal, CUFFT_INVERSE);

// copy data from device to host

    cudaMemcpy(h_InputSignal,d_InputSignal,N*sizeof(float2),cudaMemcpyDeviceToHost);

    for(int i=0; i< N; i++){

       data_pointers[0][i] = h_InputSignal[i].x;

       data_pointers[1][i] = h_InputSignal[i].y;

    }

cufftDestroy(plan);

    cudaFree(h_InputSignal);

    cudaFree(d_InputSignal);

}

Input:

REAL:0.841471 0.909297 0.141120 -0.756802 -0.958924 -0.279415 0.656987 0.989358

IMAG:0.841471 0.909297 0.141120 -0.756802 -0.958924 -0.279415 0.656987 0.989358

The correct result should be :

REAL: 1.543091 0.799255 0.427607 0.136506 -0.181784 -0.686254 -2.258727 6.952074

IMAG: 0.332754 0.552790 0.662729 0.748840 0.842995 0.992223 1.457381 -1.267293

The wrong result is:

REAL: 1.543091 1.803324 1.575109 1.012566 0.405025 0.030154 0.028103 0.350289

IMAG: 0.332754 -0.238005 -0.809705 -1.081805 -0.935108 -0.462346 0.098292 0.488494

Thanks

Hi again!

The problem is in “cufftPlan1d(&plan, size, CUFFT_C2C, 1);”.

Size should be the number of points of the FFT.

Thanks

The problem should be in your call:
cufftPlan1d(&plan, size, CUFFT_C2C, 1);

size of the transform is in element, you are passing the number of bytes.
Try:

cufftPlan1d(&plan, N, CUFFT_C2C, 1);

Now i have a problem with this code:

int N = args[0];

// allocate space on device

    float2* d_InputSignal;

    printf("Before cudaMalloc call\n");fflush(stdout);

    cudaMalloc((void**)&d_InputSignal, N*N*sizeof(float2));

// copy data from host to device

    float2* h_InputSignal = new float2[N];

    for(int i=0; i< N*N; i++){

        h_InputSignal[i].x=data_pointers[0][i];

        h_InputSignal[i].y=data_pointers[1][i];

    }

    printf("Before cudaMemcpy call\n");fflush(stdout);

    cudaMemcpy(d_InputSignal, h_InputSignal,N*N*sizeof(float2), cudaMemcpyHostToDevice );

// CUFFT plan

    cufftHandle plan;

    int size = N;

    printf("Before cudaPlan call\n");fflush(stdout);

    cufftPlan2d(&plan, size, size, CUFFT_C2C);

printf("cudaPlane return\n");fflush(stdout);

/* executes FFT processes */

    printf("Before cufft call\n");fflush(stdout);

    if(args[1]==1){

        cufftExecC2C(plan, (cufftComplex *)d_InputSignal, (cufftComplex *)d_InputSignal, CUFFT_FORWARD);

    }

    else{

        cufftExecC2C(plan, (cufftComplex *)d_InputSignal, (cufftComplex *)d_InputSignal, CUFFT_INVERSE);

}

    // copy data from device to host

    cudaMemcpy(h_InputSignal,d_InputSignal,N*N*sizeof(float2),cudaMemcpyDeviceToHost);

    for(int i=0; i< N*N; i++){

       data_pointers[0][i] = h_InputSignal[i].x;

       data_pointers[1][i] = h_InputSignal[i].y;

    }

It gives an error in cudaPlan2d:

Before cudaPlan call

cudaFft: malloc.c:3096: sYSMALLOc: Assertion `(old_top == (((mbinptr) (((char *) &((av)->bins[((1) - 1) * 2])) - __builtin_offsetof (struct malloc_chunk, fd)))) && old_size == 0) || ((unsigned long) (old_size) >= (unsigned long)((((__builtin_offsetof (struct malloc_chunk, fd_nextsize))+((2 * (sizeof(size_t))) - 1)) & ~((2 * (sizeof(size_t))) - 1))) && ((old_top)->size & 0x1) && ((unsigned long)old_end & pagemask) == 0)' failed.

The problem was here:

float2* h_InputSignal = new float2[N];

It should be

float2* h_InputSignal = new float2[N*N];