CUFFT is giving strange results...

Hi.

I’m trying to replicate the convolutionFFT2D of the nvidia gpu computing sdk, but the convolution operation is giving me some strange results. Can anyone see anything strange in the code?

The input values are all ‘1’.

The output of the convolution is ‘nan’.

Here is the code:

inline __device__ void mulAndScale(double2& a, const double2& b,

                                    const double& c)

{

    double2 t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};

    a = t;

}

__global__ void modulateAndNormalize_kernel( double2 *d_Dst,

                                            double2 *d_Src,

                                            int dataSize,

                                            double c)

{

   const int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i >= dataSize)

        return;

double2 a = d_Src[i];

    double2 b = d_Dst[i];

mulAndScale(a, b, c);

d_Dst[i] = a;

}

//Round a / b to nearest higher integer value

inline int iDivUp(int a, int b){

    return (a % b != 0) ? (a / b + 1) : (a / b);

}

void modulateAndNormalize(

    double2 *d_Dst,

    double2 *d_Src,

    int fftH,

    int fftW,

    int padding

){

    assert( fftW % 2 == 0 );

    const int dataSize = fftH * (fftW / 2 + padding);

modulateAndNormalize_kernel<<<iDivUp(dataSize, 256), 256>>>(

        d_Dst,

        d_Src,

        dataSize,

        1.0f / (double)(fftW * fftH)

    );

    //cutilCheckMsg("modulateAndNormalize() execution failed\n");

}

void k_CONV(double **data_pointers, int *args){

int N = args[0];

// create cufft plan

    cufftHandle fftPlanFwdX, fftPlanFwdY ,fftPlanInv;

    cufftPlan2d(&fftPlanFwdX, N, N, CUFFT_D2Z);

    cufftPlan2d(&fftPlanFwdY, N, N, CUFFT_D2Z);

    cufftPlan2d(&fftPlanInv, N, N, CUFFT_Z2D);

// allocate space on device (input x2, spectum x2)

    printf("Allocating data on device\n");

    // input data

    double *d_InputSignalX;

    double *d_InputSignalY;

    cudaMalloc((void **)&d_InputSignalX, N * N  * sizeof(double));

    cudaMalloc((void **)&d_InputSignalY, N * N  * sizeof(double));

// signal spectrum data

    double2 *spectrumX,

            *spectrumY;

    cudaMalloc((void **)&spectrumX, N * (N/2 + 1)  * sizeof(double2));

    cudaMalloc((void **)&spectrumY, N * (N/2 + 1)  * sizeof(double2));

// copy data to device

    printf("Copying data to device\n");

    cudaMemcpy(d_InputSignalX, data_pointers[0], N * sizeof(double), cudaMemcpyHostToDevice );

    cudaMemcpy(d_InputSignalY, data_pointers[1], N * sizeof(double), cudaMemcpyHostToDevice );

printf("Forward fft\n");

// call r2c cufft

    cufftExecD2Z(fftPlanFwdX, (cufftDoubleReal *)d_InputSignalX, (cufftDoubleComplex *)spectrumX);

    cufftExecD2Z(fftPlanFwdY, (cufftDoubleReal *)d_InputSignalY, (cufftDoubleComplex *)spectrumY);

// call modulateAndNormalize

    printf("Modulate and normalize\n");

    modulateAndNormalize(spectrumX, spectrumY, N, N, 1);

// call c2r

    printf("Inverse fft\n");

    cufftExecZ2D(fftPlanInv, (cufftDoubleComplex *)spectrumX, (cufftDoubleReal *)d_InputSignalX);

// copy data back to host

    printf("Copying result to host\n");

    cudaMemcpy(data_pointers[0], d_InputSignalX, N * N * sizeof(double), cudaMemcpyDeviceToHost);

printf("Cleaning plans\n");

// destroy plans

    cufftDestroy(fftPlanFwdX);

    cufftDestroy(fftPlanFwdY);

    cufftDestroy(fftPlanInv);

printf("Cleaning memory\n");

// free allocated memory

    cudaFree(spectrumX);

    cudaFree(spectrumY);

    cudaFree(d_InputSignalX);

    cudaFree(d_InputSignalY);

}

Thanks.