cufft question

I am doing a simple Complex to Complex FFT, but I get all sort of errors and I am not sure why.

[codebox]// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

float frand (void)

{

float value;

value = ((float) rand()/(RAND_MAX));

return value;

}

global void conv_real2complex(float *a, cufftComplex *c, int nx, int ny)

{

int i = blockIdx.x * blockDim.x + threadIdx.x;

int j = blockIdx.y * blockDim.y + threadIdx.y;



if(i<nx && j<ny)

{

	c[i*nx+j].x = a[i*nx+j];

	c[i*nx+j].y = 0;

}

}

// Complex data type

typedef float2 Complex;

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char *msg);

// Main program

int main (int argc, char**argv)

{

clock_t start = clock();

int i;

float *zone1, *zone1_d;

cufftComplex *in1, *in1_d, *in2, *in2_d;	

int j;

int nx = 64;

int ny = 64;

int nyh = (ny/2) + 1;;

unsigned int seed = 123456789;



// malloc arrays on host

cudaMallocHost((void**) &in1, sizeof(cufftComplex) * nx * ny);

cudaMallocHost((void**) &in2, sizeof(cufftComplex) * nx * ny);

cudaMallocHost((void**) &zone1, sizeof(float) * nx * ny);



// allocate arrays on device

Complex *out, *out_d;



cudaMalloc((void**) &in1_d, sizeof(cufftComplex) * nx * ny);

cudaMalloc((void**) &out_d, sizeof(cufftComplex) * nx * ny);

cudaMallocHost((void**) &out, sizeof(cufftComplex) * nx * ny); // on Host



srand(seed);



for ( i = 0; i < nx; i++ )

  {

    for ( j = 0; j < ny; j++ )

    {

      zone1[i*ny+j] = frand ( );

    }

  }



// print input host data --debug

printf ( "\n" );

printf ( "  Input Data:\n" );

printf ( "\n" );

for ( i = 0; i < nx; i++ )

 {

   for ( j = 0; j < ny; j++ )

   {

     printf ( "  %4d  %4d  %12f\n", i, j, zone1[i*ny+j] );

   }

 }



cudaMalloc((void**) &zone1_d, sizeof(float) * nx * ny);

cudaMemcpy(zone1_d, zone1, sizeof(cufftReal) * nx * ny, cudaMemcpyHostToDevice);



dim3 grid(32,16);

dim3 block(2,4);

conv_real2complex <<< grid, block >>> (zone1, in1_d, nx, ny);



// create plan for CUDA FFT

cufftHandle plan_forward;

cufftPlan2d(&plan_forward, nx, ny, CUFFT_C2C);



cufftExecC2C(plan_forward, in1_d, out_d, CUFFT_FORWARD);



// recreate input array	

cudaMalloc((void **)&in2_d, sizeof(cufftComplex) * nx * ny);



// backward plan

cufftHandle plan_backward;

cufftPlan2d(&plan_backward, nx, ny, CUFFT_C2C);



cufftExecC2C(plan_backward, (cufftComplex *)out_d, (cufftComplex *)in2_d, CUFFT_INVERSE);

// copy arrays to host

cudaMemcpy(out, out_d, sizeof(cufftComplex) * nx * nyh, cudaMemcpyDeviceToHost);

cudaMemcpy(in2, in2_d, sizeof(cufftComplex) * nx * ny, cudaMemcpyDeviceToHost);

	

// Check for any CUDA errors

checkCUDAError("cudaMemcpy calls");



// Print FFT output --debug

printf ( "\n" );

printf ( "  Output FFT Coefficients:\n" );

printf ( "\n" );

for (i = 0; i < nx * ny; i++)

{

    printf ("  %4d  %12f  %12f\n", i, out[i].x, out[i].y);

}



// Print recovered inverse FFT input --debug

printf ( "\n" );

printf ( "  Recovered input data divided by NX * NY:\n" );

printf ( "\n" );

for (i = 0; i < nx * ny; i++)

{

    printf ("  %4d  %12f  %12f\n", i, in2[i].x, in2[i].y);

}

// Free up allocated memory

cufftDestroy(plan_forward);

cufftDestroy(plan_backward);



cudaFree(in1_d);

cudaFree(in2_d);

cudaFree(out_d);



// Check for any CUDA errors

checkCUDAError("cudaFree");



free(in1);

free(in2);

free(out);

// Check elapsed time

printf("\n\n Elapsed time = %f\n\n", ((float)clock()-start)/CLOCKS_PER_SEC);

return 0;

}

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err) 

{

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

    exit(-1);

}                         

}[/codebox]

This compiles perfectly in emulation mode. However this fails on the device.

I see this error on device:

[codebox]

running and generating the output file

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331

cufft: ERROR: CUFFT_ALLOC_FAILED

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 147

cufft: ERROR: CUFFT_INVALID_PLAN

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331

cufft: ERROR: CUFFT_ALLOC_FAILED

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 147

cufft: ERROR: CUFFT_INVALID_PLAN

Cuda error: cudaMemcpy calls: invalid device pointer. [/codebox]

I did a gdb on the executable and saw that the code fails at the

[codebox]Breakpoint 1, main (argc=1, argv=0x7fffbe4e5e98) at cufft_ctc.cu:113

113 conv_real2complex <<< grid, block >>> (zone1, in1_d, nx, ny);

(gdb) s

__device_stub__Z17conv_real2complexPfP6float2ii (__par0=0x2b91eda03000, __par1=0x3a90000, __par2=64, __par3=64)

at /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c:13

13 /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c: No such file or directory.

in /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c

(gdb) s

main (argc=1, argv=0x7fffbe4e5e98) at cufft_ctc.cu:117

117 cufftPlan2d(&plan_forward, nx, ny, CUFFT_C2C);

(gdb) s

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331

cufft: ERROR: CUFFT_ALLOC_FAILED

119 cufftExecC2C(plan_forward, in1_d, out_d, CUFFT_FORWARD);

[/codebox]

The conv_real2complex function is same as mfatica’s presentation slides at SC08. Has someone seen this error?

Thanks in advance.

From a quick scan:

  1. You are passing the wrong pointer zone1 instead of zone1_d

  2. (not an error) Following your naming scheme, the call to the kernel should be <<< block, grid>>> not <<<grid, block>>> if you were planning to have 512 threads

  3. (not an error ) there is no need to define a forward and a backward plan, the direction is specified in the execution

Passing zone1_d makes the program compile on the device.

When I run a gdb on the executable, I see this:

[codebox]

Breakpoint 1, main (argc=1, argv=0x7fff2f00f9c8) at cufft_ctc.cu:113

113 conv_real2complex <<< block, grid >>> (zone1_d, in1_d, nx, ny);

(gdb) s

__device_stub__Z17conv_real2complexPfP6float2ii (__par0=0x3ab0000, __par1=0x3a90000, __par2=64, __par3=64)

at /tmp/tmpxft_000052af_00000000-1_cufft_ctc.cudafe1.stub.c:13

13 /tmp/tmpxft_000052af_00000000-1_cufft_ctc.cudafe1.stub.c: No such file or directory.

in /tmp/tmpxft_000052af_00000000-1_cufft_ctc.cudafe1.stub.c

[/codebox]

What does that mean? How do we know if something is wrong with the kernel ??

Also, Thanks for your prompt reply to my previous question.

I am doing 2 FFT computations in the same program.

[codebox]cutResetTimer(timer);

    cutStartTimer(timer);

// create plan for CUDA FFT

cufftHandle plan_forward1;

cufftResult status1 = CUFFT_SAFE_CALL(cufftPlan2d(&plan_forward1, Nx, Ny, CUFFT_C2C));

if (status1 != CUFFT_SUCCESS)

{

	printf("Error creating forward FFT plan!\n");

} 



CUFFT_SAFE_CALL(cufftExecC2C(plan_forward1, in1_d, out1_d, CUFFT_FORWARD));

cudaThreadSynchronize();



cutStopTimer( timer );  // Stop timer

optimizedTime = cutGetTimerValue(timer);

printf("Optimized average FFT1 computation time: %0.3f ms\n\n", optimizedTime);



//Destroy CUFFT context

    CUFFT_SAFE_CALL(cufftDestroy(plan_forward1));



//FFT2

cutResetTimer(timer);

    cutStartTimer(timer);

// create plan for CUDA FFT

cufftHandle plan_forward2;

cufftResult status2 = CUFFT_SAFE_CALL(cufftPlan2d(&plan_forward2, Nx, Ny, CUFFT_C2C));

if (status2 != CUFFT_SUCCESS)

{

	printf("Error creating forward FFT plan!\n");

} 



CUFFT_SAFE_CALL(cufftExecC2C(plan_forward2, in2_d, out2_d, CUFFT_FORWARD));

cudaThreadSynchronize();



cutStopTimer( timer );  // Stop timer

optimizedTime = cutGetTimerValue(timer);

printf("Optimized average FFT2 computation time: %0.3f ms\n\n", optimizedTime);

//Destroy CUFFT context

    CUFFT_SAFE_CALL(cufftDestroy(plan_forward2)); [/codebox]

My gdb output:

404 CUFFT_SAFE_CALL(cufftExecC2C(plan_forward1, in1_d, out1_d, CUFFT_FORWARD));

(gdb) s

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/execute.cu, line 1070

cufft: ERROR: CUFFT_EXEC_FAILED

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/execute.cu, line 316

cufft: ERROR: CUFFT_EXEC_FAILED

cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 151

cufft: ERROR: CUFFT_EXEC_FAILED

Seems like the second FFT computation does not show errors. Only the first FFT execution fails. Is there something extra to be done for multiple CUFFT computations?

Thanks

In CUDA, if we use the cufftComplex data type, how is the normalization done?
for double data type, you can do this: z1/(double)(nx * ny)

When I try to do z1.x/(cufftComplex)(nx * ny)
I get an error that no constructor for converting float2 to int.

Is there a workaround?

I think that the complex data type is just a renamed float2 struct. I don’t think there’s any operators or casts available for it. What are you trying to do there?

I have a set of cufftComplex values, I do a FFT and then a IFFT and I want to compare the output of the FFT with the input to the FFT.

IF it’s a double I can always compare using z1/(double)(nx * ny)
How do I do it for cufftComplex. Also these values are compared with values in a Matlab program that uses “double” values