I am doing a simple Complex to Complex FFT, but I get all sort of errors and I am not sure why.
[codebox]// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
float frand (void)
{
float value;
value = ((float) rand()/(RAND_MAX));
return value;
}
global void conv_real2complex(float *a, cufftComplex *c, int nx, int ny)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i<nx && j<ny)
{
c[i*nx+j].x = a[i*nx+j];
c[i*nx+j].y = 0;
}
}
// Complex data type
typedef float2 Complex;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);
// Main program
int main (int argc, char**argv)
{
clock_t start = clock();
int i;
float *zone1, *zone1_d;
cufftComplex *in1, *in1_d, *in2, *in2_d;
int j;
int nx = 64;
int ny = 64;
int nyh = (ny/2) + 1;;
unsigned int seed = 123456789;
// malloc arrays on host
cudaMallocHost((void**) &in1, sizeof(cufftComplex) * nx * ny);
cudaMallocHost((void**) &in2, sizeof(cufftComplex) * nx * ny);
cudaMallocHost((void**) &zone1, sizeof(float) * nx * ny);
// allocate arrays on device
Complex *out, *out_d;
cudaMalloc((void**) &in1_d, sizeof(cufftComplex) * nx * ny);
cudaMalloc((void**) &out_d, sizeof(cufftComplex) * nx * ny);
cudaMallocHost((void**) &out, sizeof(cufftComplex) * nx * ny); // on Host
srand(seed);
for ( i = 0; i < nx; i++ )
{
for ( j = 0; j < ny; j++ )
{
zone1[i*ny+j] = frand ( );
}
}
// print input host data --debug
printf ( "\n" );
printf ( " Input Data:\n" );
printf ( "\n" );
for ( i = 0; i < nx; i++ )
{
for ( j = 0; j < ny; j++ )
{
printf ( " %4d %4d %12f\n", i, j, zone1[i*ny+j] );
}
}
cudaMalloc((void**) &zone1_d, sizeof(float) * nx * ny);
cudaMemcpy(zone1_d, zone1, sizeof(cufftReal) * nx * ny, cudaMemcpyHostToDevice);
dim3 grid(32,16);
dim3 block(2,4);
conv_real2complex <<< grid, block >>> (zone1, in1_d, nx, ny);
// create plan for CUDA FFT
cufftHandle plan_forward;
cufftPlan2d(&plan_forward, nx, ny, CUFFT_C2C);
cufftExecC2C(plan_forward, in1_d, out_d, CUFFT_FORWARD);
// recreate input array
cudaMalloc((void **)&in2_d, sizeof(cufftComplex) * nx * ny);
// backward plan
cufftHandle plan_backward;
cufftPlan2d(&plan_backward, nx, ny, CUFFT_C2C);
cufftExecC2C(plan_backward, (cufftComplex *)out_d, (cufftComplex *)in2_d, CUFFT_INVERSE);
// copy arrays to host
cudaMemcpy(out, out_d, sizeof(cufftComplex) * nx * nyh, cudaMemcpyDeviceToHost);
cudaMemcpy(in2, in2_d, sizeof(cufftComplex) * nx * ny, cudaMemcpyDeviceToHost);
// Check for any CUDA errors
checkCUDAError("cudaMemcpy calls");
// Print FFT output --debug
printf ( "\n" );
printf ( " Output FFT Coefficients:\n" );
printf ( "\n" );
for (i = 0; i < nx * ny; i++)
{
printf (" %4d %12f %12f\n", i, out[i].x, out[i].y);
}
// Print recovered inverse FFT input --debug
printf ( "\n" );
printf ( " Recovered input data divided by NX * NY:\n" );
printf ( "\n" );
for (i = 0; i < nx * ny; i++)
{
printf (" %4d %12f %12f\n", i, in2[i].x, in2[i].y);
}
// Free up allocated memory
cufftDestroy(plan_forward);
cufftDestroy(plan_backward);
cudaFree(in1_d);
cudaFree(in2_d);
cudaFree(out_d);
// Check for any CUDA errors
checkCUDAError("cudaFree");
free(in1);
free(in2);
free(out);
// Check elapsed time
printf("\n\n Elapsed time = %f\n\n", ((float)clock()-start)/CLOCKS_PER_SEC);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}[/codebox]
This compiles perfectly in emulation mode. However this fails on the device.
I see this error on device:
[codebox]
running and generating the output file
cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331
cufft: ERROR: CUFFT_ALLOC_FAILED
cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 147
cufft: ERROR: CUFFT_INVALID_PLAN
cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331
cufft: ERROR: CUFFT_ALLOC_FAILED
cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/cufft.cu, line 147
cufft: ERROR: CUFFT_INVALID_PLAN
Cuda error: cudaMemcpy calls: invalid device pointer. [/codebox]
I did a gdb on the executable and saw that the code fails at the
[codebox]Breakpoint 1, main (argc=1, argv=0x7fffbe4e5e98) at cufft_ctc.cu:113
113 conv_real2complex <<< grid, block >>> (zone1, in1_d, nx, ny);
(gdb) s
__device_stub__Z17conv_real2complexPfP6float2ii (__par0=0x2b91eda03000, __par1=0x3a90000, __par2=64, __par3=64)
at /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c:13
13 /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c: No such file or directory.
in /tmp/tmpxft_00005231_00000000-1_cufft_ctc.cudafe1.stub.c
(gdb) s
main (argc=1, argv=0x7fffbe4e5e98) at cufft_ctc.cu:117
117 cufftPlan2d(&plan_forward, nx, ny, CUFFT_C2C);
(gdb) s
cufft: ERROR: /root/cuda-stuff/sw/rel/gpgpu/toolkit/r2.1/cufft/src/config.cu, line 331
cufft: ERROR: CUFFT_ALLOC_FAILED
119 cufftExecC2C(plan_forward, in1_d, out_d, CUFFT_FORWARD);
[/codebox]
The conv_real2complex function is same as mfatica’s presentation slides at SC08. Has someone seen this error?
Thanks in advance.