I have written a program to compute the Real to Complex 2D FFT and also it’s inverse. I compared the program to its equivalent version using the FFTW3 libraries and I find that the Complex FFT values are different. Is there something wrong in my cuFFT library calls?? Can I do this better?
My CUDA program:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
/*Define the timestamp function reference: Dr.Burkardt*/
void timestamp (void)
{
# define TIME_SIZE 40
static char time_buffer[TIME_SIZE];
const struct tm *tm;
size_t len;
time_t now;
now = time (NULL);
tm = localtime (&now);
len = strftime (time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm);
printf ("%s\n", time_buffer);
return;
# undef TIME_SIZE
}
double frand (void)
{
double value;
value = ((double) rand()/(RAND_MAX));
return value;
}
// Complex data type
typedef float2 Complex;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);
// Main program
int main (int argc, char**argv)
{
timestamp ();
clock_t start = clock();
int i;
cufftReal *in1, *in1_d, *in2, *in2_d;
int j;
int nx = 8;
int ny = 10;
int nyh = (ny/2) + 1;;
unsigned int seed = 123456789;
// malloc arrays on host
cudaMallocHost((void**) &in1, sizeof(cufftReal) * nx * ny);
cudaMallocHost((void**) &in2, sizeof(cufftReal) * nx * ny);
srand(seed);
for ( i = 0; i < nx; i++ )
{
for ( j = 0; j < ny; j++ )
{
in1[i*ny+j] = frand ( );
}
}
// print input host data --debug
printf ( "\n" );
printf ( " Input Data:\n" );
printf ( "\n" );
for ( i = 0; i < nx; i++ )
{
for ( j = 0; j < ny; j++ )
{
printf ( " %4d %4d %12f\n", i, j, in1[i*ny+j] );
}
}
// allocate arrays on device
Complex *out, *out_d;
cudaMalloc((void**) &in1_d, sizeof(cufftReal) * nx * ny);
cudaMalloc((void**) &out_d, sizeof(Complex) * nx * nyh);
out = (Complex*)malloc(sizeof(Complex) * nx * ny);
// copy input array from host to device
cudaMemcpy(in1_d, in1, sizeof(cufftReal) * nx * ny, cudaMemcpyHostToDevice);
// create plan for CUDA FFT
cufftHandle plan_forward;
cufftPlan2d(&plan_forward, nx, ny, CUFFT_R2C);
cufftExecR2C(plan_forward, in1_d, out_d);
// recreate input array
cudaMalloc((void **)&in2_d, sizeof(cufftReal) * nx * ny);
// backward plan
cufftHandle plan_backward;
cufftPlan2d(&plan_backward, nx, ny, CUFFT_C2R);
cufftExecC2R(plan_backward, (cufftComplex *)out_d, (cufftReal *)in2_d);
// copy arrays to host
cudaMemcpy(out, out_d, sizeof(cufftComplex) * nx * nyh, cudaMemcpyDeviceToHost);
cudaMemcpy(in2, in2_d, sizeof(cufftReal) * nx * ny, cudaMemcpyDeviceToHost);
// Check for any CUDA errors
checkCUDAError("cudaMemcpy calls");
// Print FFT output --debug
printf ( "\n" );
printf ( " Output FFT Coefficients:\n" );
printf ( "\n" );
for (i = 0; i < nx; i++)
{
for (j = 0; j < nyh; j++)
{
printf (" %4d %4d %12f %12f\n", i, j, out[i*nyh+j].x, out[i*nyh+j].y);
}
}
// Print recovered inverse FFT input --debug
printf ( "\n" );
printf ( " Recovered input data divided by NX * NY:\n" );
printf ( "\n" );
for (i = 0; i < nx; i++)
{
for (j = 0; j < ny; j++)
{
printf (" %4d %4d %12f\n", i, j, in2[i*ny+j] /(double)(nx * ny));
}
}
// Free up allocated memory
cufftDestroy(plan_forward);
cufftDestroy(plan_backward);
cudaFree(in1_d);
cudaFree(in2_d);
cudaFree(out_d);
// Check for any CUDA errors
checkCUDAError("cudaFree");
free(in1);
free(in2);
free(out);
// Check elapsed time
printf("\n\n Elapsed time = %f\n\n", ((double)clock()-start)/CLOCKS_PER_SEC);
timestamp ();
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
Output from CUDA program:
03 February 2009 03:27:13 PM
Input Data:
0 0 0.218418
0 1 0.956318
0 2 0.829509
0 3 0.561695
0 4 0.415307
0 5 0.066119
0 6 0.257578
0 7 0.109957
0 8 0.043829
0 9 0.633966
1 0 0.061727
1 1 0.449539
1 2 0.401306
1 3 0.754673
1 4 0.797287
1 5 0.001838
1 6 0.897504
1 7 0.350752
1 8 0.094545
1 9 0.013617
2 0 0.859097
2 1 0.840847
2 2 0.123104
2 3 0.007512
2 4 0.260303
2 5 0.912484
2 6 0.113664
2 7 0.351629
2 8 0.822887
2 9 0.267132
3 0 0.692066
3 1 0.561662
3 2 0.861216
3 3 0.453794
3 4 0.911977
3 5 0.597917
3 6 0.188955
3 7 0.761492
3 8 0.396988
3 9 0.185314
4 0 0.574366
4 1 0.367027
4 2 0.617205
4 3 0.361529
4 4 0.212930
4 5 0.714471
4 6 0.117707
4 7 0.299329
4 8 0.825003
4 9 0.824660
5 0 0.061862
5 1 0.710781
5 2 0.088283
5 3 0.777994
5 4 0.745303
5 5 0.308675
5 6 0.899373
5 7 0.763537
5 8 0.761731
5 9 0.406970
6 0 0.938749
6 1 0.562088
6 2 0.017820
6 3 0.501103
6 4 0.041909
6 5 0.368851
6 6 0.271724
6 7 0.858573
6 8 0.029037
6 9 0.017442
7 0 0.152384
7 1 0.114319
7 2 0.353907
7 3 0.119308
7 4 0.206653
7 5 0.212924
7 6 0.612948
7 7 0.809519
7 8 0.587090
7 9 0.215492
Output FFT Coefficients:
0 0 32.741566 0.000000
0 1 7.654519 -11.672319
0 2 -2.128232 -2.822811
0 3 -2.354876 2.166018
0 4 -8.551808 1.766679
0 5 -4.507302 0.000002
1 0 30.582317 -0.000001
1 1 -8.999149 -6.985773
1 2 -4.520418 -3.622317
1 3 7.469131 0.787634
1 4 -9.499427 -3.260311
1 5 5.455593 -0.000002
2 0 36.469276 0.000000
2 1 5.774310 4.555152
2 2 9.389224 -1.576941
2 3 -6.039850 -10.389489
2 4 7.807747 -4.714308
2 5 -1.604395 -0.000000
3 0 44.891052 -0.000000
3 1 -1.431550 -6.360493
3 2 -1.120615 -0.992159
3 3 1.350446 -7.628465
3 4 4.474757 7.503319
3 5 3.928185 -0.000001
4 0 39.313808 0.000000
4 1 6.383594 2.811934
4 2 0.462761 5.475988
4 3 -8.306478 2.072738
4 4 5.657076 0.545425
4 5 -1.758442 0.000001
5 0 44.196064 0.000001
5 1 -7.094364 4.309771
5 2 -5.685015 -0.249042
5 3 3.803718 -4.238053
5 4 -9.002282 -7.386973
5 5 -3.291231 -0.000001
6 0 28.858362 -0.000000
6 1 3.034650 1.324703
6 2 3.565553 -7.520622
6 3 12.398600 -4.129048
6 4 8.157264 -1.007283
6 5 -8.070544 0.000003
7 0 27.076338 -0.000000
7 1 -3.624220 9.411853
7 2 -6.337792 -4.470581
7 3 0.647740 -0.481014
7 4 0.105779 2.042513
7 5 3.531358 -0.000001
Recovered input data divided by NX * NY:
0 0 0.218418
0 1 0.956318
0 2 0.829509
0 3 0.561695
0 4 0.415307
0 5 0.066119
0 6 0.257578
0 7 0.109957
0 8 0.043829
0 9 0.633966
1 0 0.061727
1 1 0.449539
1 2 0.401306
1 3 0.754673
1 4 0.797287
1 5 0.001838
1 6 0.897504
1 7 0.350752
1 8 0.094545
1 9 0.013617
2 0 0.859097
2 1 0.840847
2 2 0.123104
2 3 0.007512
2 4 0.260303
2 5 0.912484
2 6 0.113664
2 7 0.351629
2 8 0.822887
2 9 0.267132
3 0 0.692066
3 1 0.561662
3 2 0.861216
3 3 0.453794
3 4 0.911977
3 5 0.597917
3 6 0.188955
3 7 0.761492
3 8 0.396988
3 9 0.185314
4 0 0.574366
4 1 0.367027
4 2 0.617205
4 3 0.361529
4 4 0.212930
4 5 0.714471
4 6 0.117707
4 7 0.299329
4 8 0.825003
4 9 0.824660
5 0 0.061862
5 1 0.710781
5 2 0.088283
5 3 0.777994
5 4 0.745303
5 5 0.308675
5 6 0.899373
5 7 0.763537
5 8 0.761731
5 9 0.406970
6 0 0.938749
6 1 0.562088
6 2 0.017820
6 3 0.501103
6 4 0.041909
6 5 0.368850
6 6 0.271724
6 7 0.858572
6 8 0.029037
6 9 0.017442
7 0 0.152384
7 1 0.114319
7 2 0.353907
7 3 0.119308
7 4 0.206653
7 5 0.212924
7 6 0.612947
7 7 0.809519
7 8 0.587090
7 9 0.215492
Elapsed time = 0.024084
03 February 2009 03:27:13 PM
03 February 2009 03:05:46 PM
FFTW3_PRB
C version
Demonstrate the use of the C FFTW3 library.
TEST04
Demonstrate FFTW3 on a 8 by 10 array of real data.
Transform data to FFT coefficients.
Backtransform FFT coefficients to recover data.
Compare recovered data to original data.
Input Data:
0 0 0.218418
0 1 0.956318
0 2 0.829509
0 3 0.561695
0 4 0.415307
0 5 0.066119
0 6 0.257578
0 7 0.109957
0 8 0.043829
0 9 0.633966
1 0 0.061727
1 1 0.449539
1 2 0.401306
1 3 0.754673
1 4 0.797287
1 5 0.001838
1 6 0.897504
1 7 0.350752
1 8 0.094545
1 9 0.013617
2 0 0.859097
2 1 0.840847
2 2 0.123104
2 3 0.007512
2 4 0.260303
2 5 0.912484
2 6 0.113664
2 7 0.351629
2 8 0.822887
2 9 0.267132
3 0 0.692066
3 1 0.561662
3 2 0.861216
3 3 0.453794
3 4 0.911977
3 5 0.597917
3 6 0.188955
3 7 0.761492
3 8 0.396988
3 9 0.185314
4 0 0.574366
4 1 0.367027
4 2 0.617205
4 3 0.361529
4 4 0.212930
4 5 0.714471
4 6 0.117707
4 7 0.299329
4 8 0.825003
4 9 0.824660
5 0 0.061862
5 1 0.710781
5 2 0.088283
5 3 0.777994
5 4 0.745303
5 5 0.308675
5 6 0.899373
5 7 0.763537
5 8 0.761731
5 9 0.406970
6 0 0.938749
6 1 0.562088
6 2 0.017820
6 3 0.501103
6 4 0.041909
6 5 0.368851
6 6 0.271724
6 7 0.858573
6 8 0.029037
6 9 0.017442
7 0 0.152384
7 1 0.114319
7 2 0.353907
7 3 0.119308
7 4 0.206653
7 5 0.212924
7 6 0.612948
7 7 0.809519
7 8 0.587090
7 9 0.215492
Output FFT Coefficients:
0 0 35.516098 0.000000
0 1 0.212223 -0.325647
0 2 -0.796817 -1.972311
0 3 1.121054 -2.729960
0 4 -0.106362 -0.563868
0 5 -0.789597 0.000000
1 0 -3.599441 -1.322681
1 1 -2.191982 -1.782738
1 2 0.070179 -2.934993
1 3 0.035724 3.006339
1 4 -1.822177 -0.263801
1 5 0.394435 -1.616461
2 0 0.840967 -0.351373
2 1 -0.062276 -0.462812
2 2 -1.628608 1.812221
2 3 -1.545131 1.185329
2 4 -4.881608 3.889492
2 5 0.426149 0.661898
3 0 1.956379 0.580047
3 1 -2.275259 -1.889218
3 2 -0.699339 -0.267855
3 3 1.077098 -3.755201
3 4 -0.035203 -0.115335
3 5 -1.081650 0.000076
4 0 -1.170344 0.000000
4 1 5.499545 -0.419485
4 2 3.619144 0.361215
4 3 -2.196705 0.160014
4 4 3.373931 -0.288505
4 5 -3.195574 0.000000
5 0 1.956379 -0.580047
5 1 3.317326 -2.523240
5 2 0.767993 -0.595624
5 3 -0.112935 1.626594
5 4 -2.656801 0.656494
5 5 -1.081650 -0.000076
6 0 0.840967 0.351373
6 1 1.369564 -3.222248
6 2 -2.026454 1.125465
6 3 -2.709896 3.503994
6 4 0.166671 -1.881069
6 5 0.426149 -0.661898
7 0 -3.599441 1.322681
7 1 1.785378 -1.046931
7 2 -1.434329 -0.350927
7 3 1.975914 -0.831091
7 4 -2.590261 0.333270
7 5 0.394435 1.616461
Recovered input data divided by NX * NY:
0 0 0.218418
0 1 0.956318
0 2 0.829509
0 3 0.561695
0 4 0.415307
0 5 0.066119
0 6 0.257578
0 7 0.109957
0 8 0.043829
0 9 0.633966
1 0 0.061727
1 1 0.449539
1 2 0.401306
1 3 0.754673
1 4 0.797287
1 5 0.001838
1 6 0.897504
1 7 0.350752
1 8 0.094545
1 9 0.013617
2 0 0.859097
2 1 0.840847
2 2 0.123104
2 3 0.007512
2 4 0.260303
2 5 0.912484
2 6 0.113664
2 7 0.351629
2 8 0.822887
2 9 0.267132
3 0 0.692066
3 1 0.561662
3 2 0.861216
3 3 0.453794
3 4 0.911977
3 5 0.597917
3 6 0.188955
3 7 0.761492
3 8 0.396988
3 9 0.185314
4 0 0.574366
4 1 0.367027
4 2 0.617205
4 3 0.361529
4 4 0.212930
4 5 0.714471
4 6 0.117707
4 7 0.299329
4 8 0.825003
4 9 0.824660
5 0 0.061862
5 1 0.710781
5 2 0.088283
5 3 0.777994
5 4 0.745303
5 5 0.308675
5 6 0.899373
5 7 0.763537
5 8 0.761731
5 9 0.406970
6 0 0.938749
6 1 0.562088
6 2 0.017820
6 3 0.501103
6 4 0.041909
6 5 0.368851
6 6 0.271724
6 7 0.858573
6 8 0.029037
6 9 0.017442
7 0 0.152384
7 1 0.114319
7 2 0.353907
7 3 0.119308
7 4 0.206653
7 5 0.212924
7 6 0.612948
7 7 0.809519
7 8 0.587090
7 9 0.215492
Elapsed time = 0.001780
FFTW3_PRB
Normal end of execution.
03 February 2009 03:05:46 PM