2D FFT- Real to Complex and Complex to Real

I have written a program to compute the Real to Complex 2D FFT and also it’s inverse. I compared the program to its equivalent version using the FFTW3 libraries and I find that the Complex FFT values are different. Is there something wrong in my cuFFT library calls?? Can I do this better?

My CUDA program:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

/*Define the timestamp function reference: Dr.Burkardt*/

void timestamp (void)

{

# define TIME_SIZE 40

  static char time_buffer[TIME_SIZE];

  const struct tm *tm;

  size_t len;

  time_t now;

  now = time (NULL);

  tm = localtime (&now);

  len = strftime (time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm);

  printf ("%s\n", time_buffer);

  return;

# undef TIME_SIZE

}

double frand (void)

{

  double value;

  value = ((double) rand()/(RAND_MAX));

  return value;

}

// Complex data type

typedef float2 Complex;

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char *msg);

// Main program

int main (int argc, char**argv)

{

	timestamp ();

	clock_t start = clock();

	int i;

	  cufftReal *in1, *in1_d, *in2, *in2_d;	

	int j;

	int nx = 8;

	int ny = 10;

	int nyh = (ny/2) + 1;;

	unsigned int seed = 123456789;

	

	// malloc arrays on host

	cudaMallocHost((void**) &in1, sizeof(cufftReal) * nx * ny);

	cudaMallocHost((void**) &in2, sizeof(cufftReal) * nx * ny);

	srand(seed);

	

	for ( i = 0; i < nx; i++ )

	  {

		for ( j = 0; j < ny; j++ )

		{

		  in1[i*ny+j] = frand ( );

		}

	  }

	

	// print input host data --debug

	printf ( "\n" );

	printf ( "  Input Data:\n" );

	printf ( "\n" );

 	for ( i = 0; i < nx; i++ )

	 {

	   for ( j = 0; j < ny; j++ )

	   {

		 printf ( "  %4d  %4d  %12f\n", i, j, in1[i*ny+j] );

	   }

	 }

	

	// allocate arrays on device

	Complex *out, *out_d;

	

	cudaMalloc((void**) &in1_d, sizeof(cufftReal) * nx * ny);

	cudaMalloc((void**) &out_d, sizeof(Complex) * nx * nyh);

	out = (Complex*)malloc(sizeof(Complex) * nx * ny);

	

	// copy input array from host to device

	cudaMemcpy(in1_d, in1, sizeof(cufftReal) * nx * ny, cudaMemcpyHostToDevice);

	

	// create plan for CUDA FFT

	cufftHandle plan_forward;

	cufftPlan2d(&plan_forward, nx, ny, CUFFT_R2C);

	

	cufftExecR2C(plan_forward, in1_d, out_d);

	

	// recreate input array	

	cudaMalloc((void **)&in2_d, sizeof(cufftReal) * nx * ny);

	

	// backward plan

	cufftHandle plan_backward;

	cufftPlan2d(&plan_backward, nx, ny, CUFFT_C2R);

	

	cufftExecC2R(plan_backward, (cufftComplex *)out_d, (cufftReal *)in2_d);

	// copy arrays to host

	cudaMemcpy(out, out_d, sizeof(cufftComplex) * nx * nyh, cudaMemcpyDeviceToHost);

	cudaMemcpy(in2, in2_d, sizeof(cufftReal) * nx * ny, cudaMemcpyDeviceToHost);

		

	// Check for any CUDA errors

	checkCUDAError("cudaMemcpy calls");

	

	// Print FFT output --debug

	printf ( "\n" );

	printf ( "  Output FFT Coefficients:\n" );

	printf ( "\n" );

	for (i = 0; i < nx; i++)

	{

	  for (j = 0; j < nyh; j++)

	  {

		printf ("  %4d  %4d  %12f  %12f\n", i, j, out[i*nyh+j].x, out[i*nyh+j].y);

	  }

	}

	

	// Print recovered inverse FFT input --debug

	printf ( "\n" );

	printf ( "  Recovered input data divided by NX * NY:\n" );

	printf ( "\n" );

	for (i = 0; i < nx; i++)

	{

	  for (j = 0; j < ny; j++)

	  {

		printf ("  %4d  %4d  %12f\n", i, j, in2[i*ny+j] /(double)(nx * ny));

	  }

	}

	

	// Free up allocated memory

	cufftDestroy(plan_forward);

	cufftDestroy(plan_backward);

	

	cudaFree(in1_d);

	cudaFree(in2_d);

	cudaFree(out_d);

	

	// Check for any CUDA errors

	checkCUDAError("cudaFree");

	

	free(in1);

	free(in2);

	free(out);

	// Check elapsed time

	printf("\n\n Elapsed time = %f\n\n", ((double)clock()-start)/CLOCKS_PER_SEC);

	timestamp ();

	return 0;

}

void checkCUDAError(const char *msg)

{

	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err) 

	{

		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

		exit(-1);

	}						 

}
Output from CUDA program:

03 February 2009 03:27:13 PM

Input Data:

	 0	 0	  0.218418

	 0	 1	  0.956318

	 0	 2	  0.829509

	 0	 3	  0.561695

	 0	 4	  0.415307

	 0	 5	  0.066119

	 0	 6	  0.257578

	 0	 7	  0.109957

	 0	 8	  0.043829

	 0	 9	  0.633966

	 1	 0	  0.061727

	 1	 1	  0.449539

	 1	 2	  0.401306

	 1	 3	  0.754673

	 1	 4	  0.797287

	 1	 5	  0.001838

	 1	 6	  0.897504

	 1	 7	  0.350752

	 1	 8	  0.094545

	 1	 9	  0.013617

	 2	 0	  0.859097

	 2	 1	  0.840847

	 2	 2	  0.123104

	 2	 3	  0.007512

	 2	 4	  0.260303

	 2	 5	  0.912484

	 2	 6	  0.113664

	 2	 7	  0.351629

	 2	 8	  0.822887

	 2	 9	  0.267132

	 3	 0	  0.692066

	 3	 1	  0.561662

	 3	 2	  0.861216

	 3	 3	  0.453794

	 3	 4	  0.911977

	 3	 5	  0.597917

	 3	 6	  0.188955

	 3	 7	  0.761492

	 3	 8	  0.396988

	 3	 9	  0.185314

	 4	 0	  0.574366

	 4	 1	  0.367027

	 4	 2	  0.617205

	 4	 3	  0.361529

	 4	 4	  0.212930

	 4	 5	  0.714471

	 4	 6	  0.117707

	 4	 7	  0.299329

	 4	 8	  0.825003

	 4	 9	  0.824660

	 5	 0	  0.061862

	 5	 1	  0.710781

	 5	 2	  0.088283

	 5	 3	  0.777994

	 5	 4	  0.745303

	 5	 5	  0.308675

	 5	 6	  0.899373

	 5	 7	  0.763537

	 5	 8	  0.761731

	 5	 9	  0.406970

	 6	 0	  0.938749

	 6	 1	  0.562088

	 6	 2	  0.017820

	 6	 3	  0.501103

	 6	 4	  0.041909

	 6	 5	  0.368851

	 6	 6	  0.271724

	 6	 7	  0.858573

	 6	 8	  0.029037

	 6	 9	  0.017442

	 7	 0	  0.152384

	 7	 1	  0.114319

	 7	 2	  0.353907

	 7	 3	  0.119308

	 7	 4	  0.206653

	 7	 5	  0.212924

	 7	 6	  0.612948

	 7	 7	  0.809519

	 7	 8	  0.587090

	 7	 9	  0.215492

Output FFT Coefficients:

	 0	 0	 32.741566	  0.000000

	 0	 1	  7.654519	-11.672319

	 0	 2	 -2.128232	 -2.822811

	 0	 3	 -2.354876	  2.166018

	 0	 4	 -8.551808	  1.766679

	 0	 5	 -4.507302	  0.000002

	 1	 0	 30.582317	 -0.000001

	 1	 1	 -8.999149	 -6.985773

	 1	 2	 -4.520418	 -3.622317

	 1	 3	  7.469131	  0.787634

	 1	 4	 -9.499427	 -3.260311

	 1	 5	  5.455593	 -0.000002

	 2	 0	 36.469276	  0.000000

	 2	 1	  5.774310	  4.555152

	 2	 2	  9.389224	 -1.576941

	 2	 3	 -6.039850	-10.389489

	 2	 4	  7.807747	 -4.714308

	 2	 5	 -1.604395	 -0.000000

	 3	 0	 44.891052	 -0.000000

	 3	 1	 -1.431550	 -6.360493

	 3	 2	 -1.120615	 -0.992159

	 3	 3	  1.350446	 -7.628465

	 3	 4	  4.474757	  7.503319

	 3	 5	  3.928185	 -0.000001

	 4	 0	 39.313808	  0.000000

	 4	 1	  6.383594	  2.811934

	 4	 2	  0.462761	  5.475988

	 4	 3	 -8.306478	  2.072738

	 4	 4	  5.657076	  0.545425

	 4	 5	 -1.758442	  0.000001

	 5	 0	 44.196064	  0.000001

	 5	 1	 -7.094364	  4.309771

	 5	 2	 -5.685015	 -0.249042

	 5	 3	  3.803718	 -4.238053

	 5	 4	 -9.002282	 -7.386973

	 5	 5	 -3.291231	 -0.000001

	 6	 0	 28.858362	 -0.000000

	 6	 1	  3.034650	  1.324703

	 6	 2	  3.565553	 -7.520622

	 6	 3	 12.398600	 -4.129048

	 6	 4	  8.157264	 -1.007283

	 6	 5	 -8.070544	  0.000003

	 7	 0	 27.076338	 -0.000000

	 7	 1	 -3.624220	  9.411853

	 7	 2	 -6.337792	 -4.470581

	 7	 3	  0.647740	 -0.481014

	 7	 4	  0.105779	  2.042513

	 7	 5	  3.531358	 -0.000001

Recovered input data divided by NX * NY:

	 0	 0	  0.218418

	 0	 1	  0.956318

	 0	 2	  0.829509

	 0	 3	  0.561695

	 0	 4	  0.415307

	 0	 5	  0.066119

	 0	 6	  0.257578

	 0	 7	  0.109957

	 0	 8	  0.043829

	 0	 9	  0.633966

	 1	 0	  0.061727

	 1	 1	  0.449539

	 1	 2	  0.401306

	 1	 3	  0.754673

	 1	 4	  0.797287

	 1	 5	  0.001838

	 1	 6	  0.897504

	 1	 7	  0.350752

	 1	 8	  0.094545

	 1	 9	  0.013617

	 2	 0	  0.859097

	 2	 1	  0.840847

	 2	 2	  0.123104

	 2	 3	  0.007512

	 2	 4	  0.260303

	 2	 5	  0.912484

	 2	 6	  0.113664

	 2	 7	  0.351629

	 2	 8	  0.822887

	 2	 9	  0.267132

	 3	 0	  0.692066

	 3	 1	  0.561662

	 3	 2	  0.861216

	 3	 3	  0.453794

	 3	 4	  0.911977

	 3	 5	  0.597917

	 3	 6	  0.188955

	 3	 7	  0.761492

	 3	 8	  0.396988

	 3	 9	  0.185314

	 4	 0	  0.574366

	 4	 1	  0.367027

	 4	 2	  0.617205

	 4	 3	  0.361529

	 4	 4	  0.212930

	 4	 5	  0.714471

	 4	 6	  0.117707

	 4	 7	  0.299329

	 4	 8	  0.825003

	 4	 9	  0.824660

	 5	 0	  0.061862

	 5	 1	  0.710781

	 5	 2	  0.088283

	 5	 3	  0.777994

	 5	 4	  0.745303

	 5	 5	  0.308675

	 5	 6	  0.899373

	 5	 7	  0.763537

	 5	 8	  0.761731

	 5	 9	  0.406970

	 6	 0	  0.938749

	 6	 1	  0.562088

	 6	 2	  0.017820

	 6	 3	  0.501103

	 6	 4	  0.041909

	 6	 5	  0.368850

	 6	 6	  0.271724

	 6	 7	  0.858572

	 6	 8	  0.029037

	 6	 9	  0.017442

	 7	 0	  0.152384

	 7	 1	  0.114319

	 7	 2	  0.353907

	 7	 3	  0.119308

	 7	 4	  0.206653

	 7	 5	  0.212924

	 7	 6	  0.612947

	 7	 7	  0.809519

	 7	 8	  0.587090

	 7	 9	  0.215492

Elapsed time = 0.024084

03 February 2009 03:27:13 PM
03 February 2009 03:05:46 PM

FFTW3_PRB

  C version

Demonstrate the use of the C FFTW3 library.

TEST04

  Demonstrate FFTW3 on a 8 by 10 array of real data.

Transform data to FFT coefficients.

  Backtransform FFT coefficients to recover data.

  Compare recovered data to original data.

Input Data:

	 0	 0	  0.218418

	 0	 1	  0.956318

	 0	 2	  0.829509

	 0	 3	  0.561695

	 0	 4	  0.415307

	 0	 5	  0.066119

	 0	 6	  0.257578

	 0	 7	  0.109957

	 0	 8	  0.043829

	 0	 9	  0.633966

	 1	 0	  0.061727

	 1	 1	  0.449539

	 1	 2	  0.401306

	 1	 3	  0.754673

	 1	 4	  0.797287

	 1	 5	  0.001838

	 1	 6	  0.897504

	 1	 7	  0.350752

	 1	 8	  0.094545

	 1	 9	  0.013617

	 2	 0	  0.859097

	 2	 1	  0.840847

	 2	 2	  0.123104

	 2	 3	  0.007512

	 2	 4	  0.260303

	 2	 5	  0.912484

	 2	 6	  0.113664

	 2	 7	  0.351629

	 2	 8	  0.822887

	 2	 9	  0.267132

	 3	 0	  0.692066

	 3	 1	  0.561662

	 3	 2	  0.861216

	 3	 3	  0.453794

	 3	 4	  0.911977

	 3	 5	  0.597917

	 3	 6	  0.188955

	 3	 7	  0.761492

	 3	 8	  0.396988

	 3	 9	  0.185314

	 4	 0	  0.574366

	 4	 1	  0.367027

	 4	 2	  0.617205

	 4	 3	  0.361529

	 4	 4	  0.212930

	 4	 5	  0.714471

	 4	 6	  0.117707

	 4	 7	  0.299329

	 4	 8	  0.825003

	 4	 9	  0.824660

	 5	 0	  0.061862

	 5	 1	  0.710781

	 5	 2	  0.088283

	 5	 3	  0.777994

	 5	 4	  0.745303

	 5	 5	  0.308675

	 5	 6	  0.899373

	 5	 7	  0.763537

	 5	 8	  0.761731

	 5	 9	  0.406970

	 6	 0	  0.938749

	 6	 1	  0.562088

	 6	 2	  0.017820

	 6	 3	  0.501103

	 6	 4	  0.041909

	 6	 5	  0.368851

	 6	 6	  0.271724

	 6	 7	  0.858573

	 6	 8	  0.029037

	 6	 9	  0.017442

	 7	 0	  0.152384

	 7	 1	  0.114319

	 7	 2	  0.353907

	 7	 3	  0.119308

	 7	 4	  0.206653

	 7	 5	  0.212924

	 7	 6	  0.612948

	 7	 7	  0.809519

	 7	 8	  0.587090

	 7	 9	  0.215492

Output FFT Coefficients:

	 0	 0	 35.516098	  0.000000

	 0	 1	  0.212223	 -0.325647

	 0	 2	 -0.796817	 -1.972311

	 0	 3	  1.121054	 -2.729960

	 0	 4	 -0.106362	 -0.563868

	 0	 5	 -0.789597	  0.000000

	 1	 0	 -3.599441	 -1.322681

	 1	 1	 -2.191982	 -1.782738

	 1	 2	  0.070179	 -2.934993

	 1	 3	  0.035724	  3.006339

	 1	 4	 -1.822177	 -0.263801

	 1	 5	  0.394435	 -1.616461

	 2	 0	  0.840967	 -0.351373

	 2	 1	 -0.062276	 -0.462812

	 2	 2	 -1.628608	  1.812221

	 2	 3	 -1.545131	  1.185329

	 2	 4	 -4.881608	  3.889492

	 2	 5	  0.426149	  0.661898

	 3	 0	  1.956379	  0.580047

	 3	 1	 -2.275259	 -1.889218

	 3	 2	 -0.699339	 -0.267855

	 3	 3	  1.077098	 -3.755201

	 3	 4	 -0.035203	 -0.115335

	 3	 5	 -1.081650	  0.000076

	 4	 0	 -1.170344	  0.000000

	 4	 1	  5.499545	 -0.419485

	 4	 2	  3.619144	  0.361215

	 4	 3	 -2.196705	  0.160014

	 4	 4	  3.373931	 -0.288505

	 4	 5	 -3.195574	  0.000000

	 5	 0	  1.956379	 -0.580047

	 5	 1	  3.317326	 -2.523240

	 5	 2	  0.767993	 -0.595624

	 5	 3	 -0.112935	  1.626594

	 5	 4	 -2.656801	  0.656494

	 5	 5	 -1.081650	 -0.000076

	 6	 0	  0.840967	  0.351373

	 6	 1	  1.369564	 -3.222248

	 6	 2	 -2.026454	  1.125465

	 6	 3	 -2.709896	  3.503994

	 6	 4	  0.166671	 -1.881069

	 6	 5	  0.426149	 -0.661898

	 7	 0	 -3.599441	  1.322681

	 7	 1	  1.785378	 -1.046931

	 7	 2	 -1.434329	 -0.350927

	 7	 3	  1.975914	 -0.831091

	 7	 4	 -2.590261	  0.333270

	 7	 5	  0.394435	  1.616461

Recovered input data divided by NX * NY:

	 0	 0	  0.218418

	 0	 1	  0.956318

	 0	 2	  0.829509

	 0	 3	  0.561695

	 0	 4	  0.415307

	 0	 5	  0.066119

	 0	 6	  0.257578

	 0	 7	  0.109957

	 0	 8	  0.043829

	 0	 9	  0.633966

	 1	 0	  0.061727

	 1	 1	  0.449539

	 1	 2	  0.401306

	 1	 3	  0.754673

	 1	 4	  0.797287

	 1	 5	  0.001838

	 1	 6	  0.897504

	 1	 7	  0.350752

	 1	 8	  0.094545

	 1	 9	  0.013617

	 2	 0	  0.859097

	 2	 1	  0.840847

	 2	 2	  0.123104

	 2	 3	  0.007512

	 2	 4	  0.260303

	 2	 5	  0.912484

	 2	 6	  0.113664

	 2	 7	  0.351629

	 2	 8	  0.822887

	 2	 9	  0.267132

	 3	 0	  0.692066

	 3	 1	  0.561662

	 3	 2	  0.861216

	 3	 3	  0.453794

	 3	 4	  0.911977

	 3	 5	  0.597917

	 3	 6	  0.188955

	 3	 7	  0.761492

	 3	 8	  0.396988

	 3	 9	  0.185314

	 4	 0	  0.574366

	 4	 1	  0.367027

	 4	 2	  0.617205

	 4	 3	  0.361529

	 4	 4	  0.212930

	 4	 5	  0.714471

	 4	 6	  0.117707

	 4	 7	  0.299329

	 4	 8	  0.825003

	 4	 9	  0.824660

	 5	 0	  0.061862

	 5	 1	  0.710781

	 5	 2	  0.088283

	 5	 3	  0.777994

	 5	 4	  0.745303

	 5	 5	  0.308675

	 5	 6	  0.899373

	 5	 7	  0.763537

	 5	 8	  0.761731

	 5	 9	  0.406970

	 6	 0	  0.938749

	 6	 1	  0.562088

	 6	 2	  0.017820

	 6	 3	  0.501103

	 6	 4	  0.041909

	 6	 5	  0.368851

	 6	 6	  0.271724

	 6	 7	  0.858573

	 6	 8	  0.029037

	 6	 9	  0.017442

	 7	 0	  0.152384

	 7	 1	  0.114319

	 7	 2	  0.353907

	 7	 3	  0.119308

	 7	 4	  0.206653

	 7	 5	  0.212924

	 7	 6	  0.612948

	 7	 7	  0.809519

	 7	 8	  0.587090

	 7	 9	  0.215492

Elapsed time = 0.001780

FFTW3_PRB

  Normal end of execution.

03 February 2009 03:05:46 PM