In some cases 3d fft doesn't work properly problems for x=513, y=257, z=257

We are using CUDA 1.0 and got some troubles with 3d fft. Was searching for similar problems in this forum but didn’t find.

We have problems with backward transformation for following dimensions: x=513, y=257, z=257. We are aware, that we could not expect great performance or good precision, still the results are much worse than one would expect.

An example: only one element in array (0,0,0) is nonzero (-1). we transform this vektor and than back. The right solution would yield -513257257 for (0,0,0) and 0 otherwise. But this is not the case: (0,0,0) is only about -132000 or 4% of the expeted value.

Here is the code (comparing a vektor with the same vektor after transforming forward and back):

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

// includes, kernels

int

main( int argc, char** argv) 

{

    float *d_in;

	float *h_in;

	float *d_out;

	float *h_out;

	

	int x=513, y=257, z=257;

	

	int size=x*y*(z/2+1)*2;

	int mem_size=sizeof(float)*size;

	h_in=(float*)malloc(mem_size);

	h_out=(float*)malloc(mem_size);

	

	for(int i=0;i<size;h_in[i++]=0.0f);

	h_in[0]=-1;

	

	

	cufftHandle planForward;

    cufftHandle planBackward;

	

	cufftPlan3d(&planForward, x, y, z, CUFFT_R2C);

    cufftPlan3d(&planBackward, x, y, z, CUFFT_C2R);

   cudaMalloc((void**)&d_in, 2*mem_size); 

	cudaMalloc((void**)&d_out, 2*mem_size); 

	

  cudaMemcpy(d_in, h_in, mem_size, cudaMemcpyHostToDevice);

  

  cufftExecR2C(planForward, (cufftReal *)d_in, (cufftComplex *)d_out);

  cufftExecC2R(planBackward, (cufftComplex *)d_out, (cufftReal *)d_in);

  

  

  cudaMemcpy(h_out, d_in, mem_size, cudaMemcpyDeviceToHost);

  

  

	

 int all=0;

  

  float eps=1e-5;

  

  for(int i=0;i<size && all<5;i++)

      if(abs(h_in[i]-h_out[i]/size)>eps) {all++; printf("%d : %0.8f <-> %.8f \n  ", i, h_in[i], h_out[i]/size);}

	

    

	cudaFree(d_out);

	cudaFree(d_in);

	free(h_in);

	free(h_out);

}

And the output is:

0 : -1.00000000 <-> -0.00388352

66049 : 0.00000000 <-> -0.00388340

66050 : 0.00000000 <-> 0.00001514

66305 : 0.00000000 <-> -0.00001518

132098 : 0.00000000 <-> -0.00388293

One more strange thing: for x=257, y=257, z=513 everything works fine.

To me, it looks like a bug, but I was too often wrong about such things :)

First of all I should say i’m newbie to CUDA. So everything or a part of written below may be wrong :)

I’ve tested your code (it doessn’t work on my 8600GT) and made some changes on it so it become:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

// includes, kernels

int

main( int argc, char** argv) 

{

float *d_in;

float *h_in;

float *d_out;

float *h_out;

//!!!!!!

int x=192, y=192, z=192;

int size=x*y*(z/2+1)*2;

int mem_size=sizeof(float)*size;

h_in=(float*)malloc(mem_size);

h_out=(float*)malloc(mem_size);

// !!!!!

for(int i=0;i<size;h_in[i++]=rand() * 10 / ( RAND_MAX + 1 ) ); 

cufftHandle planForward;

   cufftHandle planBackward;

cufftPlan3d(&planForward, x, y, z, CUFFT_R2C);

cufftPlan3d(&planBackward, x, y, z, CUFFT_C2R);

cudaMalloc((void**)&d_in, 2*mem_size); 

cudaMalloc((void**)&d_out, 2*mem_size); 

cudaMemcpy(d_in, h_in, mem_size, cudaMemcpyHostToDevice);

cufftExecR2C(planForward, (cufftReal *)d_in, (cufftComplex *)d_out);

 cufftExecC2R(planBackward, (cufftComplex *)d_out, (cufftReal *)d_in);

cudaMemcpy(h_out, d_in, mem_size, cudaMemcpyDeviceToHost);

int all=0;

float eps=1e-5;

//!!!!!!

 for(int i=0;i<(x*y*z) && all<5;i++)

     if(abs(h_in[i]-h_out[i]/(x*y*z))>eps) {all++; printf("%d : %0.8f <-> %.8f \n  ", i, h_in[i], h_out[i]/(x*y*z));}

 

cufftDestroy(planForward);  // !!!!!

cufftDestroy(planBackward); // !!!!

cudaFree(d_out);

cudaFree(d_in);

free(h_in);

free(h_out);

getchar();

}

// !!! - changed things. It works on my device only with dimensions 256x256x256 so i have to decrease it. Also your code doestn’t work correctly without destroying plans. The main thing is that you should be careful with lengths of data. The 3d FFTs is done only on (xyz) elements but not on “size” of them. So in error detection loop i changed size with (xyz). But stil, the bigger numbers in data, the less accurancy fft is. And if you try to play with numbers in h_in you’ll see that.