We are using CUDA 1.0 and got some troubles with 3d fft. Was searching for similar problems in this forum but didn’t find.
We have problems with backward transformation for following dimensions: x=513, y=257, z=257. We are aware, that we could not expect great performance or good precision, still the results are much worse than one would expect.
An example: only one element in array (0,0,0) is nonzero (-1). we transform this vektor and than back. The right solution would yield -513257257 for (0,0,0) and 0 otherwise. But this is not the case: (0,0,0) is only about -132000 or 4% of the expeted value.
Here is the code (comparing a vektor with the same vektor after transforming forward and back):
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
// includes, kernels
int
main( int argc, char** argv)
{
float *d_in;
float *h_in;
float *d_out;
float *h_out;
int x=513, y=257, z=257;
int size=x*y*(z/2+1)*2;
int mem_size=sizeof(float)*size;
h_in=(float*)malloc(mem_size);
h_out=(float*)malloc(mem_size);
for(int i=0;i<size;h_in[i++]=0.0f);
h_in[0]=-1;
cufftHandle planForward;
cufftHandle planBackward;
cufftPlan3d(&planForward, x, y, z, CUFFT_R2C);
cufftPlan3d(&planBackward, x, y, z, CUFFT_C2R);
cudaMalloc((void**)&d_in, 2*mem_size);
cudaMalloc((void**)&d_out, 2*mem_size);
cudaMemcpy(d_in, h_in, mem_size, cudaMemcpyHostToDevice);
cufftExecR2C(planForward, (cufftReal *)d_in, (cufftComplex *)d_out);
cufftExecC2R(planBackward, (cufftComplex *)d_out, (cufftReal *)d_in);
cudaMemcpy(h_out, d_in, mem_size, cudaMemcpyDeviceToHost);
int all=0;
float eps=1e-5;
for(int i=0;i<size && all<5;i++)
if(abs(h_in[i]-h_out[i]/size)>eps) {all++; printf("%d : %0.8f <-> %.8f \n ", i, h_in[i], h_out[i]/size);}
cudaFree(d_out);
cudaFree(d_in);
free(h_in);
free(h_out);
}
And the output is:
0 : -1.00000000 <-> -0.00388352
66049 : 0.00000000 <-> -0.00388340
66050 : 0.00000000 <-> 0.00001514
66305 : 0.00000000 <-> -0.00001518
132098 : 0.00000000 <-> -0.00388293
One more strange thing: for x=257, y=257, z=513 everything works fine.
To me, it looks like a bug, but I was too often wrong about such things :)