Hi,
I am new to CUDA and stuck in a really wierd problem. I have this FFT program implemented in FORTRAN.
Its a 2 * 2 * 2 FFT in 3d. N = 8
CASE 1: SINGLE PRECISION FFTW CALL
accuracy.f
program test
implicit none
integer N
parameter(N=8)
integer plan
complex in
dimension in(N)
integer i
print *,'Input : '
do i = 1,N
in(i) = cmplx(float(i),float(i+1))
print *,in(i)
enddo
call sfftw_plan_dft_3d ( plan, 2, 2, 2, in, in, FFTW_FORWARD, FFTW_ESTIMATE)
call sfftw_execute ( plan )
call sfftw_destroy_plan ( plan )
print *,'Output : '
do i = 1,N
print *,in(i)
enddo
end
The output is:
Input :
(1.000000,2.000000)
(2.000000,3.000000)
(3.000000,4.000000)
(4.000000,5.000000)
(5.000000,6.000000)
(6.000000,7.000000)
(7.000000,8.000000)
(8.000000,9.000000)
Output :
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
CASE 2: CUDA FFT
accuracycuda.f
program test
implicit none
integer N
parameter(N=8)
integer*8 plan
complex in
dimension in(N)
integer i
print *,'Input : '
do i = 1,N
in(i) = cmplx(float(i),float(i+1))
print *,in(i)
enddo
call myfft(24,24,24,in, 1)
print *,'Output :'
do i = 1,N
print *,in(i)
enddo
end
myfft.cu
#include <cufft.h>
#include <cutil.h>
#include <cuComplex.h>
#include “cuda.h”
extern “C” void myfft_(int *n1, int *n2, int *n3,cufftComplex *data, int *direction)
{
int Nx = *n1;
int Ny = *n2;
int Nz = *n3;
int dir = *direction;
cufftComplex *d_data;
CUT_DEVICE_INIT();
CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, sizeof(cufftComplex)*Nx*Ny*Nz));
CUDA_SAFE_CALL(cudaMemcpy(d_data, data, Nx*Ny*Nz*sizeof(cufftComplex), cudaMemcpyHostToDevice));
cufftHandle plan1;
CUDA_SAFE_CALL(cufftPlan3d(&plan1, Nz, Ny, Nx, CUFFT_C2C));
if(dir<0)
CUDA_SAFE_CALL(cufftExecC2C(plan1, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
else
CUDA_SAFE_CALL(cufftExecC2C(plan1, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_INVERSE));
CUDA_SAFE_CALL(cudaMemcpy(data, d_data, Nx*Ny*Nz*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cufftDestroy(plan1));
cudaFree(d_data);
return;
}
The output is:
Input :
(1.000000,2.000000)
(2.000000,3.000000)
(3.000000,4.000000)
(4.000000,5.000000)
(5.000000,6.000000)
(6.000000,7.000000)
(7.000000,8.000000)
(8.000000,9.000000)
Output :
(36.00000,44.00000)
(-4.000000,-4.000000)
(-8.000000,-8.000000)
(0.0000000E+00,0.0000000E+00)
(-16.00000,-16.00000)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
(0.0000000E+00,0.0000000E+00)
Can someone please advice why this difference in output ? I assume that CUDA FFT is based on FFTW model. So for same input, how can the output be different.
Any help would be appreciated.
Thanks