I have a simple problem in which a matrix is Fourier transformed, multiplied by a filter, and then transformed back to real space. I wrote a host function which returns the result:
// Computes a normalized sum over a filtered field using an in-place
// real-to-complex / complex-to-real cuFFT round trip.
//
// Steps performed, in order:
//   1. Copy totsize_pad doubles from dppsi (device) to hppsi (host).
//   2. Copy totsize_pad doubles from dppsi to the device work buffer dbbff.
//   3. Forward transform dbbff in place (plan pprc).
//   4. Launch kexcess_dft on the complex view of dbbff together with dccc
//      (per the accompanying description this applies the filter).
//   5. Inverse transform dbbff in place (plan ppcr).
//   6. Launch totene on (dbbff, dppsi, totsize_pad).
//   7. Copy dbbff back to hbbff and sum, for every (i, j) row, the first llz
//      entries of each row of length 2*(llz/2+1); the trailing entries of a
//      row are skipped.
//
// Parameters:
//   dbbff           device work buffer, overwritten (at least totsize_pad doubles)
//   dppsi           device input field (totsize_pad doubles); read by the copies
//                   and passed to totene
//   dccc            device array passed to kexcess_dft
//   hbbff           host buffer receiving the final contents of dbbff
//   hppsi           host buffer receiving a copy of dppsi
//   llx, lly, llz   extents used by the host-side summation
//   totsize         divisor for the returned sum; also passed to kexcess_dft
//   totsize_pad     number of doubles in the padded buffers
//   totsize_invspa  size argument forwarded to kexcess_dft
//   pprc, ppcr      cuFFT plans used for the D2Z and Z2D transforms
//   ggrid, tthreads launch configuration shared by both kernels
//
// Returns: (sum of the non-skipped entries of hbbff) / totsize.
//
// NOTE(review): the explicit device-to-device copy followed by an in-place
// D2Z is kept on purpose. The buffers here use a padded row length of
// 2*(llz/2+1); an out-of-place D2Z with a default plan reads its real input
// without that padding, so feeding dppsi directly would misinterpret the
// data. Presumably pprc/ppcr were created for this padded in-place layout -
// confirm against the plan creation code.
//
// NOTE(review): the cufftExec* return codes are not checked here; wrap them
// with the project's cuFFT status check if one exists.
__host__ double energy_dft(cufftDoubleReal *dbbff,cufftDoubleReal *dppsi,double *dccc,cufftDoubleReal *hbbff,
cufftDoubleReal *hppsi,int llx,int lly,int llz,int totsize,int totsize_pad,int totsize_invspa,cufftHandle pprc,cufftHandle ppcr,dim3 ggrid, dim3 tthreads)
{
    const size_t padded_bytes = sizeof(double) * totsize_pad;

    // Host copy of the input field, and a device working copy that the
    // in-place transforms are allowed to overwrite.
    CUDA_CHECK( cudaMemcpy(hppsi, dppsi, padded_bytes, cudaMemcpyDeviceToHost) );
    CUDA_CHECK( cudaMemcpy(dbbff, dppsi, padded_bytes, cudaMemcpyDeviceToDevice) );

    cufftDoubleComplex *dbbff_cplx = (cufftDoubleComplex*)dbbff;

    // Forward transform, in place.
    cufftExecD2Z(pprc, dbbff, dbbff_cplx);

    kexcess_dft<<<ggrid, tthreads>>>(dbbff_cplx, dccc, totsize_invspa, totsize);
    CUDA_CHECK( cudaGetLastError() );  // surface launch-configuration errors

    // Inverse transform, in place.
    cufftExecZ2D(ppcr, dbbff_cplx, dbbff);

    totene<<<ggrid, tthreads>>>(dbbff, dppsi, totsize_pad);
    CUDA_CHECK( cudaGetLastError() );  // surface launch-configuration errors

    // Blocking copy: also waits for the preceding device work to finish.
    CUDA_CHECK( cudaMemcpy(hbbff, dbbff, padded_bytes, cudaMemcpyDeviceToHost) );

    // Each (i, j) row occupies 2*(llz/2+1) doubles; only the first llz of
    // them are summed, the rest of the row is skipped.
    const int padded_z = 2 * (llz / 2 + 1);
    double enne = 0.0;
    long long row_start = 0;
    for (int i = 0; i < llx; ++i)
    {
        for (int j = 0; j < lly; ++j)
        {
            for (int k = 0; k < llz; ++k)
            {
                enne += hbbff[row_start + k];
            }
            row_start += padded_z;
        }
    }
    return enne / (double)totsize;
}
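If I replace lines 5 and 6 of the function (the device-to-device copy into dbbff and the in-place forward transform) with the following call, which I expected to be equivalent, I get wrong results: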
cufftExecD2Z(pprc,dppsi,(cufftDoubleComplex*)dbbff);
I wasted one week because of this. Is this normal? Is it some kind of optimization? Shouldn't a direct (out-of-place) transform just work?