Is this a bug or some kind of super-optimization?

I have a simple problem in which a matrix is Fourier transformed, multiplied by a filter, and then transformed back to real space. I wrote a host function which returns the result:

#include <cmath>   // std::nan, used to signal a failed cuFFT call
#include <cstdio>  // std::fprintf, used for the cuFFT diagnostics

/**
 * Runs forward FFT -> kexcess_dft -> inverse FFT -> totene on a scratch copy
 * of dppsi and returns the host-side sum of the result divided by totsize.
 *
 * Steps, in order:
 *   1. dppsi is copied to hppsi (host) and to dbbff (device scratch);
 *      dppsi itself is never written by this function's own copies/FFTs.
 *   2. An in-place D2Z transform is executed on dbbff with plan pprc.
 *   3. kexcess_dft is launched on the complex view of dbbff with dccc,
 *      totsize_invspa and totsize.
 *   4. An in-place Z2D transform is executed on dbbff with plan ppcr.
 *   5. totene is launched on (dbbff, dppsi, totsize_pad).
 *   6. dbbff is copied to hbbff and summed on the host.
 *
 * The host sum walks hbbff as llx*lly consecutive rows of 2*(llz/2+1)
 * doubles each and adds only the first llz entries of every row; the rest of
 * each row is treated as padding and skipped.
 *
 * NOTE(review): dppsi is copied with the same padded size as dbbff, so it
 * presumably already uses the padded in-place real layout. cuFFT's
 * out-of-place D2Z expects an unpadded real input, so feeding dppsi directly
 * as an out-of-place source would presumably misread it. Confirm against how
 * pprc/ppcr were planned before removing the device-to-device copy.
 *
 * @param dbbff          device scratch buffer, at least totsize_pad doubles;
 *                       overwritten.
 * @param dppsi          device input, at least totsize_pad doubles.
 * @param dccc           device array passed through to kexcess_dft.
 * @param hbbff          host buffer receiving the final contents of dbbff.
 * @param hppsi          host buffer receiving a copy of dppsi.
 * @param llx,lly,llz    logical grid extents used by the host reduction.
 * @param totsize        divisor of the returned sum; also passed to
 *                       kexcess_dft.
 * @param totsize_pad    number of doubles copied for every buffer.
 * @param totsize_invspa passed through to kexcess_dft.
 * @param pprc           cuFFT plan used for the D2Z transform.
 * @param ppcr           cuFFT plan used for the Z2D transform.
 * @param ggrid,tthreads launch configuration used for both kernels.
 * @return sum of the non-padding entries of hbbff divided by totsize, or NaN
 *         (after a message on stderr) if either cuFFT call fails.
 */
__host__ double energy_dft(cufftDoubleReal *dbbff,cufftDoubleReal *dppsi,double *dccc,cufftDoubleReal *hbbff,
cufftDoubleReal *hppsi,int llx,int lly,int llz,int totsize,int totsize_pad,int totsize_invspa,cufftHandle pprc,cufftHandle ppcr,dim3 ggrid, dim3 tthreads)
{
    const size_t padded_bytes = sizeof(double) * totsize_pad;

    CUDA_CHECK( cudaMemcpy(hppsi, dppsi, padded_bytes, cudaMemcpyDeviceToHost) );
    CUDA_CHECK( cudaMemcpy(dbbff, dppsi, padded_bytes, cudaMemcpyDeviceToDevice) );

    // Both transforms run in place, so the same storage is viewed as complex.
    cufftDoubleComplex *spectrum = (cufftDoubleComplex*)dbbff;

    cufftResult fft_status = cufftExecD2Z(pprc, dbbff, spectrum);
    if (fft_status != CUFFT_SUCCESS)
    {
        std::fprintf(stderr, "energy_dft: cufftExecD2Z failed (cufftResult %d)\n", (int)fft_status);
        return std::nan("");
    }

    kexcess_dft<<<ggrid, tthreads>>>(spectrum, dccc, totsize_invspa, totsize);
    CUDA_CHECK( cudaGetLastError() );  // a bad launch configuration is only reported here

    fft_status = cufftExecZ2D(ppcr, spectrum, dbbff);
    if (fft_status != CUFFT_SUCCESS)
    {
        std::fprintf(stderr, "energy_dft: cufftExecZ2D failed (cufftResult %d)\n", (int)fft_status);
        return std::nan("");
    }

    totene<<<ggrid, tthreads>>>(dbbff, dppsi, totsize_pad);
    CUDA_CHECK( cudaGetLastError() );

    CUDA_CHECK( cudaMemcpy(hbbff, dbbff, padded_bytes, cudaMemcpyDeviceToHost) );

    // Host reduction: rows are row_stride doubles long, only the first llz
    // of each row are data. Elements are added in the same order as a flat
    // walk over the buffer, so the floating-point result is unchanged.
    const long long row_stride = 2LL * (llz / 2 + 1);
    long long row_start = 0;
    double enne = 0;
    for (int i = 0; i < llx; i++)
    {
        for (int j = 0; j < lly; j++)
        {
            for (int k = 0; k < llz; k++)
            {
                enne += hbbff[row_start + k];
            }
            row_start += row_stride;
        }
    }
    return enne/(double)totsize;
}

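If I replace the device-to-device copy into dbbff and the in-place cufftExecD2Z call (lines 5 and 6 of the function as originally posted) with something that should be equivalent, I get the wrong results: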

cufftExecD2Z(pprc,dppsi,(cufftDoubleComplex*)dbbff); 

I wasted one week because of this. Is this normal? Is it some kind of optimization? Shouldn't it just work with a direct transform?