Is this a bug or some kind of super-optimization?

I have a simple problem in which a matrix is Fourier transformed, multiplied by a filter, and then transformed back to real space. I wrote a host function that returns the result:

// energy_dft: host-side routine that copies dppsi into the work buffer dbbff,
// runs the kexcess_dft kernel on dbbff viewed as complex data, applies an
// in-place complex-to-real cuFFT transform, runs the totene kernel, copies the
// result back to the host and returns enne divided by totsize.
//
// Parameters (as used in the visible code):
//   dbbff           device work buffer; used both as cufftDoubleComplex* and
//                   as cufftDoubleReal* (in-place transform)
//   dppsi           device input field, copied to hppsi and to dbbff
//   dccc            device array passed to kexcess_dft (kernel not shown)
//   hbbff, hppsi    host buffers receiving copies of dbbff and dppsi
//   llx, lly, llz   loop extents of the host-side triple loop
//   totsize         divisor of the returned value; also passed to kexcess_dft
//   totsize_pad     number of doubles moved by every cudaMemcpy here
//   totsize_invspa  passed to kexcess_dft (meaning not visible here)
//   pprc            cuFFT plan, unused in the visible lines
//   ppcr            cuFFT plan used for the Z2D (complex-to-real) transform
//   ggrid, tthreads launch configuration for both kernels
//
// NOTE(review): the listing appears truncated -- the function braces, the
// loop body and (presumably) a forward D2Z transform using pprc are missing.
// Confirm against the original source before relying on these comments.
__host__ double energy_dft(cufftDoubleReal *dbbff,cufftDoubleReal *dppsi,double *dccc,cufftDoubleReal *hbbff,
cufftDoubleReal *hppsi,int llx,int lly,int llz,int totsize,int totsize_pad,int totsize_invspa,cufftHandle pprc,cufftHandle ppcr,dim3 ggrid, dim3 tthreads)
    // Copy totsize_pad doubles of dppsi from device to the host buffer hppsi.
    CUDA_CHECK( cudaMemcpy(hppsi, dppsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) ); 
    // Duplicate dppsi into dbbff on the device so dppsi itself is left untouched.
    CUDA_CHECK( cudaMemcpy(dbbff, dppsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToDevice) ); 
    // Launch kexcess_dft on dbbff reinterpreted as complex data.
    // NOTE(review): no forward transform is visible before this launch, and
    // the spaced "< < <" launch tokens look like an extraction artifact.
    kexcess_dft < < < ggrid,tthreads > > >((cufftDoubleComplex*)dbbff,dccc, totsize_invspa,totsize);
    // In-place complex-to-real transform: input and output are both dbbff.
    // The cufftResult return value is not checked here.
    cufftExecZ2D(ppcr,(cufftDoubleComplex*)dbbff, dbbff);
    // Launch totene on dbbff and dppsi (kernel not shown); no launch-error check follows.
    totene < < < ggrid,tthreads > > > (dbbff,dppsi,totsize_pad);
    // Blocking copy of the device result dbbff into the host buffer hbbff.
    CUDA_CHECK( cudaMemcpy(hbbff, dbbff, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) );  
    // Host-side accumulators; none of them is updated in the visible lines.
    double enne=0;
    double ppmm=0; 
    int count=0;   
    // Triple loop whose innermost extent is 2*(llz/2+1) rather than llz;
    // presumably this is the padded row length of the in-place real
    // transform -- TODO confirm.
    for(int i=0;i < llx;i++)
    for(int j=0;j < lly;j++)   
    for(int k=0;k < 2*(llz/2+1);k++)
    // Only indices k < llz are selected; the rest are presumably padding.
    if(k < llz)
    // NOTE(review): the statement(s) that accumulate into enne are missing,
    // so as written this return is the statement controlled by the if above.
    return enne/(double)totsize;

If I replace lines 5 and 6 with something equivalent, I get wrong results:


I wasted a week because of this. Is this normal? Is it some kind of optimization? Shouldn't it work with just a direct transform?