Correct values not printing in CUDA C

Hii everyone. I was doing a project using CUDA C. Actually I need to calculate force values for which I have written C code and CUDA C code. Below I have attached my whole code using CUDA syntax. I am getting all the force values as zero which is not correct at all.
Personally what I feel is that values are not printing correctly due to following line in code:

printf("%lf %lf %lf\n",ForceX[h],ForceY[h],ForceZ[h]); because ForceX[h],ForceY[h],ForceZ[h] all are of cuDoubleComplex type.

This is the full code attached below:

%%cu
#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <complex.h>

#include <cuComplex.h>

#include <time.h>

cuDoubleComplex my_complex_exp(cuDoubleComplex ex1){

      cuDoubleComplex comp_1_w;

      double s, c;

      double e = exp(ex1.x);

      sincos(ex1.y, &s, &c);

      comp_1_w.x = c * e;

      comp_1_w.y = s * e;

      return comp_1_w;

}

void legendre(int lmax,int l, float theta_dummy, float Lmn[11]);

void force_calc(float x11[147], float y11[147], float z11[147]);

//cuDoubleComplex my_complex_exp(cuDoubleComplex ex1);

int main()

{

int h;

float x11[147], y11[147], z11[147];

cuDoubleComplex ForceX[147],ForceY[147],ForceZ[147];

FILE* fp;

clock_t start, end;

double cpu_time_used;

start = clock();

/////////////////////////////////////first calc///////////////////////////////////////////////////////

fp = fopen(“XYZ1.txt”, “r”);

for(h=0;h<147;h++)

{

fscanf(fp, “%f”, &x11[h]);

}

for(h=147;h<294;h++)

{

fscanf(fp, “%f”, &y11[h-147]);

}

for(h=294;h<441;h++)

{

fscanf(fp, “%f”, &z11[h-294]);

}

fclose(fp);

// fp = fopen(“forces1.txt”, “w”);

// for(h=0;h<147;h++)

// {

// fprintf(fp, “%f %f %f\n”, ForceX[h], ForceY[h], ForceZ[h]);

// }

// fclose(fp);

force_calc(x11,y11,z11);

end = clock();

cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

printf(“calc took %lf seconds to execute \n”, cpu_time_used);

return 0;

}

void force_calc(float x11[147], float y11[147], float z11[147])

{

cuDoubleComplex ForceX[147],ForceY[147],ForceZ[147];

int natoms=147,etarad=9,etaang=5,lmax=10;

float pi=3.141592653589793;

float cons1[10][21];

float a011[60][30],a121[31][30],a231[30][1];

float rcut = 8.0;

float dx,dy,dz,r,d1,diff_1zr;

cuDoubleComplex term1,term2,term3,term4,func1,func2,rcutfun,Dcutfun,temp_term2,temp_term3,temp_term4;

float pcolhea1, fophi1, ficosec_theta1, dicos_theta1;

float derphy1,derphx1;

double dthetay1, dthetax1, dthetaz1;

cuDoubleComplex g1;

cuDoubleComplex g2,g3,g4;

cuDoubleComplex G_p;

cuDoubleComplex d_Gx_p,d_Gy_p,d_Gz_p;

float Lmn_all[147][10][11],Lmn[11];

float cons2;

double diff_Ytheta2;

cuDoubleComplex multf1, multf2, multf3;

int p,n,l,m,k1,j,k,h,s1,jj,j11,m_1,b_m,jj1,k2,m1;

float r2;

float sub1;

float rcut_inv = 0.125;

float Lmmnn;

//float d2Py[natoms],d2Pz[natoms];

cuDoubleComplex d2Px[natoms],d2Py[natoms],d2Pz[natoms];

int jt;

int k12, mm, b_mm, ll, ju;

float netar[9]={0.005,0.015,0.0230,0.038,0.060,0.090,0.150,0.260,0.480};

float netaa[5]={0.0028,0.0040,0.0110,0.0280,0.059};

cuDoubleComplex temp8;

float temp15,temp16,temp50,temp52;

float temp5[9];

double hv = -1;

cuDoubleComplex p_ch_p;

cuDoubleComplex tot_cnlm_sum_p[147][5][10][21];

cuDoubleComplex xt1,xt2,xt3,xt4,xt6;

cuDoubleComplex useY1;

cuDoubleComplex ex1;

cuDoubleComplex ex2,ex3;

cuDoubleComplex q;

cuDoubleComplex comp_1_w;

cuDoubleComplex mult1,mult4;

cuDoubleComplex sameder_xt,sameder_yt,sameder_zt;

cuDoubleComplex samediff_x,samediff_y,samediff_z;

//cuDoubleComplex temp4,temp6;

cuDoubleComplex temp12,temp14,temp17,temp18,temp19,temp20,temp21,temp22,temp23,temp24,temp25,temp26,temp27,temp28,temp30,temp40,temp41,temp42,temp43,temp44,temp45,temp46,temp51,temp53,temp54,temp55;

cuDoubleComplex diff_Ytheta1;

cuDoubleComplex diffY_phi;

cuDoubleComplex diffXY_phi;

cuDoubleComplex rcomp_der;

cuDoubleComplex dtheta_Y;

cuDoubleComplex dthetaX_Y;

cuDoubleComplex dthetaZ_Y;

q = make_cuDoubleComplex(0,1);

cuDoubleComplex derdiff_Ymainx;

cuDoubleComplex derdiffX_Ymainx;

cuDoubleComplex derdiffZ_Ymainx;

cuDoubleComplex derdiff_main1,derdiff_Ymain1,derdiff_Zmain1;

cuDoubleComplex der_main1,der_Ymain1,der_Zmain1;

cuDoubleComplex dpdx,dpdy,dpdz;

//////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////variable declaration for NN////////////////////////////////////////////////

int Lensym=59,LenHD2=30,LenHD1=30;

int nn,kk,i,z;

int b1;

cuDoubleComplex checkcmx[147][59],checkcmy[147][59],checkcmz[147][59];

cuDoubleComplex GiS_p[147][59];

cuDoubleComplex ress,ter;

cuDoubleComplex termm,resultt;

cuDoubleComplex alOj_x_p,y_p,Oj_y_p;

cuDoubleComplex AG_p,bkAG_p;

cuDoubleComplex Oj_x_p[147][30];

cuDoubleComplex GradHL5[147][30];

cuDoubleComplex GradHL6[147][30];

cuDoubleComplex GradHL24,SumJJx,SumJJy,SumJJz,tempx1,tempy1,tempz1,GradHL1,GradHL12,GradHL13,GradHL21,GradHL23,GradFx,GradFz,GradFy;

cuDoubleComplex gradientE[147][59];

cuDoubleComplex tt1,tt2,tt3,tt4;

float cons11[100];

FILE * fp1;

fp1 = fopen (“cons11.txt”, “r”);

int index = 0;

while(fscanf(fp1, “%f”,&cons11[index])==1)

  index++;

  fclose(fp1);

float meangtot_f[59];

FILE * fp2;

fp2 = fopen (“meangtot.txt”, “r”);

index = 0;

while(fscanf(fp2, “%f”,&meangtot_f[index])==1)

  index++;

  fclose(fp2);

float gtotmin_f[59];

FILE * fp3;

fp3 = fopen (“gtotmin.txt”, “r”);

index = 0;

while(fscanf(fp3, “%f”,&gtotmin_f[index])==1)

  index++;

  fclose(fp3);

float gtotmax_f[59];

FILE * fp4;

fp4 = fopen (“gtotmax.txt”, “r”);

index = 0;

while(fscanf(fp4, “%f”,&gtotmax_f[index])==1)

  index++;

  fclose(fp4);

float a01[1800];

FILE * fp5;

fp5 = fopen (“a01.txt”, “r”);

index = 0;

while(fscanf(fp5, “%f”,&a01[index])==1)

  index++;

  fclose(fp5);

float a12[930];

FILE * fp6;

fp6 = fopen (“a12.txt”, “r”);

index = 0;

while(fscanf(fp6, “%f”,&a12[index])==1)

  index++;

  fclose(fp6);

float a23[30];

FILE * fp7;

fp7 = fopen (“a23.txt”, “r”);

index = 0;

while(fscanf(fp7, “%f”,&a23[index])==1)

  index++;

  fclose(fp7);

for (l=0;l<10;l++)

{

for(m=0;m<21;m++)

{

if(m<2*l+1)

{

cons1[l][m] = cons11[(l*l)+m];

//printf("%d %d %.32f\n", l, m, cons1[l][m]);

}

else

{

cons1[l][m] = 0.0;

}

}

}

for(h=0; h<60; h++)

{

for(j=0; j<30; j++)

{

a011[h][j] = a01[h*30+j];

//printf("%f\n",a011[h][j]);

}

}

for(h=0; h<31; h++)

{

for(j=0; j<30; j++)

{

a121[h][j] = a12[h*30+j];

//printf("%f\n",a12[h*30+j]);

}

}

for(h=0; h<30; h++)

{

for(j=0; j<1; j++)

{

a231[h][j] = a23[h+j];

//printf("%f\n",a23[h+j]);

}

}

///start of main loop /////////////////////////////

//////////////Pre-calculation of tot_cnlm_sum_p for every atom/////////////////

for(h=0; h<natoms; h++)

{

for(jt=0; jt<natoms;jt++)

{

if(h != jt)

{

dx=(x11[h]-x11[jt]);

dy=(y11[h]-y11[jt]);

dz=(z11[h]-z11[jt]);

r2 =dxdx + dydy + dz*dz;

r=sqrtf(r2);

pcolhea1 = acosf(dz/r);

for (l=0;l<lmax;l++)

{

if (l==0)

{

Lmn[0]=1.0;

Lmn_all[jt][0][0] = 1.0;

}

else

{

legendre(lmax,l,pcolhea1,Lmn);

}

 for(b_m=0;b_m<19;b_m++)

{

if(b_m<2*l+1)

{

 m= -l + b_m;

 if (m >= 0)

 {

  Lmn_all[jt][l][m]= Lmn[m];

 }

}

}

}

}

}

cuDoubleComplex temp4,temp6;

//float temp5[9];

k2=0;

for(n=0;n<etaang;n++) // loop 15.2

{

for(ll=0;ll<lmax;ll++)

{

p_ch_p= make_cuDoubleComplex(0,0);

for(b_mm=0;b_mm<19;b_mm++)

{

 if(b_mm<2*ll+1)

 {

  xt2  = make_cuDoubleComplex(0,0);

  for(ju=0; ju<natoms;ju++)

  {

   if(h != ju)

   {

    dx=(x11[h]-x11[ju]);

    dy=(y11[h]-y11[ju]);

    dz=(z11[h]-z11[ju]);

    r2 =dx*dx + dy*dy + dz*dz;

    r=sqrtf(r2);

    fophi1= atan2f(dy,dx);

    if (r <= rcut)

    {

     rcutfun = make_cuDoubleComplex(0.5*(cosf(pi*r*rcut_inv)+1),0);

    }

    else

    {

     rcutfun = make_cuDoubleComplex(0,0);

    }

    //func1   = expf(-netaa[n]*r2);

temp50 = -netaa[n]*r2;

temp51 = make_cuDoubleComplex(temp50,0);

func1 = my_complex_exp (temp51);

    //term1   = func1*rcutfun;

term1   = cuCmul(func1,rcutfun);

    G_p    = term1;

    mm = -ll+b_mm;

    m_1 = abs(mm);

    if (mm<0)

    {

   cuDoubleComplex temp1 = cuCmul(q, make_cuDoubleComplex(-m_1,0));

   cuDoubleComplex ex1 = cuCmul(temp1, make_cuDoubleComplex(fophi1,0));

   cuDoubleComplex comp_1_w = my_complex_exp(ex1);

   /* __device__ cuDoubleComplex my_complex_exp(cuDoubleComplex ex1){

      cuDoubleComplex comp_1_w;

      double s, c;

      double e = exp(ex1.x);

      sincos(ex1.y, &s, &c);

      comp_1_w.x = c * e;

      comp_1_w.y = s * e;

      return comp_1_w;

    }*/

   //comp_1_w = cexpf(ex1);

    }

    else

    {

   cuDoubleComplex temp2 = cuCmul(q, make_cuDoubleComplex(m_1,0));

   cuDoubleComplex ex2 = cuCmul(temp2, make_cuDoubleComplex(fophi1,0));

   cuDoubleComplex ex3 = my_complex_exp (ex2);

   /* __device__ cuDoubleComplex my_complex_exp (cuDoubleComplex ex2){

        cuDoubleComplex ex3;

        double s, c;

        double e = exp(ex2.x);

        sincos(ex2.y, &s, &c);

        ex3.x = c * e;

        ex3.y = s * e;

        return ex3;

      }*/

   //ex3 = cexpf(ex2);

   float result = 1.0;

   int zz;

   for(zz=9; zz>=1; zz--)

   {

    if(zz<=mm)

    {

      result *= (-1.0);

    }

   }

     cuDoubleComplex comp_1_w = cuCmul(ex3, make_cuDoubleComplex(result, 0));

    }

     cuDoubleComplex mult1 = cuCmul(comp_1_w, make_cuDoubleComplex(cons1[11][b_mm], 0));

    Lmmnn = Lmn_all[ju][ll][m_1];

     useY1 = cuCmul(mult1, make_cuDoubleComplex(Lmmnn, 0));

     mult4 = cuCmul(useY1,G_p);

     xt2 = cuCadd(xt2,mult4);

    //xt2 = xt2 +  mult4;

   }

  }

  xt6 = cuConj(xt2);

  cuDoubleComplex temp3 = cuCmul(xt2,xt6);

  p_ch_p = cuCadd(p_ch_p,temp3);

  //p_ch_p = p_ch_p + (xt2 * xt6);

  //tot_cnlm_sum_r[(n*30870)+(ll*3087)+(b_mm*147)+h] = crealf(xt2);

  //tot_cnlm_sum_i[(n*30870)+(ll*3087)+(b_mm*147)+h] = cimagf(xt2);

  //memcpy((float*)(inpt+(n*30870)+(ll*3087)+(b_mm*147)+h), tot_cnlm_sum_r, 1*sizeof(float));

  //memcpy((float*)(inpt+154350+(n*30870)+(ll*3087)+(b_mm*147)+h), tot_cnlm_sum_i, 1*sizeof(float));

  tot_cnlm_sum_p[h][n][ll][b_mm] = xt2;

 }

}

sub1 = 1/(gtotmax_f[k2]-gtotmin_f[k2]);

cons2 = ((4*pi)/(float)(2*ll+1));

 temp4 = cuCmul(p_ch_p,make_cuDoubleComplex(cons2,0));

temp5[k2] = meangtot_f[k2] + gtotmin_f[k2];

temp6 = cuCsub(temp4, make_cuDoubleComplex(temp5[k2],0));

GiS_p[h][k2] = cuCmul(temp6, make_cuDoubleComplex(sub1,0));

//GiS_p[h][k2] = ((cons2 * p_ch_p)- meangtot_f[k2]-gtotmin_f[k2])*(sub1);

k2=k2+1;

}

}

}

//////////////Pre-calculation of GiS_p for every atom/////////////////

for(h=0; h<natoms; h++)

{

for(k=0;k<etarad;k++)

{

g1=make_cuDoubleComplex(0,0);

sub1 = 1/(gtotmax_f[k]-gtotmin_f[k]);

float sub2 = meangtot_f[k]+gtotmin_f[k];

for(j=0; j<natoms;j++)

{

if(h != j)

{

   dx=(x11[h]-x11[j]);

   dy=(y11[h]-y11[j]);

   dz=(z11[h]-z11[j]);

   r2 =dx*dx + dy*dy + dz*dz;

   r=sqrtf(r2);

   if (r <= rcut)

   {

    rcutfun = make_cuDoubleComplex(0.5*(cosf(pi*r*rcut_inv)+1),0);

   }

   else

   {

    rcutfun = make_cuDoubleComplex(0,0);

   }

   //func1   = expf(-netar[k]*r2);

 temp52 = -netar[k]*r2;

 temp53 = make_cuDoubleComplex(temp52,0);

 func1 = my_complex_exp(temp53);

   term1   = cuCmul(func1,rcutfun);

   g1 = cuCadd(g1,term1);

 }

}

 //temp8 = (g1-sub2)*(sub1);

temp8 = cuCmul(cuCsub(g1,make_cuDoubleComplex(sub2,0)),make_cuDoubleComplex(sub1,0));

GiS_p[h][k] = temp8;

//GiS_p[h][k] = (g1-sub2)*(sub1);

}

}

for(jj=0; jj<natoms; jj++)

{

for(i=0; i<30; i++)

{

resultt = make_cuDoubleComplex(0,0);

for(z=0; z<59; z++)

{

//termm = GiS_p[jj][z]*a011[z][i];

termm = cuCmul(GiS_p[jj][z],make_cuDoubleComplex(a011[z][i],0));

//resultt += termm;

resultt = cuCadd(resultt,termm);

}

AG_p = resultt;

//bkAG_p = AG_p + a011[Lensym][i];

bkAG_p = cuCadd(AG_p,make_cuDoubleComplex(a011[Lensym][i],0));

//Oj_x_p[jj][i] = (1.0 / ( 1.0 + expf(-bkAG_p)));

temp40 = cuCmul(bkAG_p,make_cuDoubleComplex(hv,0));

temp41 = my_complex_exp (temp40);

temp42 = cuCadd(temp41,make_cuDoubleComplex(1,0));

Oj_x_p[jj][i] = cuCdiv(make_cuDoubleComplex(1,0),temp42);

//GradHL5[jj][i] = (Oj_x_p[jj][i] *(1-Oj_x_p[jj][i]));

GradHL5[jj][i] = cuCmul(Oj_x_p[jj][i],cuCsub(make_cuDoubleComplex(1,0),Oj_x_p[jj][i]));

}

}

cuDoubleComplex Ei;

Ei = make_cuDoubleComplex(0,0);

for(jj=0; jj<natoms; jj++)

{

ress=make_cuDoubleComplex(0,0);

for(i=0; i<30; i++)

{

//resultt = 0.0;

resultt = make_cuDoubleComplex(0,0);

for(p=0; p<30; p++)

{

//termm = Oj_x_p[jj][p]*a121[p][i];

termm = cuCmul(Oj_x_p[jj][p],make_cuDoubleComplex(a121[p][i],0));

//resultt += termm;

resultt = cuCadd(resultt,termm);

}

alOj_x_p = resultt;

//y_p = (alOj_x_p + a121[LenHD1][i]);

y_p = cuCadd(alOj_x_p,make_cuDoubleComplex(a121[LenHD1][i],0));

//Oj_y_p = (1.0/(1.0+expf(-y_p)));

//Oj_x_p[jj][i] = (1.0 / ( 1.0 + expf(-bkAG_p)));

temp44 = cuCmul(y_p,make_cuDoubleComplex(hv,0));

temp45 = my_complex_exp (temp44);

temp46 = cuCadd(temp45,make_cuDoubleComplex(1,0));

Oj_y_p = cuCdiv(make_cuDoubleComplex(1,0),temp46);

//GradHL6[jj][i] = (Oj_y_p * (1-Oj_y_p));

GradHL6[jj][i] = cuCmul(Oj_y_p,cuCsub(make_cuDoubleComplex(1,0),Oj_y_p));

//ter = Oj_y_p*a231[i][0];

ter = cuCmul(Oj_y_p,make_cuDoubleComplex(a231[i][0],0));

ress = cuCadd(ress,ter);

}

//Ei += ress;

Ei = cuCadd(Ei,ress);

}

//```````````````````````````````````````````````````````````````

for(jj=0; jj<natoms; jj++)

{

for(z=0; z<Lensym; z++)

{

GradHL24 = make_cuDoubleComplex(0,0);

for (i=0; i<LenHD2; i++)

{

GradHL13 = make_cuDoubleComplex(0,0);

for(p=0; p<LenHD1; p++)

{

 //GradHL1  = (GradHL5[jj][p] * a011[z][p]);

GradHL1 = cuCmul(GradHL5[jj][p],make_cuDoubleComplex(a011[z][p],0));

 //GradHL12 = GradHL1   * a121[p][i];

GradHL12 = cuCmul(GradHL1,make_cuDoubleComplex(a121[p][i],0));

 //GradHL13 = GradHL13  + GradHL12;

GradHL13 = cuCadd(GradHL13,GradHL12);

}

 //GradHL21 = GradHL6[jj][i] * GradHL13;

GradHL21 = cuCmul(GradHL6[jj][i],GradHL13);

 //GradHL23 = GradHL21  * a231[i][0];

GradHL23 = cuCmul(GradHL21,make_cuDoubleComplex(a231[i][0],0));

 //GradHL24 = GradHL24  + GradHL23;

GradHL24 = cuCadd(GradHL24,GradHL23);

}

gradientE[jj][z] = GradHL24;

}

}

//////////////start of radial calc/////////////////

for(h=0; h<natoms; h++)

{

for(jt=0; jt<natoms;jt++)

{

if(h != jt)

{

dx=(x11[h]-x11[jt]);

dy=(y11[h]-y11[jt]);

dz=(z11[h]-z11[jt]);

r2 =dxdx + dydy + dz*dz;

r=sqrtf(r2);

pcolhea1 = acosf(dz/r);

for (l=0;l<lmax;l++)

{

if (l==0)

{

Lmn[0]=1.0;

Lmn_all[jt][0][0] = 1.0;

}

else

{

legendre(lmax,l,pcolhea1,Lmn);

}

for(b_m=0;b_m<19;b_m++)

{

if(b_m<2*l+1)

{

 m= -l + b_m;

 if (m >= 0)

 {

  Lmn_all[jt][l][m]= Lmn[m];

 }

}

}

}

}

}

for(k=0;k<etarad;k++)

{

g2=make_cuDoubleComplex(0,0);

g3=make_cuDoubleComplex(0,0);

g4=make_cuDoubleComplex(0,0);

sub1 = 1/(gtotmax_f[k]-gtotmin_f[k]);

for(j=0; j<natoms;j++)

{

if(h != j)

{

   dx=(x11[h]-x11[j]);

   dy=(y11[h]-y11[j]);

   dz=(z11[h]-z11[j]);

   r2 =dx*dx + dy*dy + dz*dz;

   r=sqrtf(r2);

   if (r <= rcut)

   {

    //rcutfun = 0.5*(cosf(pi*r*rcut_inv)+1);

rcutfun = make_cuDoubleComplex(0.5*(cosf(pi*r*rcut_inv)+1),0);

    //Dcutfun = 0.5*(pi*rcut_inv)*sinf(pi*r*rcut_inv);

Dcutfun = make_cuDoubleComplex(0.5*(pi*rcut_inv)*sinf(pi*r*rcut_inv),0);

   }

   else

   {

    //rcutfun = 0.0;

    //Dcutfun = 0.0;

rcutfun = make_cuDoubleComplex(0,0);

Dcutfun = make_cuDoubleComplex(0,0);

   }

   //func1   = expf(-netar[k]*r2);

 func1 = my_complex_exp(temp53);

   //func2   = (Dcutfun/r) + (2.0*rcutfun*netar[k]);

 temp54 = make_cuDoubleComplex(2.0*netar[k],0);

 temp55 = cuCmul(rcutfun,temp54);

 func2 = cuCadd(cuCdiv(Dcutfun,make_cuDoubleComplex(r,0)),temp55);

   //term2   = -func1*func2 * dx;

   //term3   = -func1*func2 * dy;

   //term4   = -func1*func2 * dz;

 

 temp_term2 = cuCmul(func1,func2);

 term2 = cuCmul(temp_term2,make_cuDoubleComplex(-dx,0));

  temp_term3 = cuCmul(func1,func2);

 term3 = cuCmul(temp_term3,make_cuDoubleComplex(-dy,0));

  temp_term4 = cuCmul(func1,func2);

 term4 = cuCmul(temp_term4,make_cuDoubleComplex(-dz,0));

   //checkcmx[j][k] = term2*(sub1);

   //checkcmy[j][k] = term3*(sub1);

   //checkcmz[j][k] = term4*(sub1);

 checkcmx[j][k] = cuCmul(term2,make_cuDoubleComplex(sub1,0));

 checkcmy[j][k] = cuCmul(term3,make_cuDoubleComplex(sub1,0));

 checkcmz[j][k] = cuCmul(term4,make_cuDoubleComplex(sub1,0));

 

   //g2 = g2 + term2;

   //g3 = g3 + term3;

   //g4 = g4 + term4;

 g2 = cuCadd(g2,term2);

 g3 = cuCadd(g3,term2);

 g4 = cuCadd(g4,term2);

 }

}

//checkcmx[h][k] = g2*(sub1);

//checkcmy[h][k] = g3*(sub1);

//checkcmz[h][k] = g4*(sub1);

checkcmx[h][k] = cuCmul(g2,make_cuDoubleComplex(sub1,0));

checkcmy[h][k] = cuCmul(g3,make_cuDoubleComplex(sub1,0));

checkcmz[h][k] = cuCmul(g4,make_cuDoubleComplex(sub1,0));

}

//////////////////end of radial calc////////////////////////////////////////

int n1=9;

for(n=0;n<etaang;n++)

{

for(l =0; l<lmax; l++)

{

sameder_xt = make_cuDoubleComplex(0,0);

sameder_yt = make_cuDoubleComplex(0,0);

sameder_zt = make_cuDoubleComplex(0,0);

//sameder_xt = 0.0+0.0*I;

//sameder_yt = 0.0+0.0*I;

//sameder_zt = 0.0+0.0*I;

sub1 = 1/(gtotmax_f[n1]-gtotmin_f[n1]);

for (jj = 0; jj<natoms; jj++)

{

  //d2Px[jj] = 0.0;

 d2Px[jj] = make_cuDoubleComplex(0,0);

  d2Py[jj] = make_cuDoubleComplex(0,0);

  d2Pz[jj] = make_cuDoubleComplex(0,0);

}

for(b_m = 0; b_m<19; b_m++)

{

 if(b_m<2*l+1)

 {

  samediff_x = make_cuDoubleComplex(0,0);

  samediff_y = make_cuDoubleComplex(0,0);

  samediff_z = make_cuDoubleComplex(0,0);

  //samediff_x = 0.0+0.0*I;

  //samediff_y = 0.0+0.0*I;

  //samediff_z = 0.0+0.0*I;

  xt3 = make_cuDoubleComplex(0,0);

  //xt3  = 0.0+0.0*I;

  //memcpy(tot_cnlm_r, (const float*)(inpt+(n*30870)+(l*3087)+(b_m*147)), 147*sizeof(float));

  //memcpy(tot_cnlm_i, (const float*)(inpt+154350+(n*30870)+(l*3087)+(b_m*147)), 147*sizeof(float));

  for(j=0; j<natoms;j++)

  {

   if(h != j)

   {

    j11 = j;

    dx=(x11[h]-x11[j]);

    dy=(y11[h]-y11[j]);

    dz=(z11[h]-z11[j]);

    r2 =dx*dx + dy*dy + dz*dz;

    r=sqrtf(r2);

    pcolhea1 = acosf(dz/r);

    fophi1= atan2f(dy,dx);

    dicos_theta1 = cosf(pcolhea1);

    ficosec_theta1 = 1/(sinf(pcolhea1));

    d1 = dx*dx + dy*dy;

    diff_1zr = dz/(r2 * sqrtf(d1));

    dthetax1 = diff_1zr * dx;

    dthetay1 = diff_1zr * dy;

    dthetaz1 = -sqrtf(d1)/(r2);

    derphx1 = -dy/d1;

    derphy1 =  dx/d1;

    if (r <= rcut)

    {

     //rcutfun = 0.5*(cosf(pi*r*rcut_inv)+1);

 rcutfun = make_cuDoubleComplex(0.5*(cosf(pi*r*rcut_inv)+1),0);

     //Dcutfun = 0.5*(pi*rcut_inv)*sinf(pi*r*rcut_inv);

 Dcutfun = make_cuDoubleComplex(0.5*(pi*rcut_inv)*sinf(pi*r*rcut_inv),0);

    }

    else

    {

     rcutfun = make_cuDoubleComplex(0,0);

     Dcutfun = make_cuDoubleComplex(0,0);

    }

    //func1   = expf(-netaa[n]*r2);

func1 = my_complex_exp(make_cuDoubleComplex(temp50,0));

    term1   = cuCmul(func1,rcutfun);

    //func2   = (Dcutfun/r + 2.0*rcutfun*netaa[n]);

temp54 = make_cuDoubleComplex(2.0*netaa[n],0);

 temp55 = cuCmul(rcutfun,temp54);

 func2 = cuCadd(cuCdiv(Dcutfun,make_cuDoubleComplex(r,0)),temp55);

    //term2   = -func1*func2 * dx;

    //term3   = -func1*func2 * dy;

    //term4   = -func1*func2 * dz;

temp_term2 = cuCmul(func1,func2);

 term2 = cuCmul(temp_term2,make_cuDoubleComplex(-dx,0));

  temp_term3 = cuCmul(func1,func2);

 term3 = cuCmul(temp_term3,make_cuDoubleComplex(-dy,0));

  temp_term4 = cuCmul(func1,func2);

 term4 = cuCmul(temp_term4,make_cuDoubleComplex(-dz,0));

    G_p    = term1;

    d_Gx_p = term2;

    d_Gy_p = term3;

    d_Gz_p = term4;

    m= -l + b_m;

    m_1=abs(m);

    if (m<0)

    {

      //ex1 = (-q * m_1 * fophi1);

     cuDoubleComplex temp11 = cuCmul(q, make_cuDoubleComplex(-m_1,0));

     cuDoubleComplex ex1 = cuCmul(temp11, make_cuDoubleComplex(fophi1,0));

      //comp_1_w = cexpf(ex1);

     cuDoubleComplex comp_1_w = my_complex_exp(ex1);

      //rcomp_der = -q*m_1* comp_1_w;

     temp30 = cuCmul(q, make_cuDoubleComplex(hv,0));

     temp12 = cuCmul(temp30, make_cuDoubleComplex(m_1,0));

     rcomp_der = cuCmul(temp12,comp_1_w);

    }

    else

    {

    //ex2 = (q * m_1 *  fophi1);

    cuDoubleComplex temp13 = cuCmul(q, make_cuDoubleComplex(m_1,0));

    cuDoubleComplex ex2 = cuCmul(temp13, make_cuDoubleComplex(fophi1,0));

    //ex3 = cexpf(ex2);

    cuDoubleComplex ex3 = my_complex_exp(ex2);

    float result = 1;

    for(int zz=9; zz>=1; zz--)

    {

      if(zz<=m)

      {

         result *= (-1);

      }

    }

    //comp_1_w= result * ex3;

    comp_1_w = cuCmul(ex3, make_cuDoubleComplex(result,0));

      //rcomp_der = q*m_1* comp_1_w;

    temp14 = cuCmul(q, make_cuDoubleComplex(m_1,0));

    rcomp_der = cuCmul(temp14,comp_1_w);

    }

    //mult1 = cons1[l][b_m]*comp_1_w;

    mult1 = cuCmul(comp_1_w,make_cuDoubleComplex(cons1[l][b_m],0));

    Lmmnn = Lmn_all[j][l][m_1];

    //useY1 = mult1*Lmmnn;

    useY1 = cuCmul(mult1, make_cuDoubleComplex(Lmmnn,0));

    //mult4 = (G_p*useY1);

    mult4 = cuCmul(useY1,G_p);

    //xt3 = xt3 +  mult4;

    xt3 = cuCadd(xt3,mult4);

    //diff_Ytheta1 = mult1*ficosec_theta1;

    diff_Ytheta1 = cuCmul(mult1, make_cuDoubleComplex(ficosec_theta1,0));

    //diffY_phi       = cons1[l][b_m]*Lmmnn*rcomp_der*derphx1;

    temp15 = cons1[l][b_m]*Lmmnn*derphx1;

    diffY_phi = cuCmul(rcomp_der, make_cuDoubleComplex(temp15,0));

    //diffXY_phi = cons1[l][b_m]*Lmmnn*rcomp_der*derphy1;

    temp16 =  cons1[l][b_m]*Lmmnn*derphy1;

    diffXY_phi = cuCmul(rcomp_der, make_cuDoubleComplex(temp16,0));

    if (m_1 <= l-1)

    {

      if (l == 0)

      {

       diff_Ytheta2 =0.0;

      }

      else

      {

       diff_Ytheta2 =(l*dicos_theta1*Lmmnn)-((l+m_1)*Lmn_all[j][l-1][m_1]);

      }

    }

    else

    {

      diff_Ytheta2 = l*dicos_theta1*Lmmnn;

    }

    

    //dtheta_Y  = (diff_Ytheta1*diff_Ytheta2)*dthetax1;

    temp17 = cuCmul(diff_Ytheta1,make_cuDoubleComplex(diff_Ytheta2,0));

    dtheta_Y = cuCmul(temp17,make_cuDoubleComplex(dthetax1,0));

    //dthetaX_Y = (diff_Ytheta1*diff_Ytheta2)*dthetay1;

    temp18 = cuCmul(diff_Ytheta1,make_cuDoubleComplex(diff_Ytheta2,0));

    dtheta_Y = cuCmul(temp18,make_cuDoubleComplex(dthetay1,0));

    //dthetaZ_Y = (diff_Ytheta1*diff_Ytheta2)*dthetaz1;

    temp19 = cuCmul(diff_Ytheta1,make_cuDoubleComplex(diff_Ytheta2,0));

    dthetaZ_Y = cuCmul(temp19,make_cuDoubleComplex(dthetaz1,0));

   //derdiff_Ymainx =  diffY_phi+dtheta_Y;

   derdiff_Ymainx =  cuCadd(diffY_phi,dtheta_Y);

   //derdiffX_Ymainx = diffXY_phi+dthetaX_Y;

   derdiffX_Ymainx = cuCadd(diffXY_phi,dthetaX_Y);        

   derdiffZ_Ymainx = dthetaZ_Y;        

    //derdiff_main1 = G_p* derdiff_Ymainx    +  d_Gx_p*useY1;

    derdiff_main1 = cuCadd(cuCmul(derdiff_Ymainx,G_p),cuCmul(useY1,d_Gx_p));

    //derdiff_Ymain1= G_p* derdiffX_Ymainx   +  d_Gy_p*useY1;

    derdiff_Ymain1 = cuCadd(cuCmul(derdiffX_Ymainx,G_p),cuCmul(useY1,d_Gy_p));

    //derdiff_Zmain1= G_p* derdiffZ_Ymainx   +  d_Gz_p*useY1;

    derdiff_Zmain1 = cuCadd(cuCmul(derdiffZ_Ymainx,G_p),cuCmul(useY1,d_Gz_p));

    //samediff_x = samediff_x + derdiff_main1;

    samediff_x = cuCadd(samediff_x,derdiff_main1);

    //samediff_y = samediff_y + derdiff_Ymain1;

    samediff_y = cuCadd(samediff_y,derdiff_Ymain1);

    //samediff_z = samediff_z + derdiff_Zmain1;

    samediff_z = cuCadd(samediff_z,derdiff_Zmain1);

    if (l==0 || (l%2)==0)

    {

     der_main1 = derdiff_main1;

     der_Ymain1 = derdiff_Ymain1;

     der_Zmain1 = derdiff_Zmain1;

    }

    else

    {

     //der_main1 = (-derdiff_main1);

     der_main1 = cuCmul(derdiff_main1, make_cuDoubleComplex(hv, 0));

     //der_Ymain1 = (-derdiff_Ymain1);

     der_Ymain1 = cuCmul(derdiff_Ymain1, make_cuDoubleComplex(hv, 0));   

     //der_Zmain1 = (-derdiff_Zmain1);

     der_Zmain1 = cuCmul(derdiff_Zmain1, make_cuDoubleComplex(hv, 0));

    }

    xt2=tot_cnlm_sum_p[j11][n][l][b_m];

    //xt2 = (tot_cnlm_r[(n*30870)+(l*3087)+(b_m*147)+j11])+(tot_cnlm_i[(n*30870)+(l*3087)+(b_m*147)+j11]*I);

    xt1=cuConj(xt2);

    dpdx = cuCadd(cuCmul(xt1,der_main1),cuCmul(xt2,cuConj(der_main1)));

    //dpdx = (xt1 * der_main1 + xt2 * conj(der_main1));

    dpdy = cuCadd(cuCmul(xt1,der_Ymain1),cuCmul(xt2,cuConj(der_Ymain1)));

    //dpdy = (xt1 * der_Ymain1+ xt2 * conj(der_Ymain1));

    dpdz = cuCadd(cuCmul(xt1,der_Zmain1),cuCmul(xt2,cuConj(der_Zmain1)));

    //dpdz = (xt1 * der_Zmain1+ xt2 * conj(der_Zmain1));

    //d2Px[j] = d2Px[j] + dpdx;

    d2Px[j] = cuCadd(dpdx,d2Px[j]);

    //d2Py[j] = d2Py[j] + dpdy;

    d2Py[j] = cuCadd(dpdy,d2Py[j]);

    //d2Pz[j] = d2Pz[j] + dpdz;

    d2Pz[j] = cuCadd(dpdy,d2Pz[j]);

    cons2 = ((4*pi)/(float)(2*l+1));

    //multf1 = (cons2*d2Px[j]);

    multf1 = cuCmul(d2Px[j],make_cuDoubleComplex(cons2,0));

    //multf2 = (cons2*d2Py[j]);

    multf2 = cuCmul(d2Py[j],make_cuDoubleComplex(cons2,0));

    //multf3 = (cons2*d2Pz[j]);

    multf3 = cuCmul(d2Pz[j],make_cuDoubleComplex(cons2,0));

   

    //checkcmx[j][n1] = (multf1)*(sub1);

    //checkcmy[j][n1] = (multf2)*(sub1);

    //checkcmz[j][n1] = (multf3)*(sub1);

    checkcmx[j][n1] = cuCmul(multf1,make_cuDoubleComplex(sub1,0));

    checkcmy[j][n1] = cuCmul(multf2,make_cuDoubleComplex(sub1,0));

    checkcmz[j][n1] = cuCmul(multf3,make_cuDoubleComplex(sub1,0));

   }

  }

  //xt3=tot_cnlm_sum_p[h][n][l][b_m];

  //xt4=conj(xt3);

  xt4 = cuConj(xt3);

  //sameder_xt = sameder_xt + (xt4 * samediff_x + xt3 * conj(samediff_x));

  temp20 = cuCmul(xt3,cuConj(samediff_x));

  temp21 = cuCmul(xt4,samediff_x);

  temp22 = cuCadd(temp20,temp21);

  sameder_xt = cuCadd(sameder_xt,temp22);

  //sameder_yt = sameder_yt + (xt4 * samediff_y + xt3 * conj(samediff_y));

  temp23 = cuCmul(xt3,cuConj(samediff_y));

  temp24 = cuCmul(xt4,samediff_y);

  temp25 = cuCadd(temp23,temp24);

  sameder_yt = cuCadd(sameder_yt,temp25);

  //sameder_zt = sameder_zt + (xt4 * samediff_z + xt3 * conj(samediff_z));

  temp26 = cuCmul(xt3,cuConj(samediff_z));

  temp27 = cuCmul(xt4,samediff_z);

  temp28 = cuCadd(temp26,temp27);

  sameder_zt = cuCadd(sameder_zt,temp28);

 }

}

cons2 = ((4*pi)/(float)(2*l+1));

//multf1 = (cons2*sameder_xt);

//multf2 = (cons2*sameder_yt);

//multf3 = (cons2*sameder_zt);

multf1 = cuCmul(sameder_xt,make_cuDoubleComplex(cons2,0));

multf2 = cuCmul(sameder_yt,make_cuDoubleComplex(cons2,0));

multf3 = cuCmul(sameder_zt,make_cuDoubleComplex(cons2,0));



//checkcmx[h][n1] = (multf1)*(sub1);

//checkcmy[h][n1] = (multf2)*(sub1);

//checkcmz[h][n1] = (multf3)*(sub1);

checkcmx[h][n1] = cuCmul(multf1,make_cuDoubleComplex(sub1,0));

checkcmy[h][n1] = cuCmul(multf2,make_cuDoubleComplex(sub1,0));

checkcmz[h][n1] = cuCmul(multf3,make_cuDoubleComplex(sub1,0));

n1=n1+1;

}

}

///////////////////////////////energy-force calc starts from here//////////////////////////////

SumJJx = make_cuDoubleComplex(0,0);

SumJJy = make_cuDoubleComplex(0,0);

SumJJz = make_cuDoubleComplex(0,0);

//SumJJx=0.0;

//SumJJy=0.0;

//SumJJz=0.0;

for(jj=0; jj<natoms; jj++)

{

//tempx1 = 0.0;

//tempy1 = 0.0;

//tempz1 = 0.0;

tempx1 = make_cuDoubleComplex(0,0);

tempy1 = make_cuDoubleComplex(0,0);

tempz1 = make_cuDoubleComplex(0,0);

for (z=0; z<Lensym; z++)

{

tt1 = gradientE[jj][z];

tt2 = checkcmx[jj][z];

tt3 = checkcmy[jj][z];

tt4 = checkcmz[jj][z];

//GradFx = tt1 * tt2;

//GradFy = tt1 * tt3;

//GradFz = tt1 * tt4;

GradFx = cuCmul(tt1,tt2);

GradFy = cuCmul(tt1,tt3);

GradFz = cuCmul(tt1,tt4);

//tempx1 = tempx1 + GradFx;

//tempy1 = tempy1 + GradFy;

//tempz1 = tempz1 + GradFz;

tempx1 = cuCadd(tempx1,GradFx);

tempy1 = cuCadd(tempy1,GradFy);

tempz1 = cuCadd(tempz1,GradFz);

}

//SumJJx = SumJJx + tempx1;

//SumJJy = SumJJy + tempy1;

//SumJJz = SumJJz + tempz1;

SumJJx = cuCadd(SumJJx,tempx1);

SumJJy = cuCadd(SumJJy,tempy1);

SumJJz = cuCadd(SumJJz,tempz1);

}

ForceX[h] = cuCmul(SumJJx,make_cuDoubleComplex(hv,0));

ForceY[h] = cuCmul(SumJJy,make_cuDoubleComplex(hv,0));

ForceZ[h] = cuCmul(SumJJz,make_cuDoubleComplex(hv,0));

//ForceX[h] = -SumJJx;

//ForceY[h] = -SumJJy;

//ForceZ[h] = -SumJJz;

printf("%lf %lf %lf\n",ForceX[h],ForceY[h],ForceZ[h]);

//printf("%f + i%f\n", creal(ForceX[h]), cimag(ForceX[h]));

//printf("%lf + i%lf\t %lf + i%lf\t %lf + i%lf\n",ForceX[h].x,ForceX[h].y,ForceY[h].x,ForceY[h].y,ForceZ[h].x,ForceZ[h].y);

}

//end = clock();

//cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

//printf(“fun() took %lf seconds to execute \n”, cpu_time_used);

}//end of main function

//start of functions//

void legendre(int lmax, int l, float theta_dummy, float Lmn[11])

{

int u,i,size_x,jj,hh,ind,nind;

int m_5,j;

float c,tol;

float htol=2.2251e-308;

float inf = 1e+10;

float x,rootn[21],twocot;

float s,P[11],sn;

float tmp1,tmp2,tmp3;

float trm1, trm2, trm3, trm4, trm5;

float t,t11;

for (i=0;i<11;i++)

{

Lmn[i]=0.0;

}

x=cosf(theta_dummy);

float vec_u[21] = {0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0};

for (u = 0; u<21; u++)

{

rootn[u] = 0.0;

}

for (u = 0; u<19; u++)

{

if (u<2*l+1){

 rootn[u] = sqrtf(vec_u[u]);}

}

s = sqrtf(1-(x*x));

for (i = 0; i<9; i++)

{

if (i<l){

 P[i] = 0.0;}

}

twocot = x;

if(x == -1)

{

twocot = inf;

}

else if(x == 1)

{

twocot = -inf;

}

else if(x != 0)

{

twocot = -2*x/s;

}

sn = 0.0;

float xyz=1.0;

int sss;

for(sss=1; sss<=9; sss++)

{

if(sss<=l){

 xyz *= (-s);}

}

sn = xyz;

tol = sqrtf(htol);

hh=0;

nind=0;

if ((x != 1) && ((float)(fabsf(sn)) >= tol))

{

nind = hh+1;

hh = hh + 1;

}

if (hh > 0)

{

float d[15] = {0.5,0.25,0.166666667,0.125,0.1,0.083333333,0.071428571,0.0625,0.055555556,0.05,0.045454545,0.041666667,0.038461538,0.035714286,0.033333333};

float dd;

c = 1;

for (i =0; i<9; i++)

{

if (i<l){

  //dd = (1-1/d[i]);

  dd = (1-d[i]);

  c *= dd;

}

}

P[l] = sqrtf(c)*sn; //10

P[l-1] = P[l]twocotl/rootn[2*l]; //9

tmp1 = P[l];

tmp2 = P[l-1];

for (m_5 = 7; m_5>=0; m_5+=-1)

{

if (m_5<=l-2)

{

  trm1 = tmp2*twocot*(m_5+1);

  trm2 = tmp1*rootn[l+m_5+2];

  trm3 = rootn[l-m_5-1];

  trm4 = rootn[l+m_5+1];

  trm5 = rootn[l-m_5];

  tmp3 = (trm1-trm2*trm3)/(trm4*trm5);

  P[m_5] = tmp3;

  tmp1 = tmp2;

  tmp2 = tmp3;

  }

}

}

for (i=0; i<10; i++)

{

if (i<l+1)

{

  t = P[i];

  Lmn[i] = t;

}

}

float abc=1.0;

int uuu;

for(uuu=1; uuu<=9; uuu++){

if(uuu<=l){

abc *= x;}

}

if (s == 0)

{

  Lmn[0] = abc;

}

for (m_5=1; m_5<=8; m_5++)

{

if (m_5<=(l-1))

{

   c = 1;

   for (j = 1; j<=16; j++)

   {

       if (j<=2*m_5)

       {

          c*=rootn[l-m_5+2+j-2];

       }

   }

   t = Lmn[m_5];

   t11 = c*t;

   Lmn[m_5] = t11;

}

}

c = 1;

for (i = 0; i<18; i++)

{

  if (i<2*l)

  {

  c *= rootn[i+1];

  }

}

Lmn[l] *= c;

}

And I am getting output as follows:

Please anyone please suggest what changes or modifications can I do for getting correct output. I have tried a lot. Thank you so much in advance

[1] Posting a thousand lines of code for others to look at is usually not a good idea. If you came across such a posting, would you feel compelled to take a look at it? Why not?

[2] The code as posted did not compile successfully.

[3] After fixing some obvious issues I noticed that the compiler complains about a subscript out of range:

float cons1[10][21];
[...]
cuDoubleComplex mult1 = cuCmul(comp_1_w, make_cuDoubleComplex(cons1[11][b_mm], 0));

[4] The compiler warns about this assignment, which I am guessing is from a double-precision version of the code. As is, this equivalent to float htol = 0.0f; which is probably not what was intended.
float htol=2.2251e-308;

[5] The constituent parts of a cuDoubleComplex are of type double. You can print double with the %f or %g or %e format specifiers of printf(). My usual default is % 23.16e.

[6] Generally it is a good idea to turn on all warnings (e.g. -Wall -Werror) and fix all issues the compiler complains about.

[7] You should be able to figure where things go wrong by using standard debugging techniques. Personally, because I am a dinosaur who spent some time developing on embedded systems with bare bones functionality (think serial port access to a console) I am used to keep things very simple and simply instrument code with printf() that print some key data as the processing progresses and log that to a file. Clearly at some point some the data will start to deviate from what is expected. Then I work backwards in more detail from there until I have identified the programming step that is the root cause of the problem.

Modern programmers typically avail themselves of the powerful features of a modern debugger, so that is something you may want to get acquainted with.

@njuffa [1] A very big thanks to you for taking out your time to see my code and giving valuable suggestions and yeah I posted whole code which was not a good idea.
[2] Actually I am using google colab for running this cuda code and this code is successfully being compiled there.
[3] About the point [4] which you mentioned that float htol is not assigned the intented value of 2.2251e-308, then can you please suggest any modification that can be done in order to assign it the correct value?
[4] I also tried using . operator for accessing ForceX values by writing following line of code:

printf("%lf + i%lf\n",ForceX[h].x,ForceX[h].y);
But still I am getting output as zero which is attached below:

So final results are not what you expected. Not an uncommon occurrence. Time to start debugging the code. I would suggest starting at the beginning: Is the data being fed into this program the correct data? Then are multiple processing steps. Is intermediate data after each step what is expected? If not, start looking at the details of the processing step that failed until you have identified the problem.

@njuffa Thank you so much for your response.
Yes, I have started debugging the code. I am printing the values to see if all the intermediate data is correct or not.
Thanks again