Hi,everyone
I’m trying to accelerate my c++ program by OpenACC. But i have trouble in accelerating a complex loop.Here is my code
void f_relaxation()
{
double Sf[nf]={0.}, Rfeq[nf]={0.}, gg[nf]={0.}, ff[nf]={0.};
int i,j,k;
Sf[0]=1.0, Sf[3]=1.0, Sf[5]=1.0, Sf[7]=1.0; //issue one :Sf prevents parallelization
Sf[1]=1.6, Sf[2]=1.2;
Sf[14]=1.2;
#pragma acc kernels present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
copyin(Sf)
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(i=0; i<im; i++) //line 272
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(j=0; j<jm; j++)
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(k=0; k<km; k++)
{
Sf[4]=3.*(2.-tol[i][j][k])/(3.-tol[i][j][k]), Sf[6]=Sf[4], Sf[8]=Sf[4];
Sf[9]=omg[i][j][k], Sf[10]=Sf[9], Sf[11]=Sf[9], Sf[12]=Sf[9], Sf[13]=Sf[9];
//---------------relaxation--------------//
Rfeq[0]=prs[i][j][k];
Rfeq[1]=-prs[i][j][k]+rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))+3.*C[i][j][k]*muf[i][j][k];
Rfeq[2]=prs[i][j][k]-5.*rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))-45.*C[i][j][k]*muf[i][j][k];
Rfeq[3]=rho[i][j][k]*ux[i][j][k], Rfeq[4]=-7.*Rfeq[3]/3.;
Rfeq[5]=rho[i][j][k]*uy[i][j][k], Rfeq[6]=-7.*Rfeq[5]/3.;
Rfeq[7]=rho[i][j][k]*uz[i][j][k], Rfeq[8]=-7.*Rfeq[7]/3.;
Rfeq[9]=rho[i][j][k]*(2.*pow(ux[i][j][k],2)-pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
Rfeq[10]=rho[i][j][k]*(pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
Rfeq[11]=rho[i][j][k]*ux[i][j][k]*uy[i][j][k];
Rfeq[12]=rho[i][j][k]*uy[i][j][k]*uz[i][j][k];
Rfeq[13]=rho[i][j][k]*ux[i][j][k]*uz[i][j][k];
Rfeq[14]=0.0;
for(int kf=0; kf<nf; kf++) {ff[kf]=0.0;}
double drho_x=CDx_cal(i,j,k,rho);
double drho_y=CDy_cal(i,j,k,rho);
double drho_z=CDz_cal(i,j,k,rho);
double dC_x=CDx_cal(i,j,k,C);
double dC_y=CDy_cal(i,j,k,C);
double dC_z=CDz_cal(i,j,k,C); //dC_x,dC_y.dC_z:scalar dependence
double dmuf_x=CDx_cal(i,j,k,muf);
double dmuf_y=CDy_cal(i,j,k,muf);
double dmuf_z=CDz_cal(i,j,k,muf);
double dmob_x=CDx_cal(i,j,k,mob);
double dmob_y=CDy_cal(i,j,k,mob);
double dmob_z=CDz_cal(i,j,k,mob);
double d2muf=D2_cal(i,j,k,muf);
double diffusion=dmob_x*dmuf_x+dmob_y*dmuf_y+dmob_z*dmuf_z+mob[i][j][k]*d2muf;
double uxyz=pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2);
for(int kf=0; kf<nf; kf++)
{
double eu=ex[kf]*ux[i][j][k]+ey[kf]*uy[i][j][k]+ez[kf]*uz[i][j][k];
gama[i][j][k][kf]=w[kf]*(1.+3.*eu+4.5*eu*eu-1.5*uxyz);
double drho_dir=CD_cal_dir(i,j,k,kf,rho);
double dC_dir=CD_cal_dir(i,j,k,kf,C);
double term_rho=drho_dir-(ux[i][j][k]*drho_x+uy[i][j][k]*drho_y+uz[i][j][k]*drho_z)*dt;
term_rho*=(gama[i][j][k][kf]-w[kf])/3.;
double term_C=dC_dir-(ux[i][j][k]*dC_x+uy[i][j][k]*dC_y+uz[i][j][k]*dC_z)*dt;
if(k==0)
{
int i1=ni[i][j][k][0], i_1=ni[i][j][k][1];
int j1=nj[i][j][k][0], j_1=nj[i][j][k][1];
dC_x=(C[i1][j][k]-C[i_1][j][k])/(2.*ds); //line 322
dC_y=(C[i][j1][k]-C[i][j_1][k])/(2.*ds);
dC_z=omg_wtg/kappa*C[i][j][k]*(1.-C[i][j][k]); //issue two: dC_x,dC_y,dC_z scalar dependence
term_C=((ex[kf]-ux[i][j][k])*dC_x+(ey[kf]-uy[i][j][k])*dC_y+(ez[kf]-uz[i][j][k])*dC_z)*dt;
}
term_C*=gama[i][j][k][kf];
double term_Fa=-(rho_L-rho_G)*diffusion*((ex[kf]-ux[i][j][k])*ux[i][j][k]+(ey[kf]-uy[i][j][k])*uy[i][j][k]+(ez[kf]-uz[i][j][k])*uz[i][j][k])*dt;
term_Fa*=gama[i][j][k][kf];
gg[kf]=3.*(term_rho+muf[i][j][k]*term_C+term_Fa);
}
ff[0]=gg[0]+gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6]+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[1]=-2.*gg[0]-(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[2]=16.*gg[0]-4.*(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[3]=gg[1]-gg[2]+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
ff[4]=-4.*(gg[1]-gg[2])+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
ff[5]=gg[3]-gg[4]+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
ff[6]=-4.*(gg[3]-gg[4])+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
ff[7]=gg[5]-gg[6]+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
ff[8]=-4.*(gg[5]-gg[6])+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
ff[9]=2.*(gg[1]+gg[2])-(gg[3]+gg[4]+gg[5]+gg[6]);
ff[10]=gg[3]+gg[4]-gg[5]-gg[6];
ff[11]=gg[7]+gg[8]-gg[9]-gg[10]+gg[11]+gg[12]-gg[13]-gg[14];
ff[12]=gg[7]+gg[8]+gg[9]+gg[10]-gg[11]-gg[12]-gg[13]-gg[14];
ff[13]=gg[7]+gg[8]-gg[9]-gg[10]-gg[11]-gg[12]+gg[13]+gg[14];
ff[14]=gg[7]-gg[8]-gg[9]+gg[10]+gg[11]-gg[12]-gg[13]+gg[14];
for(int kf=0; kf<nf; kf++)
{Rf[i][j][k][kf]=Rf[i][j][k][kf]*(1.0-Sf[kf])+Rfeq[kf]*Sf[kf]+(1.-0.5*Sf[kf])*ff[kf]*dt;}
}
And the compiler’s feedback:
Loop carried dependence due to exposed use of Sf[:] prevents parallelization
274, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
276, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
Complex loop carried dependence of Sf prevents parallelization
.......
312, Loop carried scalar dependence for dC_x at line 322
Loop carried scalar dependence for dC_z at line 322
Loop carried scalar dependence for dC_y at line 322
in this subroutine, the sf[:] is a coefficient which should be assigned at the beginning of the loop and needed to calculate the value of Rf in the end. And i’m not sure if the dC_x: scalar dependence would lead to a wrong parallel result since i only want to parallelize the outer(i,j,k) loops .
Do you have any good idea to fix this out?
Thank you!