# Question on how to accelerate a complex loop

Hi,everyone
I’m trying to accelerate my c++ program by OpenACC. But i have trouble in accelerating a complex loop.Here is my code

``````void f_relaxation()
{
double  Sf[nf]={0.}, Rfeq[nf]={0.}, gg[nf]={0.}, ff[nf]={0.};
int i,j,k;

Sf[0]=1.0, Sf[3]=1.0, Sf[5]=1.0, Sf[7]=1.0;    //issue one :Sf prevents parallelization
Sf[1]=1.6, Sf[2]=1.2;
Sf[14]=1.2;

#pragma acc kernels present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
copyin(Sf)
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(i=0; i<im; i++)                                                 //line 272
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(j=0; j<jm; j++)
#pragma acc loop private(Rfeq,gg,ff)//,independent
for(k=0; k<km; k++)
{
Sf[4]=3.*(2.-tol[i][j][k])/(3.-tol[i][j][k]), Sf[6]=Sf[4], Sf[8]=Sf[4];
Sf[9]=omg[i][j][k], Sf[10]=Sf[9], Sf[11]=Sf[9], Sf[12]=Sf[9], Sf[13]=Sf[9];
//---------------relaxation--------------//
Rfeq[0]=prs[i][j][k];
Rfeq[1]=-prs[i][j][k]+rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))+3.*C[i][j][k]*muf[i][j][k];
Rfeq[2]=prs[i][j][k]-5.*rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))-45.*C[i][j][k]*muf[i][j][k];
Rfeq[3]=rho[i][j][k]*ux[i][j][k], Rfeq[4]=-7.*Rfeq[3]/3.;
Rfeq[5]=rho[i][j][k]*uy[i][j][k], Rfeq[6]=-7.*Rfeq[5]/3.;
Rfeq[7]=rho[i][j][k]*uz[i][j][k], Rfeq[8]=-7.*Rfeq[7]/3.;
Rfeq[9]=rho[i][j][k]*(2.*pow(ux[i][j][k],2)-pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
Rfeq[10]=rho[i][j][k]*(pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
Rfeq[11]=rho[i][j][k]*ux[i][j][k]*uy[i][j][k];
Rfeq[12]=rho[i][j][k]*uy[i][j][k]*uz[i][j][k];
Rfeq[13]=rho[i][j][k]*ux[i][j][k]*uz[i][j][k];
Rfeq[14]=0.0;

for(int kf=0; kf<nf; kf++) {ff[kf]=0.0;}
double drho_x=CDx_cal(i,j,k,rho);
double drho_y=CDy_cal(i,j,k,rho);
double drho_z=CDz_cal(i,j,k,rho);
double dC_x=CDx_cal(i,j,k,C);
double dC_y=CDy_cal(i,j,k,C);
double dC_z=CDz_cal(i,j,k,C);       //dC_x,dC_y.dC_z:scalar dependence

double dmuf_x=CDx_cal(i,j,k,muf);
double dmuf_y=CDy_cal(i,j,k,muf);
double dmuf_z=CDz_cal(i,j,k,muf);
double dmob_x=CDx_cal(i,j,k,mob);
double dmob_y=CDy_cal(i,j,k,mob);
double dmob_z=CDz_cal(i,j,k,mob);
double d2muf=D2_cal(i,j,k,muf);
double diffusion=dmob_x*dmuf_x+dmob_y*dmuf_y+dmob_z*dmuf_z+mob[i][j][k]*d2muf;

double uxyz=pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2);
for(int kf=0; kf<nf; kf++)
{
double eu=ex[kf]*ux[i][j][k]+ey[kf]*uy[i][j][k]+ez[kf]*uz[i][j][k];
gama[i][j][k][kf]=w[kf]*(1.+3.*eu+4.5*eu*eu-1.5*uxyz);

double drho_dir=CD_cal_dir(i,j,k,kf,rho);
double dC_dir=CD_cal_dir(i,j,k,kf,C);

double term_rho=drho_dir-(ux[i][j][k]*drho_x+uy[i][j][k]*drho_y+uz[i][j][k]*drho_z)*dt;
term_rho*=(gama[i][j][k][kf]-w[kf])/3.;
double term_C=dC_dir-(ux[i][j][k]*dC_x+uy[i][j][k]*dC_y+uz[i][j][k]*dC_z)*dt;
if(k==0)
{
int i1=ni[i][j][k][0], i_1=ni[i][j][k][1];
int j1=nj[i][j][k][0], j_1=nj[i][j][k][1];
dC_x=(C[i1][j][k]-C[i_1][j][k])/(2.*ds);                     //line 322
dC_y=(C[i][j1][k]-C[i][j_1][k])/(2.*ds);
dC_z=omg_wtg/kappa*C[i][j][k]*(1.-C[i][j][k]);   //issue two: dC_x,dC_y,dC_z scalar dependence
term_C=((ex[kf]-ux[i][j][k])*dC_x+(ey[kf]-uy[i][j][k])*dC_y+(ez[kf]-uz[i][j][k])*dC_z)*dt;
}
term_C*=gama[i][j][k][kf];
double term_Fa=-(rho_L-rho_G)*diffusion*((ex[kf]-ux[i][j][k])*ux[i][j][k]+(ey[kf]-uy[i][j][k])*uy[i][j][k]+(ez[kf]-uz[i][j][k])*uz[i][j][k])*dt;
term_Fa*=gama[i][j][k][kf];
gg[kf]=3.*(term_rho+muf[i][j][k]*term_C+term_Fa);
}
ff[0]=gg[0]+gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6]+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[1]=-2.*gg[0]-(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[2]=16.*gg[0]-4.*(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
ff[3]=gg[1]-gg[2]+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
ff[4]=-4.*(gg[1]-gg[2])+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
ff[5]=gg[3]-gg[4]+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
ff[6]=-4.*(gg[3]-gg[4])+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
ff[7]=gg[5]-gg[6]+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
ff[8]=-4.*(gg[5]-gg[6])+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
ff[9]=2.*(gg[1]+gg[2])-(gg[3]+gg[4]+gg[5]+gg[6]);
ff[10]=gg[3]+gg[4]-gg[5]-gg[6];
ff[11]=gg[7]+gg[8]-gg[9]-gg[10]+gg[11]+gg[12]-gg[13]-gg[14];
ff[12]=gg[7]+gg[8]+gg[9]+gg[10]-gg[11]-gg[12]-gg[13]-gg[14];
ff[13]=gg[7]+gg[8]-gg[9]-gg[10]-gg[11]-gg[12]+gg[13]+gg[14];
ff[14]=gg[7]-gg[8]-gg[9]+gg[10]+gg[11]-gg[12]-gg[13]+gg[14];

for(int kf=0; kf<nf; kf++)
{Rf[i][j][k][kf]=Rf[i][j][k][kf]*(1.0-Sf[kf])+Rfeq[kf]*Sf[kf]+(1.-0.5*Sf[kf])*ff[kf]*dt;}
}
``````

And the compiler’s feedback:

`````` Loop carried dependence due to exposed use of Sf[:] prevents parallelization
274, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
276, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
Complex loop carried dependence of Sf prevents parallelization
.......
312, Loop carried scalar dependence for dC_x at line 322
Loop carried scalar dependence for dC_z at line 322
Loop carried scalar dependence for dC_y at line 322
``````

in this subroutine, the sf[:] is a coefficient which should be assigned at the beginning of the loop and needed to calculate the value of Rf in the end. And i’m not sure if the dC_x: scalar dependence would lead to a wrong parallel result since i only want to parallelize the outer(i,j,k) loops .
Do you have any good idea to fix this out?
Thank you!

Hi Guo shuhao,

Do you have any good idea to fix this out?

The problem of course is that if “sf” is shared among all the threads, you’ll have a race conditions. However if you privatize the array using “private”, then you’d need to initialize all the elements of “sf” within each loop and can’t use the initial initialization.

Instead, try using the “firstprivate” clayse. Like “private”, each thread will get it’s own private array. But instead of being uninitialized, “firstprivate” initializes each private array with the host array’s value.

Also, you might try collapsing the loops, rather than putting the “loop” directive on each of the outer loops. Something like:

``````#pragma acc kernels loop \
present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
private(Rfeq,gg,ff) firstprivate(Sf) collapse(3)
for(i=0; i<im; i++)
for(j=0; j<jm; j++)
for(k=0; k<km; k++)
{
``````

Hope this helps,
Mat

Hi, Mat
Thanks for the replying! I did what you told me. But there comes another problem. As i consulted the OpenACC API 2.6, the firstprivate clause is allowed on the parallel and serial constructs. So i changed the directives as below:

``````#pragma acc parallel loop collapse(3) \
present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
private(Rfeq,gg,ff), firstprivate(Sf)
``````

And the feedback:

``````f_relaxation():
273, Generating present(tol[:][:][:],ux[:][:][:],uy[:][:][:],uz[:][:][:],rho[:][:][:],C[:][:][:],Rf[:][:][:][:],prs[:][:][:],muf[:][:][:],mob[:][:][:],gama[:][:][:][:],omg[:][:][:])
Accelerator kernel generated
Generating Tesla code
273, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
275,   /* blockIdx.x threadIdx.x collapsed */
277,   /* blockIdx.x threadIdx.x collapsed */
295, #pragma acc loop seq
314, #pragma acc loop seq
355, #pragma acc loop seq
273, Local memory used for Rfeq
Generating update device(Sf[:])
Local memory used for gg,ff
CUDA shared memory used for Sf
314, Loop carried scalar dependence for dC_x at line 324
Loop carried scalar dependence for dC_z at line 324
Loop carried scalar dependence for dC_y at line 324
``````

Then the compiler reported an error:

``````./xxx.exe
30	0.026	210.663	0.00213612	0.0005
call to cuMemcpyHtoDAsync returned error 1: Invalid value
``````

It seems the Memory copy went wrong, i can’t figure out why?
Thanks for helping me.
Guo shuhao

It seems the Memory copy went wrong, i can’t figure out why?

Possible.

What I’d recommend is setting the environment variable “PGI_ACC_NOTIFY=3”. This will have the runtime print out each kernel launch and where the data is getting copied. It should help determine which line in your source is causing the problem.

Alternatively, you can set “PGI_ACC_DEBUG=1” which will give very verbose information about not only line numbers but the arguments the runtime is using. Though, there’s a lot of information here so may be a bit more than what you need.

Once you determine where the error is occurring, it should help you determine why. If you still have issues, please post or send to PGI Customer Service (trs@pgroup.com) a full reproducing example and I can help you investigate.

-Mat