Question on how to accelerate a complex loop

Hi, everyone,
I'm trying to accelerate my C++ program with OpenACC, but I'm having trouble accelerating a complex loop. Here is my code:

void f_relaxation()
{
	double  Sf[nf]={0.}, Rfeq[nf]={0.}, gg[nf]={0.}, ff[nf]={0.};
	int i,j,k;

        Sf[0]=1.0, Sf[3]=1.0, Sf[5]=1.0, Sf[7]=1.0;    // issue one: Sf prevents parallelization
        Sf[1]=1.6, Sf[2]=1.2;
        Sf[14]=1.2;

    #pragma acc kernels present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
                        copyin(Sf)
    #pragma acc loop private(Rfeq,gg,ff)//,independent
	for(i=0; i<im; i++)                                                 //line 272
    #pragma acc loop private(Rfeq,gg,ff)//,independent
	for(j=0; j<jm; j++)
	#pragma acc loop private(Rfeq,gg,ff)//,independent
	for(k=0; k<km; k++)
	{
		Sf[4]=3.*(2.-tol[i][j][k])/(3.-tol[i][j][k]), Sf[6]=Sf[4], Sf[8]=Sf[4];
		Sf[9]=omg[i][j][k], Sf[10]=Sf[9], Sf[11]=Sf[9], Sf[12]=Sf[9], Sf[13]=Sf[9];
		//---------------relaxation--------------//
		Rfeq[0]=prs[i][j][k];
		Rfeq[1]=-prs[i][j][k]+rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))+3.*C[i][j][k]*muf[i][j][k];
		Rfeq[2]=prs[i][j][k]-5.*rho[i][j][k]*(pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2))-45.*C[i][j][k]*muf[i][j][k];
		Rfeq[3]=rho[i][j][k]*ux[i][j][k], Rfeq[4]=-7.*Rfeq[3]/3.;
		Rfeq[5]=rho[i][j][k]*uy[i][j][k], Rfeq[6]=-7.*Rfeq[5]/3.;
		Rfeq[7]=rho[i][j][k]*uz[i][j][k], Rfeq[8]=-7.*Rfeq[7]/3.;
		Rfeq[9]=rho[i][j][k]*(2.*pow(ux[i][j][k],2)-pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
		Rfeq[10]=rho[i][j][k]*(pow(uy[i][j][k],2)-pow(uz[i][j][k],2));
		Rfeq[11]=rho[i][j][k]*ux[i][j][k]*uy[i][j][k];
		Rfeq[12]=rho[i][j][k]*uy[i][j][k]*uz[i][j][k];
		Rfeq[13]=rho[i][j][k]*ux[i][j][k]*uz[i][j][k];
		Rfeq[14]=0.0;

		for(int kf=0; kf<nf; kf++) {ff[kf]=0.0;}
		double drho_x=CDx_cal(i,j,k,rho);
		double drho_y=CDy_cal(i,j,k,rho);
		double drho_z=CDz_cal(i,j,k,rho);
		double dC_x=CDx_cal(i,j,k,C);
		double dC_y=CDy_cal(i,j,k,C);
		double dC_z=CDz_cal(i,j,k,C);       // dC_x, dC_y, dC_z: scalar dependence

		double dmuf_x=CDx_cal(i,j,k,muf);
		double dmuf_y=CDy_cal(i,j,k,muf);
		double dmuf_z=CDz_cal(i,j,k,muf);
		double dmob_x=CDx_cal(i,j,k,mob);
		double dmob_y=CDy_cal(i,j,k,mob);
		double dmob_z=CDz_cal(i,j,k,mob);
		double d2muf=D2_cal(i,j,k,muf);
		double diffusion=dmob_x*dmuf_x+dmob_y*dmuf_y+dmob_z*dmuf_z+mob[i][j][k]*d2muf;

		double uxyz=pow(ux[i][j][k],2)+pow(uy[i][j][k],2)+pow(uz[i][j][k],2);
		for(int kf=0; kf<nf; kf++)
		{
			double eu=ex[kf]*ux[i][j][k]+ey[kf]*uy[i][j][k]+ez[kf]*uz[i][j][k];
			gama[i][j][k][kf]=w[kf]*(1.+3.*eu+4.5*eu*eu-1.5*uxyz);

			double drho_dir=CD_cal_dir(i,j,k,kf,rho);
			double dC_dir=CD_cal_dir(i,j,k,kf,C);

			double term_rho=drho_dir-(ux[i][j][k]*drho_x+uy[i][j][k]*drho_y+uz[i][j][k]*drho_z)*dt;
			term_rho*=(gama[i][j][k][kf]-w[kf])/3.;
			double term_C=dC_dir-(ux[i][j][k]*dC_x+uy[i][j][k]*dC_y+uz[i][j][k]*dC_z)*dt;
			if(k==0)
			{
				int i1=ni[i][j][k][0], i_1=ni[i][j][k][1];
				int j1=nj[i][j][k][0], j_1=nj[i][j][k][1];
				dC_x=(C[i1][j][k]-C[i_1][j][k])/(2.*ds);                     //line 322
				dC_y=(C[i][j1][k]-C[i][j_1][k])/(2.*ds);
				dC_z=omg_wtg/kappa*C[i][j][k]*(1.-C[i][j][k]);   // issue two: dC_x, dC_y, dC_z scalar dependence
				term_C=((ex[kf]-ux[i][j][k])*dC_x+(ey[kf]-uy[i][j][k])*dC_y+(ez[kf]-uz[i][j][k])*dC_z)*dt;
			}
			term_C*=gama[i][j][k][kf];
			double term_Fa=-(rho_L-rho_G)*diffusion*((ex[kf]-ux[i][j][k])*ux[i][j][k]+(ey[kf]-uy[i][j][k])*uy[i][j][k]+(ez[kf]-uz[i][j][k])*uz[i][j][k])*dt;		
			term_Fa*=gama[i][j][k][kf];
			gg[kf]=3.*(term_rho+muf[i][j][k]*term_C+term_Fa);
		}
		ff[0]=gg[0]+gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6]+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
		ff[1]=-2.*gg[0]-(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
		ff[2]=16.*gg[0]-4.*(gg[1]+gg[2]+gg[3]+gg[4]+gg[5]+gg[6])+gg[7]+gg[8]+gg[9]+gg[10]+gg[11]+gg[12]+gg[13]+gg[14];
		ff[3]=gg[1]-gg[2]+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
		ff[4]=-4.*(gg[1]-gg[2])+gg[7]-gg[8]-gg[9]+gg[10]-gg[11]+gg[12]+gg[13]-gg[14];
		ff[5]=gg[3]-gg[4]+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
		ff[6]=-4.*(gg[3]-gg[4])+gg[7]-gg[8]+gg[9]-gg[10]-gg[11]+gg[12]-gg[13]+gg[14];
		ff[7]=gg[5]-gg[6]+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
		ff[8]=-4.*(gg[5]-gg[6])+gg[7]-gg[8]+gg[9]-gg[10]+gg[11]-gg[12]+gg[13]-gg[14];
		ff[9]=2.*(gg[1]+gg[2])-(gg[3]+gg[4]+gg[5]+gg[6]);
		ff[10]=gg[3]+gg[4]-gg[5]-gg[6];
		ff[11]=gg[7]+gg[8]-gg[9]-gg[10]+gg[11]+gg[12]-gg[13]-gg[14];
		ff[12]=gg[7]+gg[8]+gg[9]+gg[10]-gg[11]-gg[12]-gg[13]-gg[14];
		ff[13]=gg[7]+gg[8]-gg[9]-gg[10]-gg[11]-gg[12]+gg[13]+gg[14];
		ff[14]=gg[7]-gg[8]-gg[9]+gg[10]+gg[11]-gg[12]-gg[13]+gg[14];

		for(int kf=0; kf<nf; kf++)
		{Rf[i][j][k][kf]=Rf[i][j][k][kf]*(1.0-Sf[kf])+Rfeq[kf]*Sf[kf]+(1.-0.5*Sf[kf])*ff[kf]*dt;}
	}
}

And the compiler’s feedback:

272, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
    274, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
    276, Loop carried dependence due to exposed use of Sf[:] prevents parallelization
         Complex loop carried dependence of Sf prevents parallelization
.......
312, Loop carried scalar dependence for dC_x at line 322
         Loop carried scalar dependence for dC_z at line 322
         Loop carried scalar dependence for dC_y at line 322

In this subroutine, Sf[:] is a coefficient array: it has to be assigned at the beginning of the loop and is then needed at the end to compute Rf. I'm also not sure whether the loop-carried scalar dependence on dC_x would lead to a wrong parallel result, since I only want to parallelize the outer (i,j,k) loops.
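
Just to make the pattern clearer, I think what the compiler is flagging boils down to something like this reduced, stand-alone sketch (illustrative names only, not my real code): a scalar that is set before the inner loop, but conditionally overwritten inside it and read again by later iterations.

void scalar_dep_sketch(int nf, int k, const double *C_dir, double *gg)
{
	double dC = 0.5;                    // set once before the inner loop
	for (int kf = 0; kf < nf; kf++)
	{
		double term = C_dir[kf] - dC;   // reads dC
		if (k == 0)
		{
			dC = 0.25 * C_dir[kf];      // overwrites dC inside the kf loop,
			term = C_dir[kf] - dC;      // so later iterations see the new value
		}
		gg[kf] = term;
	}
}
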
Do you have any good ideas on how to fix this?
Thank you!

Hi Guo shuhao,

Do you have any good ideas on how to fix this?

The problem, of course, is that if “Sf” is shared among all the threads, you’ll have a race condition. However, if you privatize the array using “private”, then you’d need to initialize all the elements of “Sf” within each loop iteration and couldn’t rely on the initialization done before the loop.
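
For example, keeping your current loop structure, that option would look roughly like the sketch below: add “Sf” to your private clauses, drop the copyin(Sf) (each thread now fills in its own copy), and re-assign the constant entries at the top of the body (assuming every element of Sf that’s read at the end gets assigned somewhere in the body, which looks to be the case here since Sf[4], Sf[6], and Sf[8]-Sf[13] are already set inside the body):

#pragma acc kernels present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs)
#pragma acc loop private(Sf,Rfeq,gg,ff)
	for(i=0; i<im; i++)
#pragma acc loop private(Sf,Rfeq,gg,ff)
	for(j=0; j<jm; j++)
#pragma acc loop private(Sf,Rfeq,gg,ff)
	for(k=0; k<km; k++)
	{
		// each private copy of Sf starts out uninitialized, so the
		// constant entries have to be re-assigned here
		Sf[0]=1.0; Sf[3]=1.0; Sf[5]=1.0; Sf[7]=1.0;
		Sf[1]=1.6; Sf[2]=1.2;
		Sf[14]=1.2;
		// ... rest of the loop body unchanged ...
	}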

Instead, try using the “firstprivate” clause. Like “private”, each thread will get its own private copy of the array, but instead of being uninitialized, “firstprivate” initializes each private copy with the host array’s values.

Also, you might try collapsing the loops, rather than putting the “loop” directive on each of the outer loops. Something like:

#pragma acc kernels loop \
   present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
   private(Rfeq,gg,ff) firstprivate(Sf) collapse(3)
   for(i=0; i<im; i++)    
   for(j=0; j<jm; j++) 
   for(k=0; k<km; k++) 
   {

Hope this helps,
Mat

Hi, Mat
Thanks for the reply! I did what you suggested, but another problem came up. According to the OpenACC API 2.6 specification, the firstprivate clause is only allowed on the parallel and serial constructs (not on kernels), so I changed the directives as below:

#pragma acc parallel loop collapse(3) \
     present(ux,uy,uz,tol,C,Rf,gama,rho,omg,muf,mob,prs) \
     private(Rfeq,gg,ff), firstprivate(Sf)

And the feedback:

f_relaxation():
    273, Generating present(tol[:][:][:],ux[:][:][:],uy[:][:][:],uz[:][:][:],rho[:][:][:],C[:][:][:],Rf[:][:][:][:],prs[:][:][:],muf[:][:][:],mob[:][:][:],gama[:][:][:][:],omg[:][:][:])
         Accelerator kernel generated
         Generating Tesla code
        273, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        275,   /* blockIdx.x threadIdx.x collapsed */
        277,   /* blockIdx.x threadIdx.x collapsed */
        295, #pragma acc loop seq
        314, #pragma acc loop seq
        355, #pragma acc loop seq
    273, Local memory used for Rfeq
         Generating update device(Sf[:])
         Local memory used for gg,ff
         CUDA shared memory used for Sf
    314, Loop carried scalar dependence for dC_x at line 324
         Loop carried scalar dependence for dC_z at line 324
         Loop carried scalar dependence for dC_y at line 324

Then the program reported a runtime error:

./xxx.exe
30	0.026	210.663	0.00213612	0.0005
call to cuMemcpyHtoDAsync returned error 1: Invalid value

It seems the memory copy went wrong, but I can’t figure out why.
Thanks for helping me.
Guo shuhao

It seems the memory copy went wrong, but I can’t figure out why.

That’s possible.

What I’d recommend is setting the environment variable “PGI_ACC_NOTIFY=3”. This will have the runtime print out each kernel launch and where the data is getting copied. It should help determine which line in your source is causing the problem.

Alternatively, you can set “PGI_ACC_DEBUG=1”, which gives very verbose information about not only the line numbers but also the arguments the runtime is using. Though there’s a lot of information here, so it may be a bit more than what you need.

Once you know where the error occurs, it should be easier to figure out why. If you still have issues, please post a full reproducing example, or send it to PGI Customer Service (trs@pgroup.com), and I can help you investigate.

-Mat