I am confused about how the private clause works in the following subroutine. I have arrays named ‘F_L’, ‘F_R’ and ‘variable’ in my subroutine, which I have declared as private. The issue is that the private clause works only when I put it on the vector loop (the innermost loop), but not when I put it at the gang level. In neither case do I see any errors in the -Minfo messages. I do not understand why the private clause is not working when it is put at the gang level.
I am guessing it has something to do with the IF conditions in the subroutine. Also, is there any advice on how loop scheduling should be done when these kinds of multiple IF and ELSEIF conditions are present in the loops?
Subroutine (with the private clause at vector level):
SUBROUTINE HLLC_INTERCELL_FFLUX_ESTIMATE_I()
! Task: given the primitive variables on the cell faces indexed along i
! (Qp_iphL, Qp_iphR), compute the HLLC flux on each face (Fflux_iph) and
! store the negated face-to-face flux difference in Residual_RHS.
!
! The grid sizes, metric arrays, state arrays and loop indices all come from
! module declare_variables. Both compute regions use default(present), so the
! module arrays they touch must already be on the device when this routine
! is called. Fflux_iph is local scratch that exists on the device only for
! the duration of the data region below.
use declare_variables
implicit none
!Variables for HLLC
double precision, dimension(0:NImax, NJmax, NKmax, nblocks, nconserv) :: Fflux_iph
double precision :: Rho_L, Rho_R, Ux_L, Ux_R, Uy_L, Uy_R, Uz_L, Uz_R, P_L, P_R, C_L, C_R, E_L, E_R, h_L, h_R
double precision :: U_contra_L, U_contra_R, S_L, S_R, S_star
! Per-face scratch vectors: left/right physical fluxes and the star-state
! multipliers. Every i iteration overwrites them, so each vector lane needs
! its own copy (see the private clause on the vector loop).
double precision, dimension(nconserv) :: F_L, F_R, variable
double precision :: sqrt_rho_L, sqrt_rho_R, divisor, u_average, v_average
double precision :: w_average, h_average, uvw_average, ucontra_ave, C_average
! Face metrics, Jacobian, squared metric norm and the common star-state
! factor, loaded/computed once per face instead of being re-spelled in
! every formula.
double precision :: face_Ix, face_Iy, face_Iz, face_Jac, face_metric2, star_factor

!$acc data create(Fflux_iph)

! ---- Pass 1: HLLC flux on every face i = 0..NImax ----
!$acc parallel loop gang collapse(3) default(present)
DO nbl = 1,nblocks
  DO k = 1, NKmax
    DO j = 1, NJmax
      ! The private clause must sit on the vector loop: a private clause on a
      ! loop gives one copy per thread of *that* loop's parallelism level.
      ! On the gang loop there would be a single F_L/F_R/variable per gang,
      ! shared by all vector lanes that work on different i at the same
      ! time, i.e. a data race. Scalar temporaries are not listed: the code
      ! relies on the compiler privatizing scalars assigned inside the loop.
      !$acc loop vector private(F_L,F_R,variable)
      DO i = 0, NImax
        ! Loops run over the padded maxima; skip points outside this block.
        if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
          ! Face metrics and Jacobian
          face_Ix  = Ix_iph(i,j,k,nbl)
          face_Iy  = Iy_iph(i,j,k,nbl)
          face_Iz  = Iz_iph(i,j,k,nbl)
          face_Jac = Jac_iph(i,j,k,nbl)
          face_metric2 = face_Ix**2+face_Iy**2+face_Iz**2

          !Local variables: left/right primitive states on the face
          Rho_L = Qp_iphL(i,j,k,nbl,1)
          Rho_R = Qp_iphR(i,j,k,nbl,1)
          Ux_L  = Qp_iphL(i,j,k,nbl,2)
          Ux_R  = Qp_iphR(i,j,k,nbl,2)
          Uy_L  = Qp_iphL(i,j,k,nbl,3)
          Uy_R  = Qp_iphR(i,j,k,nbl,3)
          Uz_L  = Qp_iphL(i,j,k,nbl,4)
          Uz_R  = Qp_iphR(i,j,k,nbl,4)
          P_L   = Qp_iphL(i,j,k,nbl,5)
          P_R   = Qp_iphR(i,j,k,nbl,5)

          ! Metric-scaled sound speeds, total energies, total enthalpies
          C_L = Sqrt(Gamma*P_L/Rho_L*face_metric2)
          C_R = Sqrt(Gamma*P_R/Rho_R*face_metric2)
          E_L = P_L/(Gamma-1.d0) + Rho_L*(Ux_L**2+Uy_L**2+Uz_L**2)/2.d0
          E_R = P_R/(Gamma-1.d0) + Rho_R*(Ux_R**2+Uy_R**2+Uz_R**2)/2.d0
          h_L = (E_L + P_L)/Rho_L
          h_R = (E_R + P_R)/Rho_R

          ! Contravariant velocities
          U_contra_L = Ux_L*face_Ix + Uy_L*face_Iy + Uz_L*face_Iz
          U_contra_R = Ux_R*face_Ix + Uy_R*face_Iy + Uz_R*face_Iz

          ! Roe averaging
          sqrt_rho_L = sqrt(Rho_L)
          sqrt_rho_R = sqrt(Rho_R)
          divisor    = 1.d0/(sqrt_rho_R+sqrt_rho_L)
          h_average  = ((h_L*sqrt_rho_L) + (h_R*sqrt_rho_R))*divisor
          u_average  = ((Ux_L*sqrt_rho_L) + (Ux_R*sqrt_rho_R))*divisor
          v_average  = ((Uy_L*sqrt_rho_L) + (Uy_R*sqrt_rho_R))*divisor
          w_average  = ((Uz_L*sqrt_rho_L) + (Uz_R*sqrt_rho_R))*divisor
          ucontra_ave = u_average*face_Ix + v_average*face_Iy + w_average*face_Iz
          uvw_average = 0.5d0 * (u_average**2.d0 + v_average**2.d0 + w_average**2.d0)
          C_average = sqrt((gamma-1.d0)*(h_average - uvw_average))*SQRT(face_metric2)

          ! Wave speeds
          S_L = MIN(U_contra_L - C_L, ucontra_ave - C_average)
          S_R = MAX(U_contra_R + C_R, ucontra_ave + C_average)
          !S_L = MIN(U_contra_L - C_L, U_contra_R - C_R)
          !S_R = MAX(U_contra_L + C_L, U_contra_R + C_R)
          S_star = (Rho_R*U_contra_R*(S_R-U_contra_R) - Rho_L*U_contra_L*(S_L-U_contra_L) &
                    + (P_L-P_R)*face_metric2) &
                   /(Rho_R*(S_R-U_contra_R) - Rho_L*(S_L-U_contra_L))

          !Step3: Estimate fluxes
          ! Left/right physical fluxes. Component 4 is scaled by (1-f2D);
          ! presumably f2D = 1 switches off the z-momentum flux for 2-D
          ! runs -- TODO confirm against declare_variables.
          F_L(1) = (rho_L*U_contra_L) /face_Jac
          F_L(2) = (rho_L*U_contra_L*Ux_L + P_L*face_Ix) /face_Jac
          F_L(3) = (rho_L*U_contra_L*Uy_L + P_L*face_Iy) /face_Jac
          F_L(4) = ((rho_L*U_contra_L*Uz_L + P_L*face_Iz) /face_Jac)*(1-f2D)
          F_L(5) = (E_L + P_L) * U_contra_L /face_Jac
          F_R(1) = (rho_R*U_contra_R) /face_Jac
          F_R(2) = (rho_R*U_contra_R*Ux_R + P_R*face_Ix) /face_Jac
          F_R(3) = (rho_R*U_contra_R*Uy_R + P_R*face_Iy) /face_Jac
          F_R(4) = ((rho_R*U_contra_R*Uz_R + P_R*face_Iz) /face_Jac)*(1-f2D)
          F_R(5) = (E_R + P_R) * U_contra_R /face_Jac

          ! Select the flux from the signs of S_L, S_star and S_R.
          ! NOTE(review): if S_star is NaN (zero denominator above) it is
          ! possible that no branch is taken; Fflux_iph then keeps whatever
          ! the uninitialized device buffer held.
          IF (S_L>=0.d0) THEN
            ! Everything moves to the right: pure left flux.
            Fflux_iph(i,j,k,nbl,:) = F_L(:)
          ELSEIF (S_L<=0.d0 .and. S_star>=0.d0) THEN
            ! Left star region
            variable(1) = 1.d0
            variable(2) = (S_star*face_Ix + Ux_L*(face_Iy**2+face_Iz**2) &
                           - Uy_L*face_Ix*face_Iy - Uz_L*face_Ix*face_Iz) &
                          /face_metric2
            variable(3) = (S_star*face_Iy - Ux_L*(face_Ix*face_Iy) &
                           + Uy_L*(face_Ix**2+face_Iz**2) - Uz_L*(face_Iy*face_Iz)) &
                          /face_metric2
            variable(4) = (S_star*face_Iz - Ux_L*(face_Ix*face_Iz) &
                           - Uy_L*(face_Iy*face_Iz) + Uz_L*(face_Ix**2+face_Iy**2)) &
                          /face_metric2
            variable(5) = E_L/Rho_L + (S_star - U_contra_L)*(S_star/face_metric2 &
                          + P_L/(Rho_L*(S_L - U_contra_L)))
            star_factor = Rho_L*(S_L - U_contra_L)/(S_L - S_star)
            Fflux_iph(i,j,k,nbl,1) = F_L(1) + S_L*(star_factor*variable(1) - Rho_L)/face_Jac
            Fflux_iph(i,j,k,nbl,2) = F_L(2) + S_L*(star_factor*variable(2) - Rho_L*Ux_L)/face_Jac
            Fflux_iph(i,j,k,nbl,3) = F_L(3) + S_L*(star_factor*variable(3) - Rho_L*Uy_L)/face_Jac
            Fflux_iph(i,j,k,nbl,4) = (F_L(4) + S_L*(star_factor*variable(4) - Rho_L*Uz_L)/face_Jac)*(1-f2D)
            Fflux_iph(i,j,k,nbl,5) = F_L(5) + S_L*(star_factor*variable(5) - E_L)/face_Jac
          ELSEIF (S_star<=0.d0 .and. S_R>=0.d0) THEN
            ! Right star region
            variable(1) = 1.d0
            variable(2) = (S_star*face_Ix + Ux_R*(face_Iy**2+face_Iz**2) &
                           - Uy_R*face_Ix*face_Iy - Uz_R*face_Ix*face_Iz) &
                          /face_metric2
            variable(3) = (S_star*face_Iy - Ux_R*(face_Ix*face_Iy) &
                           + Uy_R*(face_Ix**2+face_Iz**2) - Uz_R*(face_Iy*face_Iz)) &
                          /face_metric2
            variable(4) = (S_star*face_Iz - Ux_R*(face_Ix*face_Iz) &
                           - Uy_R*(face_Iy*face_Iz) + Uz_R*(face_Ix**2+face_Iy**2)) &
                          /face_metric2
            variable(5) = E_R/Rho_R + (S_star - U_contra_R)*(S_star/face_metric2 &
                          + P_R/(Rho_R*(S_R - U_contra_R)))
            star_factor = Rho_R*(S_R - U_contra_R)/(S_R - S_star)
            Fflux_iph(i,j,k,nbl,1) = F_R(1) + S_R*(star_factor*variable(1) - Rho_R)/face_Jac
            Fflux_iph(i,j,k,nbl,2) = F_R(2) + S_R*(star_factor*variable(2) - Rho_R*Ux_R)/face_Jac
            Fflux_iph(i,j,k,nbl,3) = F_R(3) + S_R*(star_factor*variable(3) - Rho_R*Uy_R)/face_Jac
            Fflux_iph(i,j,k,nbl,4) = (F_R(4) + S_R*(star_factor*variable(4) - Rho_R*Uz_R)/face_Jac)*(1-f2D)
            Fflux_iph(i,j,k,nbl,5) = F_R(5) + S_R*(star_factor*variable(5) - E_R)/face_Jac
          ELSEIF (S_R<= 0.d0) THEN
            ! Everything moves to the left: pure right flux.
            Fflux_iph(i,j,k,nbl,:) = F_R(:)
          ENDIF
        endif
      ENDDO
    ENDDO
  ENDDO
ENDDO

! ---- Pass 2: residual = -(F(i) - F(i-1)) for every conserved variable ----
! Reads faces i-1 >= 0 and i <= NI(nbl), i.e. only faces written by pass 1
! under the same NI/NJ/NK mask.
!$acc parallel loop gang collapse(4) default(present)
DO n_cons = 1,nconserv
  DO nbl = 1,nblocks
    DO k = 1, NKmax
      DO j = 1, NJmax
        !$acc loop vector
        DO i = 1, NImax
          if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
            ! Accumulating variant, kept for reference:
            ! Residual_RHS(i,j,k,nbl,n_cons) = Residual_RHS(i,j,k,nbl,n_cons) - &
            ! (Fflux_iph(i,j,k,nbl,n_cons) - Fflux_iph(i-1,j,k,nbl,n_cons))
            Residual_RHS(i,j,k,nbl,n_cons) = -(Fflux_iph(i,j,k,nbl,n_cons) - Fflux_iph(i-1,j,k,nbl,n_cons))
          endif
        ENDDO
      ENDDO
    ENDDO
  ENDDO
ENDDO
!$acc end data
END SUBROUTINE HLLC_INTERCELL_FFLUX_ESTIMATE_I
-Minfo message with private clause given at vector level:
!$acc data create(Fflux_iph)
!$acc parallel loop gang collapse(3) default(present)
DO nbl = 1,nblocks
DO k = 1, NKmax
DO j = 1, NJmax
!$acc loop vector private(F_L,F_R,variable)
DO i = 0, NImax
hllc_intercell_fflux_estimate_i:
1237, Generating create(fflux_iph(:,:,:,:,:)) [if not already present]
1238, Generating Tesla code
1239, !$acc loop gang collapse(3) ! blockidx%x
1240, ! blockidx%x collapsed
1241, ! blockidx%x collapsed
1243, !$acc loop vector(128) ! threadidx%x
1307, !$acc loop seq
1362, !$acc loop seq
1238, Generating default present(iz_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),ix_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),iy_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),qp_iphr(0:nimax,1:njmax,1:nkmax,1:nblocks,1:5),qp_iphl(0:nimax,1:njmax,1:nkmax,1:nblocks,1:5),nk(1:nblocks),jac_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks))
1243, Loop is parallelizable
1307, Loop is parallelizable
1362, Loop is parallelizable
1374, Generating Tesla code
1375, !$acc loop gang collapse(4) ! blockidx%x
1376, ! blockidx%x collapsed
1377, ! blockidx%x collapsed
1378, ! blockidx%x collapsed
1380, !$acc loop vector(128) ! threadidx%x
1374, Generating default present(nj(1:nblocks),ni(1:nblocks),residual_rhs(1:nimax,1:njmax,1:nkmax,1:nblocks,1:nconserv),nk(1:nblocks))
1380, Loop is parallelizable
-Minfo message with private clause given at gang level:
!$acc data create(Fflux_iph)
!$acc parallel loop gang collapse(3) private(F_L,F_R,variable) default(present)
DO nbl = 1,nblocks
DO k = 1, NKmax
DO j = 1, NJmax
!$acc loop vector
DO i = 0, NImax
-Minfo message
hllc_intercell_fflux_estimate_i:
1237, Generating create(fflux_iph(:,:,:,:,:)) [if not already present]
1238, Generating Tesla code
1239, !$acc loop gang collapse(3) ! blockidx%x
1240, ! blockidx%x collapsed
1241, ! blockidx%x collapsed
1243, !$acc loop vector(128) ! threadidx%x
1307, !$acc loop seq
1362, !$acc loop seq
1238, Generating default present(iz_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),ix_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),iy_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),qp_iphr(0:nimax,1:njmax,1:nkmax,1:nblocks,1:5),qp_iphl(0:nimax,1:njmax,1:nkmax,1:nblocks,1:5),nk(1:nblocks),jac_iph(0:nimax,1:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks))
1243, Loop is parallelizable
1307, Loop is parallelizable
1362, Loop is parallelizable
1374, Generating Tesla code
1375, !$acc loop gang collapse(4) ! blockidx%x
1376, ! blockidx%x collapsed
1377, ! blockidx%x collapsed
1378, ! blockidx%x collapsed
1380, !$acc loop vector(128) ! threadidx%x
1374, Generating default present(nj(1:nblocks),ni(1:nblocks),residual_rhs(1:nimax,1:njmax,1:nkmax,1:nblocks,1:nconserv),nk(1:nblocks))
1380, Loop is parallelizable