I am trying to parallelize a complex loop structure as shown below. It has some calculation activities, internal DO loops and private arrays in it. I am using a few directives to port it to GPU as shown in code below. The problem is that I am seeing results with NaN values from it, and that is because of porting this subroutine to GPU. I tried to change the loop schedule but no luck, please kindly point out the mistake in this…
Subroutine:
SUBROUTINE MP5_FACE_INTERPOLATION_WITH_PRIMS_CHARS()
! Task: Given 'Primitive vars- Qp' at cell centers, interpolate those values on to faces
! Before interpolating Qp, Qp is transformed to Wp [d(Wp) = L*d(Qp)] (characteristic variables)
use declare_variables
implicit none
integer :: II
double precision, dimension(nprims) :: W_iph, W_imh
double precision, dimension(-2:3,nprims) :: Wp
double precision, dimension(nprims,nprims) :: L_Eig, R_Eig
double precision :: p, c, sqrt_rho, divisor, rho
double precision, dimension(3) :: PHI_hat, w, Gma, B
double precision, dimension(-2:2) :: PHI_L
double precision, parameter :: eps = 1.d-15
Gma(1) = 1.d0/10.d0
Gma(2) = 3.d0/5.d0
Gma(3) = 3.d0/10.d0
!$acc parallel loop gang collapse(3) private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,Gma,PHI_hat)
DO nbl = 1,nblocks
DO k = 1,NKmax
DO j = 1,NJmax
!$acc loop vector
DO i = 0,NImax
if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
! Roe-averaged 'Rho' and 'c' on i+1/2 face
sqrt_rho = sqrt(Qp(i+1,j,k,nbl,1)/Qp(i,j,k,nbl,1))
rho = sqrt_rho*Qp(i,j,k,nbl,1)
divisor = 1.d0/(sqrt_rho+1.d0)
p = (Qp(i,j,k,nbl,5) + (Qp(i+1,j,k,nbl,5)*sqrt_rho))*divisor
c = sqrt(gamma*p/rho)
! Left and right Eigen vectors on i+1/2 face
!--------------------------------------------
L_Eig(1,1) = 0.d0 ; R_Eig(1,1) = 0.5d0/c**2.d0
L_Eig(2,1) = Ix_c(i,j,k,nbl)*c**2.d0 ; R_Eig(2,1) = -0.5d0*Ix_c(i,j,k,nbl)/rho/c
L_Eig(3,1) = Iz_c(i,j,k,nbl)*c**2.d0 ; R_Eig(3,1) = -0.5d0*Iy_c(i,j,k,nbl)/rho/c
L_Eig(4,1) = -Iy_c(i,j,k,nbl)*c**2.d0 ; R_Eig(4,1) = -0.5d0*Iz_c(i,j,k,nbl)/rho/c
L_Eig(5,1) = 0.d0 ; R_Eig(5,1) = 0.5d0
;
L_Eig(1,2) = -Ix_c(i,j,k,nbl)*rho*c ; R_Eig(1,2) = Ix_c(i,j,k,nbl)/c**2.d0
L_Eig(2,2) = 0.d0 ; R_Eig(2,2) = 0.d0
L_Eig(3,2) = -Iy_c(i,j,k,nbl) ; R_Eig(3,2) = -Iz_c(i,j,k,nbl)
L_Eig(4,2) = -Iz_c(i,j,k,nbl) ; R_Eig(4,2) = Iy_c(i,j,k,nbl)
L_Eig(5,2) = Ix_c(i,j,k,nbl)*rho*c ; R_Eig(5,2) = 0.d0
;
L_Eig(1,3) = -Iy_c(i,j,k,nbl)*rho*c ; R_Eig(1,3) = Iz_c(i,j,k,nbl)/c**2.d0
L_Eig(2,3) = -Iz_c(i,j,k,nbl) ; R_Eig(2,3) = -Iy_c(i,j,k,nbl)
L_Eig(3,3) = Ix_c(i,j,k,nbl) ; R_Eig(3,3) = Ix_c(i,j,k,nbl)
L_Eig(4,3) = 0.d0 ; R_Eig(4,3) = 0.d0
L_Eig(5,3) = Iy_c(i,j,k,nbl)*rho*c ; R_Eig(5,3) = 0.d0
;
L_Eig(1,4) = -Iz_c(i,j,k,nbl)*rho*c ; R_Eig(1,4) = -Iy_c(i,j,k,nbl)/c**2.d0
L_Eig(2,4) = Iy_c(i,j,k,nbl) ; R_Eig(2,4) = -Iz_c(i,j,k,nbl)
L_Eig(3,4) = 0.d0 ; R_Eig(3,4) = 0.d0
L_Eig(4,4) = Ix_c(i,j,k,nbl) ; R_Eig(4,4) = Ix_c(i,j,k,nbl)
L_Eig(5,4) = Iz_c(i,j,k,nbl)*rho*c ; R_Eig(5,4) = 0.d0
;
L_Eig(1,5) = 1.d0 ; R_Eig(1,5) = 0.5d0/c**2.d0
L_Eig(2,5) = -Ix_c(i,j,k,nbl) ; R_Eig(2,5) = 0.5d0*Ix_c(i,j,k,nbl)/rho/c
L_Eig(3,5) = -Iz_c(i,j,k,nbl) ; R_Eig(3,5) = 0.5d0*Iy_c(i,j,k,nbl)/rho/c
L_Eig(4,5) = Iy_c(i,j,k,nbl) ; R_Eig(4,5) = 0.5d0*Iz_c(i,j,k,nbl)/rho/c
L_Eig(5,5) = 1.d0 ; R_Eig(5,5) = 0.5d0
!--------------------------------------------------------------------------------------------
! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
DO II=-2,3
! Charactristic primitives
!Wp(II,:) = matmul(L_Eig,Qp(i+II,j,k,nbl,:))
DO n_prim = 1,nprims
Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i+II,j,k,nbl,1) +&
L_Eig(n_prim,2)*Qp(i+II,j,k,nbl,2) +&
L_Eig(n_prim,3)*Qp(i+II,j,k,nbl,3) +&
L_Eig(n_prim,4)*Qp(i+II,j,k,nbl,4) +&
L_Eig(n_prim,5)*Qp(i+II,j,k,nbl,5)
ENDDO
ENDDO
!========================================================
DO n_prim = 1,nprims
! Cell Right face
PHI_L = Wp(-2:2,n_prim)
! Polynomials
PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
PHI_hat(2) = ( -PHI_L(-1) +5.d0*PHI_L(0) + 2.d0*PHI_L(1))/6.d0
PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1) - PHI_L(2))/6.d0
! Smoothness indicators
B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*( PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*( PHI_L(-1) - PHI_L(1))**2
B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(1) +PHI_L(2))**2
! Non-linear weights
w(1) = Gma(1)/(eps+B(1))**2
w(2) = Gma(2)/(eps+B(2))**2
w(3) = Gma(3)/(eps+B(3))**2
W_iph(n_prim) = SUM(w*PHI_hat)/SUM(w)
!==============================================================================================
! Next cell Left face
PHI_L = Wp(-1:3,n_prim)
PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
PHI_hat(2) = ( -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) - PHI_L(-2))/6.d0
! Smoothness indicators
B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
! Non-linear weights
w(1) = Gma(1)/(eps+B(1))**2
w(2) = Gma(2)/(eps+B(2))**2
w(3) = Gma(3)/(eps+B(3))**2
W_imh(n_prim) = SUM(w*PHI_hat)/SUM(w)
ENDDO
DO n_prim = 1,nprims
Qp_iphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
R_Eig(n_prim,2)*W_iph(2) + &
R_Eig(n_prim,3)*W_iph(3) + &
R_Eig(n_prim,4)*W_iph(4) + &
R_Eig(n_prim,5)*W_iph(5)
Qp_iphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
R_Eig(n_prim,2)*W_imh(2) + &
R_Eig(n_prim,3)*W_imh(3) + &
R_Eig(n_prim,4)*W_imh(4) + &
R_Eig(n_prim,5)*W_imh(5)
ENDDO
! PHI_iphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
! PHI_iphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
endif
ENDDO
ENDDO
ENDDO
ENDDO
!$acc parallel loop gang collapse(3) private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,Gma,PHI_hat)
DO nbl = 1,nblocks
DO k = 1,NKmax
DO j = 0,NJmax
!$acc loop vector
DO i = 1,NImax
if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
! Roe-averaged 'Rho' and 'c' on j+1/2 face
sqrt_rho =sqrt(Qp(i,j+1,k,nbl,1)/Qp(i,j,k,nbl,1))
rho =sqrt_rho*Qp(i,j,k,nbl,1)
divisor =1.0d0/(sqrt_rho+1.0d0)
p = (Qp(i,j,k,nbl,5) + (Qp(i,j+1,k,nbl,5)*sqrt_rho))*divisor
c = sqrt(gamma*p/rho)
! Left and right Eigen vectors on j+1/2 face
!--------------------------------
L_Eig(1,1) = 0.d0 ; R_Eig(1,1) = 0.5d0/c**2.d0
L_Eig(2,1) = Jx_c(i,j,k,nbl)*c**2.d0 ; R_Eig(2,1) = -0.5d0*Jx_c(i,j,k,nbl)/rho/c
L_Eig(3,1) = Jz_c(i,j,k,nbl)*c**2.d0 ; R_Eig(3,1) = -0.5d0*Jy_c(i,j,k,nbl)/rho/c
L_Eig(4,1) = -Jy_c(i,j,k,nbl)*c**2.d0 ; R_Eig(4,1) = -0.5d0*Jz_c(i,j,k,nbl)/rho/c
L_Eig(5,1) = 0.d0 ; R_Eig(5,1) = 0.5d0
;
L_Eig(1,2) = -Jx_c(i,j,k,nbl)*rho*c ; R_Eig(1,2) = Jx_c(i,j,k,nbl)/c**2.d0
L_Eig(2,2) = 0.d0 ; R_Eig(2,2) = 0.d0
L_Eig(3,2) = -Jy_c(i,j,k,nbl) ; R_Eig(3,2) = -Jz_c(i,j,k,nbl)
L_Eig(4,2) = -Jz_c(i,j,k,nbl) ; R_Eig(4,2) = Jy_c(i,j,k,nbl)
L_Eig(5,2) = Jx_c(i,j,k,nbl)*rho*c ; R_Eig(5,2) = 0.d0
;
L_Eig(1,3) = -Jy_c(i,j,k,nbl)*rho*c ; R_Eig(1,3) = Jz_c(i,j,k,nbl)/c**2.d0
L_Eig(2,3) = -Jz_c(i,j,k,nbl) ; R_Eig(2,3) = -Jy_c(i,j,k,nbl)
L_Eig(3,3) = Jx_c(i,j,k,nbl) ; R_Eig(3,3) = Jx_c(i,j,k,nbl)
L_Eig(4,3) = 0.d0 ; R_Eig(4,3) = 0.d0
L_Eig(5,3) = Jy_c(i,j,k,nbl)*rho*c ; R_Eig(5,3) = 0.d0
;
L_Eig(1,4) = -Jz_c(i,j,k,nbl)*rho*c ; R_Eig(1,4) = -Jy_c(i,j,k,nbl)/c**2.d0
L_Eig(2,4) = Jy_c(i,j,k,nbl) ; R_Eig(2,4) = -Jz_c(i,j,k,nbl)
L_Eig(3,4) = 0.d0 ; R_Eig(3,4) = 0.d0
L_Eig(4,4) = Jx_c(i,j,k,nbl) ; R_Eig(4,4) = Jx_c(i,j,k,nbl)
L_Eig(5,4) = Jz_c(i,j,k,nbl)*rho*c ; R_Eig(5,4) = 0.d0
;
L_Eig(1,5) = 1.d0 ; R_Eig(1,5) = 0.5d0/c**2.d0
L_Eig(2,5) = -Jx_c(i,j,k,nbl) ; R_Eig(2,5) = 0.5d0*Jx_c(i,j,k,nbl)/rho/c
L_Eig(3,5) = -Jz_c(i,j,k,nbl) ; R_Eig(3,5) = 0.5d0*Jy_c(i,j,k,nbl)/rho/c
L_Eig(4,5) = Jy_c(i,j,k,nbl) ; R_Eig(4,5) = 0.5d0*Jz_c(i,j,k,nbl)/rho/c
L_Eig(5,5) = 1.d0 ; R_Eig(5,5) = 0.5d0
!------------------------------------------------------------------------------------------
! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
DO II=-2,3
! Wp(II,:) = matmul(L_Eig,Qp(i,j+II,k,nbl,:))
DO n_prim = 1,nprims
Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i,j+II,k,nbl,1) +&
L_Eig(n_prim,2)*Qp(i,j+II,k,nbl,2) +&
L_Eig(n_prim,3)*Qp(i,j+II,k,nbl,3) +&
L_Eig(n_prim,4)*Qp(i,j+II,k,nbl,4) +&
L_Eig(n_prim,5)*Qp(i,j+II,k,nbl,5)
ENDDO
ENDDO
!========================================================
DO n_prim = 1,nprims
!$ Cell Right face
PHI_L = Wp(-2:2,n_prim)
! Polynomials
PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
PHI_hat(2) = ( -PHI_L(-1) +5.d0*PHI_L(0) + 2.d0*PHI_L(1))/6.d0
PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1) - PHI_L(2))/6.d0
! Smoothness indicators
B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*( PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*( PHI_L(-1) - PHI_L(1))**2
B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(1) +PHI_L(2))**2
! Non-linear weights
w(1) = Gma(1)/(eps+B(1))**2
w(2) = Gma(2)/(eps+B(2))**2
w(3) = Gma(3)/(eps+B(3))**2
W_iph(n_prim) = SUM(w*PHI_hat)/SUM(w)
!==============================================================================================
!$ Next cell Left face
PHI_L = Wp(-1:3,n_prim)
PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
PHI_hat(2) = ( -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) - PHI_L(-2))/6.d0
! Smoothness indicators
B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
! Non-linear weights
w(1) = Gma(1)/(eps+B(1))**2
w(2) = Gma(2)/(eps+B(2))**2
w(3) = Gma(3)/(eps+B(3))**2
W_imh(n_prim) = SUM(w*PHI_hat)/SUM(w)
ENDDO
DO n_prim = 1,nprims
Qp_jphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
R_Eig(n_prim,2)*W_iph(2) + &
R_Eig(n_prim,3)*W_iph(3) + &
R_Eig(n_prim,4)*W_iph(4) + &
R_Eig(n_prim,5)*W_iph(5)
Qp_jphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
R_Eig(n_prim,2)*W_imh(2) + &
R_Eig(n_prim,3)*W_imh(3) + &
R_Eig(n_prim,4)*W_imh(4) + &
R_Eig(n_prim,5)*W_imh(5)
ENDDO
! PHI_jphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
! PHI_jphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
endif
ENDDO
ENDDO
ENDDO
ENDDO
!!$acc wait
END
-Minfo=accel for this subroutine:
mp5_face_interpolation_with_prims_chars:
250, Generating Tesla code
251, !$acc loop gang collapse(3) ! blockidx%x
252, ! blockidx%x collapsed
253, ! blockidx%x collapsed
255, !$acc loop vector(128) ! threadidx%x
303, !$acc loop seq
307, !$acc loop seq
319, !$acc loop seq
322, !$acc loop seq
339, !$acc loop seq
343, !$acc loop seq
359, !$acc loop seq
364, !$acc loop seq
250, CUDA shared memory used for phi_hat,w,phi_l,b,gma
Generating implicit copyin(ix_c(0:nimax,1:njmax,1:nkmax,1:nblocks),iy_c(0:nimax,1:njmax,1:nkmax,1:nblocks),nk(1:nblocks),qp(-2:nimax+3,1:njmax,1:nkmax,1:nblocks,1:5)) [if not already present]
Generating implicit copy(qp_iphl(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims),qp_iphr(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims)) [if not already present]
Generating implicit copyin(iz_c(0:nimax,1:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks)) [if not already present]
255, Loop is parallelizable
303, Loop is parallelizable
308, Accelerator restriction: induction variable live-out from loop: n_prim
313, Accelerator restriction: induction variable live-out from loop: n_prim
319, Loop carried reuse of phi_hat,b prevents parallelization
Complex loop carried dependence of phi_hat,b prevents parallelization
Loop carried reuse of phi_l,w prevents parallelization
Complex loop carried dependence of w prevents parallelization
322, Accelerator restriction: induction variable live-out from loop: n_prim
339, Accelerator restriction: induction variable live-out from loop: n_prim
Loop is parallelizable
343, Accelerator restriction: induction variable live-out from loop: n_prim
359, Accelerator restriction: induction variable live-out from loop: n_prim
Loop is parallelizable
361, Accelerator restriction: induction variable live-out from loop: n_prim
366, Accelerator restriction: induction variable live-out from loop: n_prim
372, Accelerator restriction: induction variable live-out from loop: n_prim
377, Accelerator restriction: induction variable live-out from loop: n_prim
389, Generating Tesla code
390, !$acc loop gang collapse(3) ! blockidx%x
391, ! blockidx%x collapsed
392, ! blockidx%x collapsed
394, !$acc loop vector(128) ! threadidx%x
441, !$acc loop seq
444, !$acc loop seq
455, !$acc loop seq
458, !$acc loop seq
475, !$acc loop seq
479, !$acc loop seq
495, !$acc loop seq
499, !$acc loop seq
389, CUDA shared memory used for phi_hat,w,phi_l,b,gma
Generating implicit copyin(jx_c(1:nimax,0:njmax,1:nkmax,1:nblocks),jy_c(1:nimax,0:njmax,1:nkmax,1:nblocks),nk(1:nblocks),qp(1:nimax,-2:njmax+3,1:nkmax,1:nblocks,1:5)) [if not already present]
Generating implicit copy(qp_jphr(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims),qp_jphl(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims)) [if not already present]
Generating implicit copyin(jz_c(1:nimax,0:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks)) [if not already present]
394, Loop is parallelizable
441, Loop is parallelizable
445, Accelerator restriction: induction variable live-out from loop: n_prim
450, Accelerator restriction: induction variable live-out from loop: n_prim
455, Loop carried reuse of phi_hat,b prevents parallelization
Complex loop carried dependence of phi_hat,b prevents parallelization
Loop carried reuse of phi_l,w prevents parallelization
Complex loop carried dependence of w prevents parallelization
458, Accelerator restriction: induction variable live-out from loop: n_prim
475, Accelerator restriction: induction variable live-out from loop: n_prim
Loop is parallelizable
479, Accelerator restriction: induction variable live-out from loop: n_prim
495, Accelerator restriction: induction variable live-out from loop: n_prim
Loop is parallelizable
497, Accelerator restriction: induction variable live-out from loop: n_prim
501, Accelerator restriction: induction variable live-out from loop: n_prim
507, Accelerator restriction: induction variable live-out from loop: n_prim
512, Accelerator restriction: induction variable live-out from loop: n_prim
Thanks!