OpenACC: Parallelizing a loop structure with some operations and loops

I am trying to parallelize a complex loop structure as shown below. It has some calculation activities, internal DO loops and private arrays in it. I am using a few directives to port it to GPU as shown in code below. The problem is that I am seeing results with NaN values from it, and that is because of porting this subroutine to GPU. I tried to change the loop schedule but no luck, please kindly point out the mistake in this…

Subroutine:

SUBROUTINE MP5_FACE_INTERPOLATION_WITH_PRIMS_CHARS()

	! Task: Given 'Primitive vars- Qp' at cell centers, interpolate those values on to faces
	! Before interpolating Qp, Qp is transformed to Wp [d(Wp) = L*d(Qp)] (characteristic variables)

	use declare_variables
	implicit none
	
	integer :: II
	double precision, dimension(nprims) :: W_iph, W_imh
	double precision, dimension(-2:3,nprims) :: Wp
    double precision, dimension(nprims,nprims) :: L_Eig, R_Eig
    double precision :: p, c, sqrt_rho, divisor, rho
	
	double precision, dimension(3) :: PHI_hat, w, Gma, B
	double precision, dimension(-2:2) :: PHI_L
	double precision, parameter :: eps = 1.d-15
	
	Gma(1) = 1.d0/10.d0
	Gma(2) = 3.d0/5.d0
	Gma(3) = 3.d0/10.d0
	
	!$acc parallel loop gang collapse(3) private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,Gma,PHI_hat)
	DO nbl = 1,nblocks
	DO k = 1,NKmax
	DO j = 1,NJmax
	!$acc loop vector
	DO i = 0,NImax
	if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then	
	
		! Roe-averaged 'Rho' and 'c' on i+1/2 face
		sqrt_rho = sqrt(Qp(i+1,j,k,nbl,1)/Qp(i,j,k,nbl,1))
		rho      = sqrt_rho*Qp(i,j,k,nbl,1)
		divisor  = 1.d0/(sqrt_rho+1.d0)
		
		p   = (Qp(i,j,k,nbl,5) + (Qp(i+1,j,k,nbl,5)*sqrt_rho))*divisor
		c   = sqrt(gamma*p/rho)
		
		! Left and right Eigen vectors on i+1/2 face
		!--------------------------------------------
		
		L_Eig(1,1) = 0.d0							;	R_Eig(1,1) = 0.5d0/c**2.d0
		L_Eig(2,1) = Ix_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(2,1) = -0.5d0*Ix_c(i,j,k,nbl)/rho/c
		L_Eig(3,1) = Iz_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(3,1) = -0.5d0*Iy_c(i,j,k,nbl)/rho/c
		L_Eig(4,1) = -Iy_c(i,j,k,nbl)*c**2.d0       ;	R_Eig(4,1) = -0.5d0*Iz_c(i,j,k,nbl)/rho/c
		L_Eig(5,1) = 0.d0                           ;	R_Eig(5,1) = 0.5d0
													;	
		L_Eig(1,2) = -Ix_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,2) = Ix_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,2) = 0.d0                           ;	R_Eig(2,2) = 0.d0
		L_Eig(3,2) = -Iy_c(i,j,k,nbl)               ;	R_Eig(3,2) = -Iz_c(i,j,k,nbl)
		L_Eig(4,2) = -Iz_c(i,j,k,nbl)               ;	R_Eig(4,2) = Iy_c(i,j,k,nbl)
		L_Eig(5,2) = Ix_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,2) = 0.d0
													;	
		L_Eig(1,3) = -Iy_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,3) = Iz_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,3) = -Iz_c(i,j,k,nbl)               ;	R_Eig(2,3) = -Iy_c(i,j,k,nbl)
		L_Eig(3,3) = Ix_c(i,j,k,nbl)                ;	R_Eig(3,3) = Ix_c(i,j,k,nbl)
		L_Eig(4,3) = 0.d0                           ;	R_Eig(4,3) = 0.d0
		L_Eig(5,3) = Iy_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,3) = 0.d0
													;	
		L_Eig(1,4) = -Iz_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,4) = -Iy_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,4) = Iy_c(i,j,k,nbl)                ;	R_Eig(2,4) = -Iz_c(i,j,k,nbl)
		L_Eig(3,4) = 0.d0                           ;	R_Eig(3,4) = 0.d0
		L_Eig(4,4) = Ix_c(i,j,k,nbl)                ;	R_Eig(4,4) = Ix_c(i,j,k,nbl)
		L_Eig(5,4) = Iz_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,4) = 0.d0
													;	
		L_Eig(1,5) = 1.d0                           ;	R_Eig(1,5) = 0.5d0/c**2.d0
		L_Eig(2,5) = -Ix_c(i,j,k,nbl)               ;	R_Eig(2,5) = 0.5d0*Ix_c(i,j,k,nbl)/rho/c
		L_Eig(3,5) = -Iz_c(i,j,k,nbl)               ;	R_Eig(3,5) = 0.5d0*Iy_c(i,j,k,nbl)/rho/c
		L_Eig(4,5) = Iy_c(i,j,k,nbl)                ;	R_Eig(4,5) = 0.5d0*Iz_c(i,j,k,nbl)/rho/c
		L_Eig(5,5) = 1.d0                           ;	R_Eig(5,5) = 0.5d0
		
		!--------------------------------------------------------------------------------------------
		
		! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
		
		DO II=-2,3
			! Charactristic primitives
			!Wp(II,:) = matmul(L_Eig,Qp(i+II,j,k,nbl,:))
			
			DO n_prim = 1,nprims
			Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i+II,j,k,nbl,1) +&
							L_Eig(n_prim,2)*Qp(i+II,j,k,nbl,2) +&
							L_Eig(n_prim,3)*Qp(i+II,j,k,nbl,3) +&
							L_Eig(n_prim,4)*Qp(i+II,j,k,nbl,4) +&
							L_Eig(n_prim,5)*Qp(i+II,j,k,nbl,5) 
			ENDDO
			
		ENDDO
		
		!========================================================
		
		DO n_prim = 1,nprims
		
			! Cell Right face
			PHI_L = Wp(-2:2,n_prim)
			
			! Polynomials
			PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
			PHI_hat(2) = (    -PHI_L(-1) +5.d0*PHI_L(0)  + 2.d0*PHI_L(1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1)  -      PHI_L(2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*(     PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*(     PHI_L(-1) -     PHI_L(1))**2
			B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0)  -4.d0*PHI_L(1) +PHI_L(2))**2
			
			! Non-linear weights
			w(1) = Gma(1)/(eps+B(1))**2
			w(2) = Gma(2)/(eps+B(2))**2
			w(3) = Gma(3)/(eps+B(3))**2
			
			W_iph(n_prim) = SUM(w*PHI_hat)/SUM(w)
			
			!==============================================================================================
			! Next cell Left face
			PHI_L = Wp(-1:3,n_prim)
			
			PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
			PHI_hat(2) = (    -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) -      PHI_L(-2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
			B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
			
			! Non-linear weights
			w(1) = Gma(1)/(eps+B(1))**2
			w(2) = Gma(2)/(eps+B(2))**2
			w(3) = Gma(3)/(eps+B(3))**2
			
			W_imh(n_prim) = SUM(w*PHI_hat)/SUM(w)
			
		ENDDO
		
		
		DO n_prim = 1,nprims
		
		Qp_iphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
									 R_Eig(n_prim,2)*W_iph(2) + &
									 R_Eig(n_prim,3)*W_iph(3) + &
									 R_Eig(n_prim,4)*W_iph(4) + &
									 R_Eig(n_prim,5)*W_iph(5)
									 
		Qp_iphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
									 R_Eig(n_prim,2)*W_imh(2) + &
									 R_Eig(n_prim,3)*W_imh(3) + &
									 R_Eig(n_prim,4)*W_imh(4) + &
									 R_Eig(n_prim,5)*W_imh(5)
		ENDDO
		
		! PHI_iphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
		! PHI_iphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
		
	endif
	ENDDO
	ENDDO
	ENDDO
	ENDDO
	
	
	!$acc parallel loop gang collapse(3) private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,Gma,PHI_hat)
	DO nbl = 1,nblocks
	DO k = 1,NKmax
	DO j = 0,NJmax
	!$acc loop vector
	DO i = 1,NImax
	if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
		! Roe-averaged 'Rho' and 'c' on j+1/2 face
		sqrt_rho =sqrt(Qp(i,j+1,k,nbl,1)/Qp(i,j,k,nbl,1))
		rho      =sqrt_rho*Qp(i,j,k,nbl,1)
		divisor  =1.0d0/(sqrt_rho+1.0d0)
		
		p   = (Qp(i,j,k,nbl,5) + (Qp(i,j+1,k,nbl,5)*sqrt_rho))*divisor
		c   = sqrt(gamma*p/rho)
		
		! Left and right Eigen vectors on j+1/2 face
		!--------------------------------
		
		L_Eig(1,1) = 0.d0							;	R_Eig(1,1) = 0.5d0/c**2.d0
		L_Eig(2,1) = Jx_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(2,1) = -0.5d0*Jx_c(i,j,k,nbl)/rho/c
		L_Eig(3,1) = Jz_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(3,1) = -0.5d0*Jy_c(i,j,k,nbl)/rho/c
		L_Eig(4,1) = -Jy_c(i,j,k,nbl)*c**2.d0       ;	R_Eig(4,1) = -0.5d0*Jz_c(i,j,k,nbl)/rho/c
		L_Eig(5,1) = 0.d0                           ;	R_Eig(5,1) = 0.5d0
													;	
		L_Eig(1,2) = -Jx_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,2) = Jx_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,2) = 0.d0                           ;	R_Eig(2,2) = 0.d0
		L_Eig(3,2) = -Jy_c(i,j,k,nbl)               ;	R_Eig(3,2) = -Jz_c(i,j,k,nbl)
		L_Eig(4,2) = -Jz_c(i,j,k,nbl)               ;	R_Eig(4,2) = Jy_c(i,j,k,nbl)
		L_Eig(5,2) = Jx_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,2) = 0.d0
													;	
		L_Eig(1,3) = -Jy_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,3) = Jz_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,3) = -Jz_c(i,j,k,nbl)               ;	R_Eig(2,3) = -Jy_c(i,j,k,nbl)
		L_Eig(3,3) = Jx_c(i,j,k,nbl)                ;	R_Eig(3,3) = Jx_c(i,j,k,nbl)
		L_Eig(4,3) = 0.d0                           ;	R_Eig(4,3) = 0.d0
		L_Eig(5,3) = Jy_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,3) = 0.d0
													;	
		L_Eig(1,4) = -Jz_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,4) = -Jy_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,4) = Jy_c(i,j,k,nbl)                ;	R_Eig(2,4) = -Jz_c(i,j,k,nbl)
		L_Eig(3,4) = 0.d0                           ;	R_Eig(3,4) = 0.d0
		L_Eig(4,4) = Jx_c(i,j,k,nbl)                ;	R_Eig(4,4) = Jx_c(i,j,k,nbl)
		L_Eig(5,4) = Jz_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,4) = 0.d0
													;	
		L_Eig(1,5) = 1.d0                           ;	R_Eig(1,5) = 0.5d0/c**2.d0
		L_Eig(2,5) = -Jx_c(i,j,k,nbl)               ;	R_Eig(2,5) = 0.5d0*Jx_c(i,j,k,nbl)/rho/c
		L_Eig(3,5) = -Jz_c(i,j,k,nbl)               ;	R_Eig(3,5) = 0.5d0*Jy_c(i,j,k,nbl)/rho/c
		L_Eig(4,5) = Jy_c(i,j,k,nbl)                ;	R_Eig(4,5) = 0.5d0*Jz_c(i,j,k,nbl)/rho/c
		L_Eig(5,5) = 1.d0                           ;	R_Eig(5,5) = 0.5d0
		
		!------------------------------------------------------------------------------------------
			
		! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
		
		DO II=-2,3
			! Wp(II,:) = matmul(L_Eig,Qp(i,j+II,k,nbl,:))
			
			DO n_prim = 1,nprims
			Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i,j+II,k,nbl,1) +&
							L_Eig(n_prim,2)*Qp(i,j+II,k,nbl,2) +&
							L_Eig(n_prim,3)*Qp(i,j+II,k,nbl,3) +&
							L_Eig(n_prim,4)*Qp(i,j+II,k,nbl,4) +&
							L_Eig(n_prim,5)*Qp(i,j+II,k,nbl,5) 
			ENDDO
		ENDDO
		
		!========================================================
		
		DO n_prim = 1,nprims
		
			!$ Cell Right face
			PHI_L = Wp(-2:2,n_prim)
			
			! Polynomials
			PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
			PHI_hat(2) = (    -PHI_L(-1) +5.d0*PHI_L(0)  + 2.d0*PHI_L(1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1)  -      PHI_L(2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*(     PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*(     PHI_L(-1) -     PHI_L(1))**2
			B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0)  -4.d0*PHI_L(1) +PHI_L(2))**2
			
			! Non-linear weights
			w(1) = Gma(1)/(eps+B(1))**2
			w(2) = Gma(2)/(eps+B(2))**2
			w(3) = Gma(3)/(eps+B(3))**2
			
			W_iph(n_prim) = SUM(w*PHI_hat)/SUM(w)
			
			!==============================================================================================
			!$ Next cell Left face
			PHI_L = Wp(-1:3,n_prim)
			
			PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
			PHI_hat(2) = (    -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) -      PHI_L(-2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
			B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
			
			! Non-linear weights
			w(1) = Gma(1)/(eps+B(1))**2
			w(2) = Gma(2)/(eps+B(2))**2
			w(3) = Gma(3)/(eps+B(3))**2
			
			W_imh(n_prim) = SUM(w*PHI_hat)/SUM(w)
			
		ENDDO
		
		DO n_prim = 1,nprims
		
		Qp_jphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
									 R_Eig(n_prim,2)*W_iph(2) + &
									 R_Eig(n_prim,3)*W_iph(3) + &
									 R_Eig(n_prim,4)*W_iph(4) + &
									 R_Eig(n_prim,5)*W_iph(5)
									 
		Qp_jphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
									 R_Eig(n_prim,2)*W_imh(2) + &
									 R_Eig(n_prim,3)*W_imh(3) + &
									 R_Eig(n_prim,4)*W_imh(4) + &
									 R_Eig(n_prim,5)*W_imh(5)
		ENDDO
		
		! PHI_jphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
		! PHI_jphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
		
	endif
	ENDDO
	ENDDO
	ENDDO
	ENDDO
	
	
	!!$acc wait


END

-Minfo=accel for this subroutine:

mp5_face_interpolation_with_prims_chars:
    250, Generating Tesla code
        251, !$acc loop gang collapse(3) ! blockidx%x
        252,   ! blockidx%x collapsed
        253,   ! blockidx%x collapsed
        255, !$acc loop vector(128) ! threadidx%x
        303, !$acc loop seq
        307, !$acc loop seq
        319, !$acc loop seq
        322, !$acc loop seq
        339, !$acc loop seq
        343, !$acc loop seq
        359, !$acc loop seq
        364, !$acc loop seq
    250, CUDA shared memory used for phi_hat,w,phi_l,b,gma
         Generating implicit copyin(ix_c(0:nimax,1:njmax,1:nkmax,1:nblocks),iy_c(0:nimax,1:njmax,1:nkmax,1:nblocks),nk(1:nblocks),qp(-2:nimax+3,1:njmax,1:nkmax,1:nblocks,1:5)) [if not already present]
         Generating implicit copy(qp_iphl(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims),qp_iphr(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims)) [if not already present]
         Generating implicit copyin(iz_c(0:nimax,1:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks)) [if not already present]
    255, Loop is parallelizable
    303, Loop is parallelizable
    308, Accelerator restriction: induction variable live-out from loop: n_prim
    313, Accelerator restriction: induction variable live-out from loop: n_prim
    319, Loop carried reuse of phi_hat,b prevents parallelization
         Complex loop carried dependence of phi_hat,b prevents parallelization
         Loop carried reuse of phi_l,w prevents parallelization
         Complex loop carried dependence of w prevents parallelization
    322, Accelerator restriction: induction variable live-out from loop: n_prim
    339, Accelerator restriction: induction variable live-out from loop: n_prim
         Loop is parallelizable
    343, Accelerator restriction: induction variable live-out from loop: n_prim
    359, Accelerator restriction: induction variable live-out from loop: n_prim
         Loop is parallelizable
    361, Accelerator restriction: induction variable live-out from loop: n_prim
    366, Accelerator restriction: induction variable live-out from loop: n_prim
    372, Accelerator restriction: induction variable live-out from loop: n_prim
    377, Accelerator restriction: induction variable live-out from loop: n_prim
    389, Generating Tesla code
        390, !$acc loop gang collapse(3) ! blockidx%x
        391,   ! blockidx%x collapsed
        392,   ! blockidx%x collapsed
        394, !$acc loop vector(128) ! threadidx%x
        441, !$acc loop seq
        444, !$acc loop seq
        455, !$acc loop seq
        458, !$acc loop seq
        475, !$acc loop seq
        479, !$acc loop seq
        495, !$acc loop seq
        499, !$acc loop seq
    389, CUDA shared memory used for phi_hat,w,phi_l,b,gma
         Generating implicit copyin(jx_c(1:nimax,0:njmax,1:nkmax,1:nblocks),jy_c(1:nimax,0:njmax,1:nkmax,1:nblocks),nk(1:nblocks),qp(1:nimax,-2:njmax+3,1:nkmax,1:nblocks,1:5)) [if not already present]
         Generating implicit copy(qp_jphr(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims),qp_jphl(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims)) [if not already present]
         Generating implicit copyin(jz_c(1:nimax,0:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks)) [if not already present]
    394, Loop is parallelizable
    441, Loop is parallelizable
    445, Accelerator restriction: induction variable live-out from loop: n_prim
    450, Accelerator restriction: induction variable live-out from loop: n_prim
    455, Loop carried reuse of phi_hat,b prevents parallelization
         Complex loop carried dependence of phi_hat,b prevents parallelization
         Loop carried reuse of phi_l,w prevents parallelization
         Complex loop carried dependence of w prevents parallelization
    458, Accelerator restriction: induction variable live-out from loop: n_prim
    475, Accelerator restriction: induction variable live-out from loop: n_prim
         Loop is parallelizable
    479, Accelerator restriction: induction variable live-out from loop: n_prim
    495, Accelerator restriction: induction variable live-out from loop: n_prim
         Loop is parallelizable
    497, Accelerator restriction: induction variable live-out from loop: n_prim
    501, Accelerator restriction: induction variable live-out from loop: n_prim
    507, Accelerator restriction: induction variable live-out from loop: n_prim
    512, Accelerator restriction: induction variable live-out from loop: n_prim

Thanks!

“n_prim” is a module variable so has global storage which will prevent parallelization. To fix, add “n_prim” to a “private” clause or use a local variable of the index.

-Mat

1 Like

Hi, even with n_prim in private, its still the same.

I did all the corrections you have mentioned in the other post (Gma into private, manual matrix multiplication etc.). I am seeing an excellent performance (around 17x) and I am very happy, but the final result (Qp - is my final result) is all NaNs. Did I miss anything else?

Actually on second look, Gma should not be privatized, or at least made “firstprivate” since it’s initialized before the loops. private variables aren’t initialized so these values will be garbage.

        Gma(1) = 1.d0/10.d0
        Gma(2) = 3.d0/5.d0
        Gma(3) = 3.d0/10.d0

        !$acc data copyin(Gma)
        !$acc parallel loop default(present) gang collapse(3) private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,n_prim,PHI_hat)
        DO nbl = 1,nblocks
        DO k = 1,NKmax
        DO j = 1,NJmax

Note since Gma is a fixed size stack array, so it can’t be managed using Unified Memory and must be added to a data region.

How can I tell if the answers are correct? The two output files (grid.xyz, flow.f) are in binary.

1 Like

Sorry about that. I should have printed it on screen. Can you please add the following, after line number 100 (call OUTPUT(1)) to print the solution on screen.

print*, 'SOLUTION'
print*, Qp(NImax/2, NJmax/2,1,1,1)
print*, Qp(NImax/2, NJmax/2,1,1,2)
print*, Qp(NImax/2, NJmax/2,1,1,3)
print*, Qp(NImax/2, NJmax/2,1,1,4)
print*, Qp(NImax/2, NJmax/2,1,1,5)

These values are coming out to be NaN for me.

BTW I am using -ta:tesla:managed flag, results are different without it.

Thanks!

Thanks, I see the Nans now. I have some other projects that I need to work on, so can give this too much attention right now, but usually these types of things are either a missing data synchronization or some race condition or other dependency in the parallel loops.

I did a quick test of changing the parallel loops to “serial” loops so that the code runs sequentially on the device. Nans no longer occur suggesting that there’s a data race someplace. My next step would be to that incrementally start adding back parallel until I can isolate which loop(s) the data race occur.

I started but will need to stop for now. The loops in BOUNDARY_CONDITIONS are suspect since there appears to be a dependency. There might be others as well

Thanks Mat. Those hints will help. I will take some time and try to debug it myself. I will let you know. Thanks again. I am loving OpenACC and the technical support from you people at NVIDIA.

Hi Mat, with the strategy you told it clearly appears that the error is in the “MP5_FACE_INTERPOLATION_WITH_PRIMS_CHARS” subroutine. I came to this conclusion by observing no NaN values appearing on the screen when I sequentially ran this subroutine on the gpu (by using ‘!$acc loop seq’ directive) and remaining subroutines with parallelization. But I tried my best but could not figure out what is going wrong when these loops are being parallelized. The -Minfo is saying: ‘CUDA shared memory used for phi_hat,b,w,phi_l’. I dont exactly understand why these arrays are going in to CUDA sharing. Please kindly have a look at it once. The full Minfo message for the subroutine is like this:

mp5_face_interpolation_with_prims_chars:
    246, Generating Tesla code
        248, !$acc loop gang collapse(3) ! blockidx%x
        250,   ! blockidx%x collapsed
        252,   ! blockidx%x collapsed
        255, !$acc loop vector(128) ! threadidx%x
        303, !$acc loop seq
        307, !$acc loop seq
        320, !$acc loop seq
        374, !$acc loop seq
    246, CUDA shared memory used for phi_hat,b,w,phi_l
         Generating default present(nk(1:nblocks),iy_c(0:nimax,1:njmax,1:nkmax,1:nblocks),ix_c(0:nimax,1:njmax,1:nkmax,1:nblocks),qp(-2:nimax+3,1:njmax,1:nkmax,1:nblocks,1:5),qp_iphl(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims),qp_iphr(0:nimax,1:njmax,1:nkmax,1:nblocks,1:nprims),iz_c(0:nimax,1:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks))
    255, Loop is parallelizable
    399, Generating Tesla code
        401, !$acc loop gang collapse(3) ! blockidx%x
        403,   ! blockidx%x collapsed
        405,   ! blockidx%x collapsed
        408, !$acc loop vector(128) ! threadidx%x
        455, !$acc loop seq
        458, !$acc loop seq
        469, !$acc loop seq
        524, !$acc loop seq
    399, CUDA shared memory used for phi_hat,b,w,phi_l
         Generating default present(nk(1:nblocks),jy_c(1:nimax,0:njmax,1:nkmax,1:nblocks),jx_c(1:nimax,0:njmax,1:nkmax,1:nblocks),qp(1:nimax,-2:njmax+3,1:nkmax,1:nblocks,1:5),qp_jphr(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims),qp_jphl(1:nimax,0:njmax,1:nkmax,1:nblocks,1:nprims),jz_c(1:nimax,0:njmax,1:nkmax,1:nblocks),ni(1:nblocks),nj(1:nblocks))
    408, Loop is parallelizable

The corresponding subroutine is like this:

SUBROUTINE MP5_FACE_INTERPOLATION_WITH_PRIMS_CHARS()

	use declare_variables
	implicit none
	
	integer :: II
	double precision, dimension(nprims) :: W_iph, W_imh
	double precision, dimension(-2:3,nprims) :: Wp
    double precision, dimension(nprims,nprims) :: L_Eig, R_Eig
    double precision :: p, c, sqrt_rho, divisor, rho
	
	double precision, dimension(3) :: PHI_hat, w, B
	double precision, dimension(-2:2) :: PHI_L
	double precision, parameter :: eps = 1.d-15
	


	!$acc parallel loop gang collapse(3) default(present) private(L_Eig,R_Eig,Wp,W_iph,W_imh,n_prim,PHI_L,PHI_hat,B,w)
	!!$acc loop seq private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,n_prim,PHI_hat)
	DO nbl = 1,nblocks
	!!$acc loop seq
	DO k = 1,NKmax
	!!$acc loop seq
	DO j = 1,NJmax
	!$acc loop vector
	!!$acc loop seq
	DO i = 0,NImax
	if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then	
	
		! Roe-averaged 'Rho' and 'c' on i+1/2 face
		sqrt_rho = sqrt(Qp(i+1,j,k,nbl,1)/Qp(i,j,k,nbl,1))
		rho      = sqrt_rho*Qp(i,j,k,nbl,1)
		divisor  = 1.d0/(sqrt_rho+1.d0)
		
		p   = (Qp(i,j,k,nbl,5) + (Qp(i+1,j,k,nbl,5)*sqrt_rho))*divisor
		c   = sqrt(gamma*p/rho)
		
		! Left and right Eigen vectors on i+1/2 face
		!--------------------------------------------
		
		L_Eig(1,1) = 0.d0							;	R_Eig(1,1) = 0.5d0/c**2.d0
		L_Eig(2,1) = Ix_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(2,1) = -0.5d0*Ix_c(i,j,k,nbl)/rho/c
		L_Eig(3,1) = Iz_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(3,1) = -0.5d0*Iy_c(i,j,k,nbl)/rho/c
		L_Eig(4,1) = -Iy_c(i,j,k,nbl)*c**2.d0       ;	R_Eig(4,1) = -0.5d0*Iz_c(i,j,k,nbl)/rho/c
		L_Eig(5,1) = 0.d0                           ;	R_Eig(5,1) = 0.5d0
													;	
		L_Eig(1,2) = -Ix_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,2) = Ix_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,2) = 0.d0                           ;	R_Eig(2,2) = 0.d0
		L_Eig(3,2) = -Iy_c(i,j,k,nbl)               ;	R_Eig(3,2) = -Iz_c(i,j,k,nbl)
		L_Eig(4,2) = -Iz_c(i,j,k,nbl)               ;	R_Eig(4,2) = Iy_c(i,j,k,nbl)
		L_Eig(5,2) = Ix_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,2) = 0.d0
													;	
		L_Eig(1,3) = -Iy_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,3) = Iz_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,3) = -Iz_c(i,j,k,nbl)               ;	R_Eig(2,3) = -Iy_c(i,j,k,nbl)
		L_Eig(3,3) = Ix_c(i,j,k,nbl)                ;	R_Eig(3,3) = Ix_c(i,j,k,nbl)
		L_Eig(4,3) = 0.d0                           ;	R_Eig(4,3) = 0.d0
		L_Eig(5,3) = Iy_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,3) = 0.d0
													;	
		L_Eig(1,4) = -Iz_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,4) = -Iy_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,4) = Iy_c(i,j,k,nbl)                ;	R_Eig(2,4) = -Iz_c(i,j,k,nbl)
		L_Eig(3,4) = 0.d0                           ;	R_Eig(3,4) = 0.d0
		L_Eig(4,4) = Ix_c(i,j,k,nbl)                ;	R_Eig(4,4) = Ix_c(i,j,k,nbl)
		L_Eig(5,4) = Iz_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,4) = 0.d0
													;	
		L_Eig(1,5) = 1.d0                           ;	R_Eig(1,5) = 0.5d0/c**2.d0
		L_Eig(2,5) = -Ix_c(i,j,k,nbl)               ;	R_Eig(2,5) = 0.5d0*Ix_c(i,j,k,nbl)/rho/c
		L_Eig(3,5) = -Iz_c(i,j,k,nbl)               ;	R_Eig(3,5) = 0.5d0*Iy_c(i,j,k,nbl)/rho/c
		L_Eig(4,5) = Iy_c(i,j,k,nbl)                ;	R_Eig(4,5) = 0.5d0*Iz_c(i,j,k,nbl)/rho/c
		L_Eig(5,5) = 1.d0                           ;	R_Eig(5,5) = 0.5d0
		
		!--------------------------------------------------------------------------------------------
		
		! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
		!$acc loop seq
		DO II=-2,3
			! Charactristic primitives
			!Wp(II,:) = matmul(L_Eig,Qp(i+II,j,k,nbl,:))
			!$acc loop seq
			DO n_prim = 1,nprims
			Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i+II,j,k,nbl,1) +&
							L_Eig(n_prim,2)*Qp(i+II,j,k,nbl,2) +&
							L_Eig(n_prim,3)*Qp(i+II,j,k,nbl,3) +&
							L_Eig(n_prim,4)*Qp(i+II,j,k,nbl,4) +&
							L_Eig(n_prim,5)*Qp(i+II,j,k,nbl,5) 
			ENDDO
			
		ENDDO
		
		!========================================================
		
		!$acc loop seq
		DO n_prim = 1,nprims
		
			! Cell Right face
			! PHI_L = Wp(-2:2,n_prim)
			PHI_L(-2) = Wp(-2,n_prim)
			PHI_L(-1) = Wp(-1,n_prim)
			PHI_L(+0) = Wp(+0,n_prim)
			PHI_L(+1) = Wp(+1,n_prim)
			PHI_L(+2) = Wp(+2,n_prim)
			
			! Polynomials
			PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
			PHI_hat(2) = (    -PHI_L(-1) +5.d0*PHI_L(0)  + 2.d0*PHI_L(1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1)  -      PHI_L(2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*(     PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*(     PHI_L(-1) -     PHI_L(1))**2
			B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0)  -4.d0*PHI_L(1) +PHI_L(2))**2
			
			w(1) = (1.d0/10.d0)/(eps+B(1))**2
			w(2) = (3.d0/5.d0) /(eps+B(2))**2
			w(3) = (3.d0/10.d0)/(eps+B(3))**2
			
			W_iph(n_prim) = (w(1)*PHI_hat(1)+w(2)*PHI_hat(2)+w(3)*PHI_hat(3))/(w(1)+w(2)+w(3))
		
			!==============================================================================================
			! Next cell Left face
			! PHI_L = Wp(-1:3,n_prim)
			PHI_L(-2) = Wp(-1,n_prim)
			PHI_L(-1) = Wp(+0,n_prim)
			PHI_L(+0) = Wp(+1,n_prim)
			PHI_L(+1) = Wp(+2,n_prim)
			PHI_L(+2) = Wp(+3,n_prim)
			
			PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
			PHI_hat(2) = (    -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) -      PHI_L(-2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
			B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
			
			w(1) = (1.d0/10.d0)/(eps+B(1))**2
			w(2) = (3.d0/5.d0) /(eps+B(2))**2
			w(3) = (3.d0/10.d0)/(eps+B(3))**2
			
			! W_imh(n_prim) = SUM(w*PHI_hat)/SUM(w)
			W_imh(n_prim) = (w(1)*PHI_hat(1)+w(2)*PHI_hat(2)+w(3)*PHI_hat(3))/(w(1)+w(2)+w(3))
			
		ENDDO
		
		!$acc loop seq
		DO n_prim = 1,nprims
		
			Qp_iphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
										R_Eig(n_prim,2)*W_iph(2) + &
										R_Eig(n_prim,3)*W_iph(3) + &
										R_Eig(n_prim,4)*W_iph(4) + &
										R_Eig(n_prim,5)*W_iph(5)
										
			Qp_iphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
										R_Eig(n_prim,2)*W_imh(2) + &
										R_Eig(n_prim,3)*W_imh(3) + &
										R_Eig(n_prim,4)*W_imh(4) + &
										R_Eig(n_prim,5)*W_imh(5)
		ENDDO
		
		! PHI_iphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
		! PHI_iphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
		
	endif
	ENDDO
	ENDDO
	ENDDO
	ENDDO
	
	
	!$acc parallel loop gang collapse(3) default(present) private(L_Eig,R_Eig,Wp,W_iph,W_imh,n_prim,PHI_L,PHI_hat,B,w)
	!!$acc loop seq private(L_Eig,R_Eig,Wp,W_iph,W_imh,PHI_L,B,w,n_prim,PHI_hat)
	DO nbl = 1,nblocks
	!!$acc loop seq
	DO k = 1,NKmax
	!!$acc loop seq
	DO j = 0,NJmax
	!$acc loop vector
	!!$acc loop seq
	DO i = 1,NImax
	if (k.le.NK(nbl).and.j.le.NJ(nbl).and.i.le.NI(nbl)) then
		! Roe-averaged 'Rho' and 'c' on j+1/2 face
		sqrt_rho =sqrt(Qp(i,j+1,k,nbl,1)/Qp(i,j,k,nbl,1))
		rho      =sqrt_rho*Qp(i,j,k,nbl,1)
		divisor  =1.0d0/(sqrt_rho+1.0d0)
		
		p   = (Qp(i,j,k,nbl,5) + (Qp(i,j+1,k,nbl,5)*sqrt_rho))*divisor
		c   = sqrt(gamma*p/rho)
		
		! Left and right Eigen vectors on j+1/2 face
		!--------------------------------
		
		L_Eig(1,1) = 0.d0							;	R_Eig(1,1) = 0.5d0/c**2.d0
		L_Eig(2,1) = Jx_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(2,1) = -0.5d0*Jx_c(i,j,k,nbl)/rho/c
		L_Eig(3,1) = Jz_c(i,j,k,nbl)*c**2.d0        ;	R_Eig(3,1) = -0.5d0*Jy_c(i,j,k,nbl)/rho/c
		L_Eig(4,1) = -Jy_c(i,j,k,nbl)*c**2.d0       ;	R_Eig(4,1) = -0.5d0*Jz_c(i,j,k,nbl)/rho/c
		L_Eig(5,1) = 0.d0                           ;	R_Eig(5,1) = 0.5d0
													;	
		L_Eig(1,2) = -Jx_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,2) = Jx_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,2) = 0.d0                           ;	R_Eig(2,2) = 0.d0
		L_Eig(3,2) = -Jy_c(i,j,k,nbl)               ;	R_Eig(3,2) = -Jz_c(i,j,k,nbl)
		L_Eig(4,2) = -Jz_c(i,j,k,nbl)               ;	R_Eig(4,2) = Jy_c(i,j,k,nbl)
		L_Eig(5,2) = Jx_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,2) = 0.d0
													;	
		L_Eig(1,3) = -Jy_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,3) = Jz_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,3) = -Jz_c(i,j,k,nbl)               ;	R_Eig(2,3) = -Jy_c(i,j,k,nbl)
		L_Eig(3,3) = Jx_c(i,j,k,nbl)                ;	R_Eig(3,3) = Jx_c(i,j,k,nbl)
		L_Eig(4,3) = 0.d0                           ;	R_Eig(4,3) = 0.d0
		L_Eig(5,3) = Jy_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,3) = 0.d0
													;	
		L_Eig(1,4) = -Jz_c(i,j,k,nbl)*rho*c         ;	R_Eig(1,4) = -Jy_c(i,j,k,nbl)/c**2.d0
		L_Eig(2,4) = Jy_c(i,j,k,nbl)                ;	R_Eig(2,4) = -Jz_c(i,j,k,nbl)
		L_Eig(3,4) = 0.d0                           ;	R_Eig(3,4) = 0.d0
		L_Eig(4,4) = Jx_c(i,j,k,nbl)                ;	R_Eig(4,4) = Jx_c(i,j,k,nbl)
		L_Eig(5,4) = Jz_c(i,j,k,nbl)*rho*c          ;	R_Eig(5,4) = 0.d0
													;	
		L_Eig(1,5) = 1.d0                           ;	R_Eig(1,5) = 0.5d0/c**2.d0
		L_Eig(2,5) = -Jx_c(i,j,k,nbl)               ;	R_Eig(2,5) = 0.5d0*Jx_c(i,j,k,nbl)/rho/c
		L_Eig(3,5) = -Jz_c(i,j,k,nbl)               ;	R_Eig(3,5) = 0.5d0*Jy_c(i,j,k,nbl)/rho/c
		L_Eig(4,5) = Jy_c(i,j,k,nbl)                ;	R_Eig(4,5) = 0.5d0*Jz_c(i,j,k,nbl)/rho/c
		L_Eig(5,5) = 1.d0                           ;	R_Eig(5,5) = 0.5d0
		
		!------------------------------------------------------------------------------------------
			
		! Use these eigen vectors at i, to convert 'Qp' at all II = -2,3 to 'Wp'
		!$acc loop seq
		DO II=-2,3
			! Wp(II,:) = matmul(L_Eig,Qp(i,j+II,k,nbl,:))
			!$acc loop seq
			DO n_prim = 1,nprims
			Wp(II,n_prim) = L_Eig(n_prim,1)*Qp(i,j+II,k,nbl,1) +&
							L_Eig(n_prim,2)*Qp(i,j+II,k,nbl,2) +&
							L_Eig(n_prim,3)*Qp(i,j+II,k,nbl,3) +&
							L_Eig(n_prim,4)*Qp(i,j+II,k,nbl,4) +&
							L_Eig(n_prim,5)*Qp(i,j+II,k,nbl,5) 
			ENDDO
		ENDDO
		
		!========================================================
		!$acc loop seq
		DO n_prim = 1,nprims
		
			! Cell Right face
			! PHI_L = Wp(-2:2,n_prim)
			PHI_L(-2) = Wp(-2,n_prim)
			PHI_L(-1) = Wp(-1,n_prim)
			PHI_L(+0) = Wp(+0,n_prim)
			PHI_L(+1) = Wp(+1,n_prim)
			PHI_L(+2) = Wp(+2,n_prim)
			
			! Polynomials
			PHI_hat(1) = (2.d0*PHI_L(-2) -7.d0*PHI_L(-1) +11.d0*PHI_L(0))/6.d0
			PHI_hat(2) = (    -PHI_L(-1) +5.d0*PHI_L(0)  + 2.d0*PHI_L(1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L( 0) +5.d0*PHI_L(1)  -      PHI_L(2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(-2) -2.d0*PHI_L(-1) +PHI_L(0))**2 + 0.25d0*(     PHI_L(-2) -4.d0*PHI_L(-1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(-1) -2.d0*PHI_L( 0) +PHI_L(1))**2 + 0.25d0*(     PHI_L(-1) -     PHI_L(1))**2
			B(3) = 13.d0/12.d0*(PHI_L( 0) -2.d0*PHI_L( 1) +PHI_L(2))**2 + 0.25d0*(3.d0*PHI_L(0)  -4.d0*PHI_L(1) +PHI_L(2))**2
			
			! Non-linear weights
			w(1) = (1.d0/10.d0)/(eps+B(1))**2
			w(2) = (3.d0/5.d0) /(eps+B(2))**2
			w(3) = (3.d0/10.d0)/(eps+B(3))**2
			
			W_iph(n_prim) = (w(1)*PHI_hat(1)+w(2)*PHI_hat(2)+w(3)*PHI_hat(3))/(w(1)+w(2)+w(3))
		
			!==============================================================================================
			! Next cell Left face
			! PHI_L = Wp(-1:3,n_prim)
			PHI_L(-2) = Wp(-1,n_prim)
			PHI_L(-1) = Wp(+0,n_prim)
			PHI_L(+0) = Wp(+1,n_prim)
			PHI_L(+1) = Wp(+2,n_prim)
			PHI_L(+2) = Wp(+3,n_prim)
			
			PHI_hat(1) = (2.d0*PHI_L(2) -7.d0*PHI_L( 1) +11.d0*PHI_L( 0))/6.d0
			PHI_hat(2) = (    -PHI_L(1) +5.d0*PHI_L( 0) + 2.d0*PHI_L(-1))/6.d0
			PHI_hat(3) = (2.d0*PHI_L(0) +5.d0*PHI_L(-1) -      PHI_L(-2))/6.d0
			
			! Smoothness indicators
			B(1) = 13.d0/12.d0*(PHI_L(2) -2.d0*PHI_L( 1) +PHI_L( 0))**2 + 0.25d0*(PHI_L(2) -4.d0*PHI_L(1) +3.d0*PHI_L(0))**2
			B(2) = 13.d0/12.d0*(PHI_L(1) -2.d0*PHI_L( 0) +PHI_L(-1))**2 + 0.25d0*(PHI_L(1) - PHI_L(-1))**2
			B(3) = 13.d0/12.d0*(PHI_L(0) -2.d0*PHI_L(-1) +PHI_L(-2))**2 + 0.25d0*(3.d0*PHI_L(0) -4.d0*PHI_L(-1) +PHI_L(-2))**2
			
			! Non-linear weights
			w(1) = (1.d0/10.d0)/(eps+B(1))**2
			w(2) = (3.d0/5.d0) /(eps+B(2))**2
			w(3) = (3.d0/10.d0)/(eps+B(3))**2
			
			W_imh(n_prim) = (w(1)*PHI_hat(1)+w(2)*PHI_hat(2)+w(3)*PHI_hat(3))/(w(1)+w(2)+w(3))
			
		ENDDO
		
		!$acc loop seq
		DO n_prim = 1,nprims
		
			Qp_jphL(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_iph(1) + &
										R_Eig(n_prim,2)*W_iph(2) + &
										R_Eig(n_prim,3)*W_iph(3) + &
										R_Eig(n_prim,4)*W_iph(4) + &
										R_Eig(n_prim,5)*W_iph(5)
										
			Qp_jphR(i,j,k,nbl,n_prim) = R_Eig(n_prim,1)*W_imh(1) + &
										R_Eig(n_prim,2)*W_imh(2) + &
										R_Eig(n_prim,3)*W_imh(3) + &
										R_Eig(n_prim,4)*W_imh(4) + &
										R_Eig(n_prim,5)*W_imh(5)
		ENDDO
		
		! PHI_jphL(i,j,k,nbl,:) = matmul(R_Eig,W_iph)
		! PHI_jphR(i,j,k,nbl,:) = matmul(R_Eig,W_imh)
		
	endif
	ENDDO
	ENDDO
	ENDDO
	ENDDO
	
	
	!!$acc wait


END

Ok, this is probably it. The arrays are private to the loop on which the private clause appears but would be shared by any inner loop. (For gang private arrays, as an optimization, we try to put them in CUDA shared memory since it’s much faster to access).

To fix, you probably just need move where you put the private clause:

 	!$acc parallel loop gang collapse(3) default(present) 
	DO nbl = 1,nblocks
	!!$acc loop seq
	DO k = 1,NKmax
	!!$acc loop seq
	DO j = 1,NJmax
	!$acc loop vector private(L_Eig,R_Eig,Wp,W_iph,W_imh,n_prim,PHI_L,PHI_hat,B,w)
	DO i = 0,NImax
1 Like

Thanks Mat! It worked. My code is almost 15x faster now (with only a week of programming) and I still see some scope for optimization. I still have lots of questions about OpenACC. I will post them slowly. I am so grateful for the excellent support.

This topic was automatically closed 60 days after the last reply. New replies are no longer allowed.