Hi,
I am running a large OpenACC code (MAS) using 19.4 and some of my test runs are getting similar, but wrong answers. After much tracking down, I have narrowed it down to a single routine. I have sifted through the routine and cannot spot anything strange.
Below is the routine. The routine works correctly on the CPU using any compiler. I already ruled out the routine being called (filter_tp_hh).
Any ideas?
c#######################################################################
subroutine char_bc_0
c
c-----------------------------------------------------------------------
c
c ****** Get the boundary values at r=R0 using the characteristics.
c ****** This version solves the "parallel characteristics".
c
c-----------------------------------------------------------------------
c
use number_types
use globals
use mesh
use fields
use characteristics
use vars
use mpidefs
use emerging_flux_params
c
c-----------------------------------------------------------------------
c
implicit none
c
c-----------------------------------------------------------------------
c
real(r_typ), parameter :: one=1._r_typ
real(r_typ), parameter :: half=.5_r_typ
c
c-----------------------------------------------------------------------
c
c ****** Temporary storage for BC arrays.
c
real(r_typ), dimension(nt,np) :: brhat,bthat,bphat,vee
real(r_typ), dimension(nt,np) :: vpar_t,vpar_p
real(r_typ), dimension(2,ntm1,npm1) :: vee_mm
c
c-----------------------------------------------------------------------
c
integer :: i,j,k
real(r_typ) :: br,bt,bp
real(r_typ) :: brh,bth,bph
real(r_typ) :: bsq,bmag,bmag_i
real(r_typ) :: dir,fac
real(r_typ) :: ps,rhos,cs
real(r_typ) :: qp,qm
real(r_typ) :: dvdr,dvdt,dvdp
real(r_typ) :: dpdr,dpdt,dpdp
real(r_typ) :: dvds,dpds,vr,vt,vp,dveedt
c
integer :: ierr
integer :: ierr2
integer, save :: iseq=0
character(3) :: seq
character(16) :: fname
logical, parameter :: debug=.false.
c
c-----------------------------------------------------------------------
c
ierr=0
ierr2=0
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
if (rb0) then
c
!$acc enter data create(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
!$acc kernels loop collapse(2) default(present) async(1)
do k=1,np
do j=1,nt
vpar_t(j,k)=0.
vpar_p(j,k)=0.
vee(j,k)=0.
brhat(j,k)=0.
bthat(j,k)=0.
bphat(j,k)=0.
enddo
enddo
c
if (debug.and.iamp0) then
iseq=iseq+1
write (seq,'(i3.3)') iseq
fname='char'//seq//'.dat'
call ffopen (24,trim(fname),'rw',ierr2)
end if
if (debug) call check_error_on_p0 (ierr2)
c
if (debug.and.iamp0) then
write (24,'(100(a,a))')
& 'th',
& achar(9),'V',
& achar(9),'br',
& achar(9),'bt',
& achar(9),'brhat',
& achar(9),'bthat',
& achar(9),'p',
& achar(9),'rho',
& achar(9),'c',
& achar(9),'dV/ds',
& achar(9),'dp/ds',
& achar(9),'dV/dt'
end if
!$acc wait(1)
c
c ****** Get the parallel velocity.
c
!$acc kernels default(present) present(b,v)
!$acc loop
do k=1,npm1
!$acc loop
do j=1,ntm1
!$acc loop seq
do i=1,2
br=AVGR (b%r,i+1,j ,k )
bt=AVGT (b%t,i ,j+1,k )
bp=AVGP (b%p,i ,j ,k+1)
vr=AVGTP (v%r,i ,j+1,k+1)
vt=AVGRP (v%t,i+1,j ,k+1)
vp=AVGRT (v%p,i+1,j+1,k )
bsq=br**2+bt**2+bp**2
bmag=sqrt(bsq)
if (bmag.eq.0.) then
brh=one
bth=0.
bph=0.
else
bmag_i=one/bmag
brh=br*bmag_i
bth=bt*bmag_i
bph=bp*bmag_i
end if
vee_mm(i,j,k)=vr*brh+vt*bth+vp*bph
enddo
enddo
enddo
!$acc end kernels
c
c ****** Loop over all theta and phi points.
c
!$acc kernels default(present) present(b,v) copy(ierr)
!$acc loop
do k=2,npm1
!$acc loop
do j=2,ntm1
c
c ****** Get the direction of B.
c
br=AVGRTP(b%r,2,j,k)
bt=AVGP (b%t,1,j,k)
bp=AVGT (b%p,1,j,k)
bmag=sqrt(br**2+bt**2+bp**2)
if (bmag.eq.0.) then
brhat(j,k)=one
bthat(j,k)=0.
bphat(j,k)=0.
else
bmag_i=one/bmag
brhat(j,k)=br*bmag_i
bthat(j,k)=bt*bmag_i
bphat(j,k)=bp*bmag_i
end if
c
c ****** Project along B in the inward radial direction.
c
dir=sign(one,brhat(j,k))
c
c ****** Initialize the variables.
c
ps=AVGR(pres,2,j,k)
rhos=AVGR(rho,2,j,k)
cs=sqrt(gamma*ps/rhos)
c
vr=AVG (v%r,1,j,k)
vt=AVGRT(v%t,2,j,k)
vp=AVGRP(v%p,2,j,k)
c
vee(j,k)=brhat(j,k)*vr+bthat(j,k)*vt+bphat(j,k)*vp
c
c ****** Get derivatives for projecting along the characteristic.
c
qp=AVG(pres,2,j,k)
qm=AVG(pres,1,j,k)
dpdr=(qp-qm)*drh_i(2)
qp=AVGRT(pres,2,j+1,k)
qm=AVGRT(pres,2,j ,k)
dpdt=(qp-qm)*dth_i(j)*r_i(1)
qp=AVGRP(pres,2,j,k+1)
qm=AVGRP(pres,2,j,k )
dpdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
dpds=brhat(j,k)*dpdr+bthat(j,k)*dpdt+bphat(j,k)*dpdp
c
c ****** Get dV/ds.
c
qp=AVGTP (vee_mm,2,j ,k )
qm=AVGTP (vee_mm,1,j ,k )
dvdr=(qp-qm)*drh_i(2)
c
qp=AVGP (vee_mm,1,j ,k )
qm=AVGP (vee_mm,1,j-1,k )
dvdt=(qp-qm)*dth_i(j)*r_i(1)
c
qp=AVGT (vee_mm,1,j ,k )
qm=AVGT (vee_mm,1,j ,k-1)
dvdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
dvds=(brhat(j,k)*dvdr+bthat(j,k)*dvdt+bphat(j,k)*dvdp)
c
ccc dveedt=-dpds/rhos
ccc & -vee(j,k)*dvds
ccc & -g0*brhat(j,k)*r_i(1)**2
c
ccc *JL* Use characterisic formulation instead of momentum
ccc equation above.
c
fac=(dir*vee(j,k)-cs)
dveedt=fac*dir*dpds/(rhos*cs)
& -fac*dvds
& -g0*dir*brhat(j,k)*r_i(1)**2
& +2.*vee(j,k)*dir*cs/r(1)
c
if (debug.and.iamp0.and.k.eq.2) then
write (24,'(100(1pe12.5,a))')
& th(j),
& achar(9),vee(j,k),
& achar(9),br,
& achar(9),bt,
& achar(9),brhat(j,k),
& achar(9),bthat(j,k),
& achar(9),ps,
& achar(9),rhos,
& achar(9),cs,
& achar(9),dvds,
& achar(9),dpds,
& achar(9),dveedt
end if
c
ccc *JL* vee(j,k)=vee(j,k)+dveedt*dtime
ccc *JL* Need dir with use of characteristic equation above.
c
vee(j,k)=vee(j,k)+dir*dveedt*dtime
c
ccc if (eflux%active) then
ccc if (eflux_factor*eflux_vr(j,k).gt.eflux%vr_max)
ccc & vee(j,k)=0.
ccc end if
c
c ****** When UBZERO=.true., set the inward flow to zero if
c ****** it is negative.
c
if (ubzero.and.dir*vee(j,k).lt.0.) then
vee(j,k)=0.
end if
c
c ****** Check for supersonic inflow. If there is supersonic infow,
c ****** set IERR=1 and exit the (j,k) loops.
c
if (dir*vee(j,k).ge.cs) then
write (13,*)
if (expert_user_override%limit_supersonic_inflow) then
write (13,*) '### WARNING in CHAR_BC_0:'
else
write (13,*) '### ERROR in CHAR_BC_0:'
endif
write (13,*) '### Supersonic inflow at r=R0.'
write (13,*) 'NTIME = ',ntime
write (13,*) 'TIME = ',time
write (13,*) 'CS = ',cs
write (13,*) 'DIR*VEE = ',dir*vee(j,k)
write (13,*) 'theta = ',th(j)
write (13,*) 'phi = ',ph(k)
if (expert_user_override%limit_supersonic_inflow) then
vee(j,k)=cs*dir
else
!$acc atomic write
ierr=1
endif
end if
c
enddo
enddo
!$acc end kernels
c
end if
c
c ****** Check whether to terminate the code.
c ****** The rb0 conditional is seperated here
c ****** because this check needs all ranks to call it.
c
call check_error_on_any_proc (ierr)
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
if (rb0) then
c
c ****** Filter the parallel velocity.
c
do i=1,nfiltub
call filter_tp_hh (vee)
enddo
c
c ****** If we are using the parallel flow model, save the BC
c ****** on the parallel velocity.
c
if (freeze_b) then
vb%r0%par(:,:)=0.
do k=2,npm1
do j=2,ntm1
vb%r0%par(j,k)=vee(j,k)
enddo
enddo
end if
c
c ****** Set the BC velocity to be parallel to B. This is done
c ****** even when using the parallel flow model because we still
c ****** set BCs on the vector velocity.
c
!$acc kernels loop collapse(2) default(present) present(vb)
do k=2,npm1
do j=2,ntm1
vb%r0%r(j,k)=vee(j,k)*brhat(j,k)
vpar_t (j,k)=vee(j,k)*bthat(j,k)
vpar_p (j,k)=vee(j,k)*bphat(j,k)
enddo
enddo
!$acc end kernels
c
c ****** Seam the vt and vp BC arrays.
c
call seam_2d_tp_acc (vpar_t,nt,np,.true.,.false.)
call seam_2d_tp_acc (vpar_p,nt,np,.false.,.true.)
c
c ****** Average the theta and phi components to the
c ****** appropriate mesh.
c
!$acc parallel default(present) present(vb)
!$acc loop collapse(2)
do k=2,npm1
do j=2,ntm-1
vb%r0%t(j,k)=half*(vpar_t(j,k)+vpar_t(j+1,k))
enddo
enddo
c
!$acc loop collapse(2)
do k=2,npm-1
do j=2,ntm1
vb%r0%p(j,k)=half*(vpar_p(j,k)+vpar_p(j,k+1))
enddo
enddo
!$acc end parallel
c
if (debug.and.iamp0) then
close (24)
end if
c
!$acc exit data delete(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
end if
c
return
end
c#######################################################################