Code getting wrong answers in 19.04 - worked in 18.10

Hi,

I am running a large OpenACC code (MAS) using 19.4 and some of my test runs are getting similar, but wrong answers. After much tracking down, I have narrowed it down to a single routine. I have sifted through the routine and cannot spot anything strange.

Below is the routine. The routine works correctly on the CPU using any compiler. I already ruled out the routine being called (filter_tp_hh).

Any ideas?

c#######################################################################
      subroutine char_bc_0
c
c-----------------------------------------------------------------------
c
c ****** Get the boundary values at r=R0 using the characteristics.
c ****** This version solves the "parallel characteristics".
c
c-----------------------------------------------------------------------
c
      use number_types
      use globals
      use mesh
      use fields
      use characteristics
      use vars
      use mpidefs
      use emerging_flux_params
c
c-----------------------------------------------------------------------
c
      implicit none
c
c-----------------------------------------------------------------------
c
      real(r_typ), parameter :: one=1._r_typ
      real(r_typ), parameter :: half=.5_r_typ
c
c-----------------------------------------------------------------------
c
c ****** Temporary storage for BC arrays.
c
      real(r_typ), dimension(nt,np) :: brhat,bthat,bphat,vee
      real(r_typ), dimension(nt,np) :: vpar_t,vpar_p
      real(r_typ), dimension(2,ntm1,npm1) :: vee_mm
c
c-----------------------------------------------------------------------
c
      integer :: i,j,k
      real(r_typ) :: br,bt,bp
      real(r_typ) :: brh,bth,bph
      real(r_typ) :: bsq,bmag,bmag_i
      real(r_typ) :: dir,fac
      real(r_typ) :: ps,rhos,cs
      real(r_typ) :: qp,qm
      real(r_typ) :: dvdr,dvdt,dvdp
      real(r_typ) :: dpdr,dpdt,dpdp
      real(r_typ) :: dvds,dpds,vr,vt,vp,dveedt
c
      integer :: ierr
      integer :: ierr2
      integer, save :: iseq=0
      character(3) :: seq
      character(16) :: fname
      logical, parameter :: debug=.false.
c
c-----------------------------------------------------------------------
c
      ierr=0
      ierr2=0
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
      if (rb0) then
c
!$acc enter data create(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
!$acc kernels loop collapse(2) default(present) async(1)
      do k=1,np
        do j=1,nt
          vpar_t(j,k)=0.
          vpar_p(j,k)=0.
          vee(j,k)=0.
          brhat(j,k)=0.
          bthat(j,k)=0.
          bphat(j,k)=0.
        enddo
      enddo
c
      if (debug.and.iamp0) then
        iseq=iseq+1
        write (seq,'(i3.3)') iseq
        fname='char'//seq//'.dat'
        call ffopen (24,trim(fname),'rw',ierr2)
      end if
      if (debug) call check_error_on_p0 (ierr2)
c
      if (debug.and.iamp0) then
        write (24,'(100(a,a))')
     &                        'th',
     &               achar(9),'V',
     &               achar(9),'br',
     &               achar(9),'bt',
     &               achar(9),'brhat',
     &               achar(9),'bthat',
     &               achar(9),'p',
     &               achar(9),'rho',
     &               achar(9),'c',
     &               achar(9),'dV/ds',
     &               achar(9),'dp/ds',
     &               achar(9),'dV/dt'
      end if
!$acc wait(1)
c
c ****** Get the parallel velocity.
c
!$acc kernels default(present) present(b,v)
!$acc loop
      do k=1,npm1
!$acc loop
        do j=1,ntm1
!$acc loop seq
          do i=1,2
            br=AVGR  (b%r,i+1,j  ,k  )
            bt=AVGT  (b%t,i  ,j+1,k  )
            bp=AVGP  (b%p,i  ,j  ,k+1)
            vr=AVGTP (v%r,i  ,j+1,k+1)
            vt=AVGRP (v%t,i+1,j  ,k+1)
            vp=AVGRT (v%p,i+1,j+1,k  )
            bsq=br**2+bt**2+bp**2
            bmag=sqrt(bsq)
            if (bmag.eq.0.) then
              brh=one
              bth=0.
              bph=0.
            else
              bmag_i=one/bmag
              brh=br*bmag_i
              bth=bt*bmag_i
              bph=bp*bmag_i
            end if
            vee_mm(i,j,k)=vr*brh+vt*bth+vp*bph
          enddo
        enddo
      enddo
!$acc end kernels
c
c ****** Loop over all theta and phi points.
c
!$acc kernels default(present) present(b,v) copy(ierr)
!$acc loop
      do k=2,npm1
!$acc loop
        do j=2,ntm1
c
c ****** Get the direction of B.
c
          br=AVGRTP(b%r,2,j,k)
          bt=AVGP  (b%t,1,j,k)
          bp=AVGT  (b%p,1,j,k)
          bmag=sqrt(br**2+bt**2+bp**2)
          if (bmag.eq.0.) then
            brhat(j,k)=one
            bthat(j,k)=0.
            bphat(j,k)=0.
          else
            bmag_i=one/bmag
            brhat(j,k)=br*bmag_i
            bthat(j,k)=bt*bmag_i
            bphat(j,k)=bp*bmag_i
          end if
c
c ****** Project along B in the inward radial direction.
c
          dir=sign(one,brhat(j,k))
c
c ****** Initialize the variables.
c
          ps=AVGR(pres,2,j,k)
          rhos=AVGR(rho,2,j,k)
          cs=sqrt(gamma*ps/rhos)
c
          vr=AVG  (v%r,1,j,k)
          vt=AVGRT(v%t,2,j,k)
          vp=AVGRP(v%p,2,j,k)
c
          vee(j,k)=brhat(j,k)*vr+bthat(j,k)*vt+bphat(j,k)*vp
c
c ****** Get derivatives for projecting along the characteristic.
c
          qp=AVG(pres,2,j,k)
          qm=AVG(pres,1,j,k)
          dpdr=(qp-qm)*drh_i(2)
          qp=AVGRT(pres,2,j+1,k)
          qm=AVGRT(pres,2,j  ,k)
          dpdt=(qp-qm)*dth_i(j)*r_i(1)
          qp=AVGRP(pres,2,j,k+1)
          qm=AVGRP(pres,2,j,k  )
          dpdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
          dpds=brhat(j,k)*dpdr+bthat(j,k)*dpdt+bphat(j,k)*dpdp
c
c ****** Get dV/ds.
c
          qp=AVGTP (vee_mm,2,j  ,k  )
          qm=AVGTP (vee_mm,1,j  ,k  )
          dvdr=(qp-qm)*drh_i(2)
c
          qp=AVGP  (vee_mm,1,j  ,k  )
          qm=AVGP  (vee_mm,1,j-1,k  )
          dvdt=(qp-qm)*dth_i(j)*r_i(1)
c
          qp=AVGT  (vee_mm,1,j  ,k  )
          qm=AVGT  (vee_mm,1,j  ,k-1)
          dvdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
          dvds=(brhat(j,k)*dvdr+bthat(j,k)*dvdt+bphat(j,k)*dvdp)
c
ccc          dveedt=-dpds/rhos
ccc     &           -vee(j,k)*dvds
ccc     &           -g0*brhat(j,k)*r_i(1)**2
c
ccc *JL* Use characterisic formulation instead of momentum
ccc      equation above.
c
          fac=(dir*vee(j,k)-cs)
          dveedt=fac*dir*dpds/(rhos*cs)
     &           -fac*dvds
     &           -g0*dir*brhat(j,k)*r_i(1)**2
     &       +2.*vee(j,k)*dir*cs/r(1)
c
          if (debug.and.iamp0.and.k.eq.2) then
            write (24,'(100(1pe12.5,a))')
     &                            th(j),
     &                   achar(9),vee(j,k),
     &                   achar(9),br,
     &                   achar(9),bt,
     &                   achar(9),brhat(j,k),
     &                   achar(9),bthat(j,k),
     &                   achar(9),ps,
     &                   achar(9),rhos,
     &                   achar(9),cs,
     &                   achar(9),dvds,
     &                   achar(9),dpds,
     &                   achar(9),dveedt
          end if
c
ccc *JL*          vee(j,k)=vee(j,k)+dveedt*dtime
ccc *JL* Need dir with use of characteristic equation above.
c
          vee(j,k)=vee(j,k)+dir*dveedt*dtime
c
ccc          if (eflux%active) then
ccc            if (eflux_factor*eflux_vr(j,k).gt.eflux%vr_max)
ccc     &        vee(j,k)=0.
ccc          end if
c
c ****** When UBZERO=.true., set the inward flow to zero if
c ****** it is negative.
c
          if (ubzero.and.dir*vee(j,k).lt.0.) then
            vee(j,k)=0.
          end if
c
c ****** Check for supersonic inflow.  If there is supersonic infow,
c ****** set IERR=1 and exit the (j,k) loops.
c
          if (dir*vee(j,k).ge.cs) then
            write (13,*)
            if (expert_user_override%limit_supersonic_inflow) then
              write (13,*) '### WARNING in CHAR_BC_0:'
            else
              write (13,*) '### ERROR in CHAR_BC_0:'
            endif
            write (13,*) '### Supersonic inflow at r=R0.'
            write (13,*) 'NTIME  = ',ntime
            write (13,*) 'TIME  = ',time
            write (13,*) 'CS  = ',cs
            write (13,*) 'DIR*VEE = ',dir*vee(j,k)
            write (13,*) 'theta = ',th(j)
            write (13,*) 'phi = ',ph(k)
            if (expert_user_override%limit_supersonic_inflow) then
              vee(j,k)=cs*dir
            else
!$acc atomic write
              ierr=1
            endif
          end if
c
        enddo
      enddo
!$acc end kernels
c
      end if
c
c ****** Check whether to terminate the code.
c ****** The rb0 conditional is seperated here
c ****** because this check needs all ranks to call it.
c
      call check_error_on_any_proc (ierr)
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
      if (rb0) then
c
c ****** Filter the parallel velocity.
c
      do i=1,nfiltub
        call filter_tp_hh (vee)
      enddo
c
c ****** If we are using the parallel flow model, save the BC
c ****** on the parallel velocity.
c
      if (freeze_b) then
        vb%r0%par(:,:)=0.
        do k=2,npm1
          do j=2,ntm1
            vb%r0%par(j,k)=vee(j,k)
          enddo
        enddo
      end if
c
c ****** Set the BC velocity to be parallel to B.  This is done
c ****** even when using the parallel flow model because we still
c ****** set BCs on the vector velocity.
c
!$acc kernels loop collapse(2) default(present) present(vb)
      do k=2,npm1
        do j=2,ntm1
          vb%r0%r(j,k)=vee(j,k)*brhat(j,k)
          vpar_t (j,k)=vee(j,k)*bthat(j,k)
          vpar_p (j,k)=vee(j,k)*bphat(j,k)
        enddo
      enddo
!$acc end kernels
c
c ****** Seam the vt and vp BC arrays.
c
      call seam_2d_tp_acc (vpar_t,nt,np,.true.,.false.)
      call seam_2d_tp_acc (vpar_p,nt,np,.false.,.true.)
c
c ****** Average the theta and phi components to the
c ****** appropriate mesh.
c
!$acc parallel default(present) present(vb)
!$acc loop collapse(2)
      do k=2,npm1
        do j=2,ntm-1
          vb%r0%t(j,k)=half*(vpar_t(j,k)+vpar_t(j+1,k))
        enddo
      enddo
c
!$acc loop collapse(2)
      do k=2,npm-1
        do j=2,ntm1
          vb%r0%p(j,k)=half*(vpar_p(j,k)+vpar_p(j,k+1))
        enddo
      enddo
!$acc end parallel
c
      if (debug.and.iamp0) then
        close (24)
      end if
c
!$acc exit data delete(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
      end if
c
      return
      end
c#######################################################################

Hi,

Just realized that code I sent had macros in it.

Here is the code expanded out and some things commented out that do not effect the problem.

c#######################################################################
      subroutine char_bc_0
c
c-----------------------------------------------------------------------
c
c ****** Get the boundary values at r=R0 using the characteristics.
c ****** This version solves the "parallel characteristics".
c
c-----------------------------------------------------------------------
c
      use number_types
      use globals
      use mesh
      use fields
      use characteristics
      use vars
      use mpidefs
      use emerging_flux_params
c
c-----------------------------------------------------------------------
c
      implicit none
c
c-----------------------------------------------------------------------
c
      real(r_typ), parameter :: one=1._r_typ
      real(r_typ), parameter :: half=.5_r_typ
c
c-----------------------------------------------------------------------
c
c ****** Temporary storage for BC arrays.
c
      real(r_typ), dimension(nt,np) :: brhat,bthat,bphat,vee
      real(r_typ), dimension(nt,np) :: vpar_t,vpar_p
      real(r_typ), dimension(2,ntm1,npm1) :: vee_mm
c
c-----------------------------------------------------------------------
c
      integer :: i,j,k
      real(r_typ) :: br,bt,bp
      real(r_typ) :: brh,bth,bph
      real(r_typ) :: bsq,bmag,bmag_i
      real(r_typ) :: dir,fac
      real(r_typ) :: ps,rhos,cs
      real(r_typ) :: qp,qm
      real(r_typ) :: dvdr,dvdt,dvdp
      real(r_typ) :: dpdr,dpdt,dpdp
      real(r_typ) :: dvds,dpds,vr,vt,vp,dveedt
c
      integer :: ierr
      integer :: ierr2
      integer, save :: iseq=0
      character(3) :: seq
      character(16) :: fname
      logical, parameter :: debug=.false.
c
c-----------------------------------------------------------------------
c
      ierr=0
      ierr2=0
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
      if (rb0) then
c
!$acc enter data create(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
!$acc kernels loop collapse(2) default(present) async(1)
      do k=1,np
        do j=1,nt
          vpar_t(j,k)=0.
          vpar_p(j,k)=0.
          vee(j,k)=0.
          brhat(j,k)=0.
          bthat(j,k)=0.
          bphat(j,k)=0.
        enddo
      enddo
c
c      if (debug.and.iamp0) then
c        iseq=iseq+1
c        write (seq,'(i3.3)') iseq
c        fname='char'//seq//'.dat'
c        call ffopen (24,trim(fname),'rw',ierr2)
c      end if
c      if (debug) call check_error_on_p0 (ierr2)
c
c      if (debug.and.iamp0) then
c        write (24,'(100(a,a))')
c     &                        'th',
c     &               achar(9),'V',
c     &               achar(9),'br',
c     &               achar(9),'bt',
c     &               achar(9),'brhat',
c     &               achar(9),'bthat',
c     &               achar(9),'p',
c     &               achar(9),'rho',
c     &               achar(9),'c',
c     &               achar(9),'dV/ds',
c     &               achar(9),'dp/ds',
c     &               achar(9),'dV/dt'
c      end if
!$acc wait(1)
c
c ****** Get the parallel velocity.
c
!$acc kernels default(present) present(b,v)
!$acc loop
      do k=1,npm1
!$acc loop
        do j=1,ntm1
!$acc loop seq
          do i=1,2
            br=(.5_r_typ*(b%r(i+1,j,k)+b%r(i+1-1,j,k)))
            bt=(.5_r_typ*(b%t(i,j+1,k)+b%t(i,j+1-1,k)))
            bp=(.5_r_typ*(b%p(i,j,k+1)+b%p(i,j,k+1-1)))
            vr=(.25_r_typ*(v%r(i,j+1,k+1)+v%r(i,j+1-1,k+1)+v%r(i,j+1,k+1
     &-1)+v%r(i,j+1-1,k+1-1)))
            vt=(.25_r_typ*(v%t(i+1,j,k+1)+v%t(i+1-1,j,k+1)+v%t(i+1,j,k+1
     &-1)+v%t(i+1-1,j,k+1-1)))
            vp=(.25_r_typ*(v%p(i+1,j+1,k)+v%p(i+1-1,j+1,k)+v%p(i+1,j+1-1
     &,k)+v%p(i+1-1,j+1-1,k)))
            bsq=br**2+bt**2+bp**2
            bmag=sqrt(bsq)
            if (bmag.eq.0.) then
              brh=one
              bth=0.
              bph=0.
            else
              bmag_i=one/bmag
              brh=br*bmag_i
              bth=bt*bmag_i
              bph=bp*bmag_i
            end if
            vee_mm(i,j,k)=vr*brh+vt*bth+vp*bph
          enddo
        enddo
      enddo
!$acc end kernels
c
c ****** Loop over all theta and phi points.
c
!$acc kernels default(present) present(b,v) copy(ierr)
!$acc loop
      do k=2,npm1
!$acc loop
        do j=2,ntm1
c
c ****** Get the direction of B.
c
          br=(.125_r_typ*(b%r(2,j,k)+b%r(2-1,j,k)+b%r(2,j-1,k)+b%r(2,j,k
     &-1)+b%r(2-1,j-1,k)+b%r(2-1,j,k-1)+b%r(2,j-1,k-1)+b%r(2-1,j-1,k-1))
     &)
          bt=(.5_r_typ*(b%t(1,j,k)+b%t(1,j,k-1)))
          bp=(.5_r_typ*(b%p(1,j,k)+b%p(1,j-1,k)))
          bmag=sqrt(br**2+bt**2+bp**2)
          if (bmag.eq.0.) then
            brhat(j,k)=one
            bthat(j,k)=0.
            bphat(j,k)=0.
          else
            bmag_i=one/bmag
            brhat(j,k)=br*bmag_i
            bthat(j,k)=bt*bmag_i
            bphat(j,k)=bp*bmag_i
          end if
c
c ****** Project along B in the inward radial direction.
c
          dir=sign(one,brhat(j,k))
c
c ****** Initialize the variables.
c
          ps=(.5_r_typ*(pres(2,j,k)+pres(2-1,j,k)))
          rhos=(.5_r_typ*(rho(2,j,k)+rho(2-1,j,k)))
          cs=sqrt(gamma*ps/rhos)
c
          vr=v%r(1,j,k)
          vt=(.25_r_typ*(v%t(2,j,k)+v%t(2-1,j,k)+v%t(2,j-1,k)+v%t(2-1,j-
     &1,k)))
          vp=(.25_r_typ*(v%p(2,j,k)+v%p(2-1,j,k)+v%p(2,j,k-1)+v%p(2-1,j,
     &k-1)))
c
          vee(j,k)=brhat(j,k)*vr+bthat(j,k)*vt+bphat(j,k)*vp
c
c ****** Get derivatives for projecting along the characteristic.
c
          qp=pres(2,j,k)
          qm=pres(1,j,k)
          dpdr=(qp-qm)*drh_i(2)
          qp=(.25_r_typ*(pres(2,j+1,k)+pres(2-1,j+1,k)+pres(2,j+1-1,k)+p
     &res(2-1,j+1-1,k)))
          qm=(.25_r_typ*(pres(2,j,k)+pres(2-1,j,k)+pres(2,j-1,k)+pres(2-
     &1,j-1,k)))
          dpdt=(qp-qm)*dth_i(j)*r_i(1)
          qp=(.25_r_typ*(pres(2,j,k+1)+pres(2-1,j,k+1)+pres(2,j,k+1-1)+p
     &res(2-1,j,k+1-1)))
          qm=(.25_r_typ*(pres(2,j,k)+pres(2-1,j,k)+pres(2,j,k-1)+pres(2-
     &1,j,k-1)))
          dpdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
          dpds=brhat(j,k)*dpdr+bthat(j,k)*dpdt+bphat(j,k)*dpdp
c
c ****** Get dV/ds.
c
          qp=(.25_r_typ*(vee_mm(2,j,k)+vee_mm(2,j-1,k)+vee_mm(2,j,k-1)+v
     &ee_mm(2,j-1,k-1)))
          qm=(.25_r_typ*(vee_mm(1,j,k)+vee_mm(1,j-1,k)+vee_mm(1,j,k-1)+v
     &ee_mm(1,j-1,k-1)))
          dvdr=(qp-qm)*drh_i(2)
c
          qp=(.5_r_typ*(vee_mm(1,j,k)+vee_mm(1,j,k-1)))
          qm=(.5_r_typ*(vee_mm(1,j-1,k)+vee_mm(1,j-1,k-1)))
          dvdt=(qp-qm)*dth_i(j)*r_i(1)
c
          qp=(.5_r_typ*(vee_mm(1,j,k)+vee_mm(1,j-1,k)))
          qm=(.5_r_typ*(vee_mm(1,j,k-1)+vee_mm(1,j-1,k-1)))
          dvdp=(qp-qm)*dp_mult*dph_i(k)*r_i(1)*sth_i(j)
c
          dvds=(brhat(j,k)*dvdr+bthat(j,k)*dvdt+bphat(j,k)*dvdp)
c
ccc          dveedt=-dpds/rhos
ccc     &           -vee(j,k)*dvds
ccc     &           -g0*brhat(j,k)*r_i(1)**2
c
ccc *JL* Use characterisic formulation instead of momentum
ccc      equation above.
c
          fac=(dir*vee(j,k)-cs)
          dveedt=fac*dir*dpds/(rhos*cs)           -fac*dvds           -g
     &0*dir*brhat(j,k)*r_i(1)**2       +2.*vee(j,k)*dir*cs/r(1)
c
c          if (debug.and.iamp0.and.k.eq.2) then
c            write (24,'(100(1pe12.5,a))')
c     &                            th(j),
c     &                   achar(9),vee(j,k),
c     &                   achar(9),br,
c     &                   achar(9),bt,
c     &                   achar(9),brhat(j,k),
c     &                   achar(9),bthat(j,k),
c     &                   achar(9),ps,
c     &                   achar(9),rhos,
c     &                   achar(9),cs,
c     &                   achar(9),dvds,
c     &                   achar(9),dpds,
c     &                   achar(9),dveedt
c          end if
c
ccc *JL*          vee(j,k)=vee(j,k)+dveedt*dtime
ccc *JL* Need dir with use of characteristic equation above.
c
          vee(j,k)=vee(j,k)+dir*dveedt*dtime
c
ccc          if (eflux%active) then
ccc            if (eflux_factor*eflux_vr(j,k).gt.eflux%vr_max)
ccc     &        vee(j,k)=0.
ccc          end if
c
c ****** When UBZERO=.true., set the inward flow to zero if
c ****** it is negative.
c
          if (ubzero.and.dir*vee(j,k).lt.0.) then
            vee(j,k)=0.
          end if
c
c ****** Check for supersonic inflow.  If there is supersonic infow,
c ****** set IERR=1 and exit the (j,k) loops.
c
          if (dir*vee(j,k).ge.cs) then
c            write (13,*)
c            if (expert_user_override%limit_supersonic_inflow) then
c              write (13,*) '### WARNING in CHAR_BC_0:'
c            else
c              write (13,*) '### ERROR in CHAR_BC_0:'
c            endif
c            write (13,*) '### Supersonic inflow at r=R0.'
c            write (13,*) 'NTIME  = ',ntime
c            write (13,*) 'TIME  = ',time
c            write (13,*) 'CS  = ',cs
c            write (13,*) 'DIR*VEE = ',dir*vee(j,k)
c            write (13,*) 'theta = ',th(j)
c            write (13,*) 'phi = ',ph(k)
            if (expert_user_override%limit_supersonic_inflow) then
              vee(j,k)=cs*dir
            else
!$acc atomic write
              ierr=1
            endif
          end if
c
        enddo
      enddo
!$acc end kernels
c
      end if
c
c ****** Check whether to terminate the code.
c ****** The rb0 conditional is seperated here
c ****** because this check needs all ranks to call it.
c
      call check_error_on_any_proc (ierr)
c
c ****** The boundary velocity is determined only on processors
c ****** that include the lower radial boundary.
c
      if (rb0) then
c
c ****** Filter the parallel velocity.
c
      do i=1,nfiltub
        call filter_tp_hh (vee)
      enddo
c
c ****** If we are using the parallel flow model, save the BC
c ****** on the parallel velocity.
c
      if (freeze_b) then
        vb%r0%par(:,:)=0.
        do k=2,npm1
          do j=2,ntm1
            vb%r0%par(j,k)=vee(j,k)
          enddo
        enddo
      end if
c
c ****** Set the BC velocity to be parallel to B.  This is done
c ****** even when using the parallel flow model because we still
c ****** set BCs on the vector velocity.
c
!$acc kernels loop collapse(2) default(present) present(vb)
      do k=2,npm1
        do j=2,ntm1
          vb%r0%r(j,k)=vee(j,k)*brhat(j,k)
          vpar_t (j,k)=vee(j,k)*bthat(j,k)
          vpar_p (j,k)=vee(j,k)*bphat(j,k)
        enddo
      enddo
!$acc end kernels
c
c ****** Seam the vt and vp BC arrays.
c
      call seam_2d_tp_acc (vpar_t,nt,np,.true.,.false.)
      call seam_2d_tp_acc (vpar_p,nt,np,.false.,.true.)
c
c ****** Average the theta and phi components to the
c ****** appropriate mesh.
c
!$acc parallel default(present) present(vb)
!$acc loop collapse(2)
      do k=2,npm1
        do j=2,ntm-1
          vb%r0%t(j,k)=half*(vpar_t(j,k)+vpar_t(j+1,k))
        enddo
      enddo
c
!$acc loop collapse(2)
      do k=2,npm-1
        do j=2,ntm1
          vb%r0%p(j,k)=half*(vpar_p(j,k)+vpar_p(j,k+1))
        enddo
      enddo
!$acc end parallel
c
c      if (debug.and.iamp0) then
c        close (24)
c      end if
c
!$acc exit data delete(brhat,bthat,bphat,vee,vpar_t,vpar_p,vee_mm)
      end if
c
      return
      end

Hmm, sorry but I can’t really tell much from by just looking a the source. Are you able to send me the full source so I can reproduce the wrong answers?

You can try reducing the optimization level, adding “-Mnofma” to disable FMA instructions, or add “-Kieee” to have the compiler conform to strictly to IEEE 754.

-Mat

Hi,

Just an update that this bug still exists in version 19.7 :(

I have previously sent a reproducer, so I am hoping this is fixed before the next community edition, as I am stuck officially only supporting 18.10 for our code for now.

  • Ron

Hi,

I think I finally tracked down this bug.

The code that works with 18.10 but gives wrong answers with 19.x has the following loop:

!$acc kernels default(present) present(b,v)
!$acc loop
      do k=1,npm1
!$acc loop
        do j=1,ntm1
!$acc loop seq
          do i=1,2
            br=AVGR  (b%r,i+1,j  ,k  )
            bt=AVGT  (b%t,i  ,j+1,k  )
            bp=AVGP  (b%p,i  ,j  ,k+1)
            vr=AVGTP (v%r,i  ,j+1,k+1)
            vt=AVGRP (v%t,i+1,j  ,k+1)
            vp=AVGRT (v%p,i+1,j+1,k  )
            bsq=br**2+bt**2+bp**2
            bmag=sqrt(bsq)
            if (bmag.eq.0.) then
              brh=one
              bth=0.
              bph=0.
            else
              bmag_i=one/bmag
              brh=br*bmag_i
              bth=bt*bmag_i
              bph=bp*bmag_i
            end if
            vee_mm(i,j,k)=vr*brh+vt*bth+vp*bph
          enddo
        enddo
      enddo
!$acc end kernels

Note the inner-loop is set to sequential since it only iterates over two indicies.

If I change the loop to collapse the 2-iter loop as follows:

!$acc kernels default(present) present(b,v)
!$acc loop
      do k=1,npm1
!$acc loop collapse(2)
        do j=1,ntm1
          do i=1,2
            br=AVGR  (b%r,i+1,j  ,k  )
            bt=AVGT  (b%t,i  ,j+1,k  )
            bp=AVGP  (b%p,i  ,j  ,k+1)
            vr=AVGTP (v%r,i  ,j+1,k+1)
            vt=AVGRP (v%t,i+1,j  ,k+1)
            vp=AVGRT (v%p,i+1,j+1,k  )
            bsq=br**2+bt**2+bp**2
            bmag=sqrt(bsq)
            if (bmag.eq.0.) then
              brh=one
              bth=0.
              bph=0.
            else
              bmag_i=one/bmag
              brh=br*bmag_i
              bth=bt*bmag_i
              bph=bp*bmag_i
            end if
            vee_mm(i,j,k)=vr*brh+vt*bth+vp*bph
          enddo
        enddo
      enddo
!$acc end kernels

Then the code produces the correct answer when using PGI 19.x!

I do not know if this a problem with what I was doing to this loop that version 18.10 was smart to avoid, or if there is a bug in version 19.x.

At least I have a work-around for now!

  • Ron

Hi Ron,

Glad you were able to find a work around. My best guess is that there’s something wrong with the translated indexing and by collapsing, it changes it enough that the compiler gets it correct. But I can’t really tell without a reproducing example.

-Mat

Hi,

I sent out a reproducer a while back.

You should be able to run it as-is to get the error and then modify it as my post to see the workaround.

  • Ron