NVVP timeline doesn't show compute information

Hi. I’m trying to see what is happening on the GPU.

My code is a bit slow. I presume that the many atomic operations are slowing it down, but I want to make sure.

This is what I get from NVVP.

And my code is written below.

I know this code is quite complicated, but I don’t think the complexity of the code is the reason for NVVP not providing information about compute occupancy.

How can I see ‘inside’ the cuStreamSynchronize?

#include <defines.h>
SUBROUTINE RayTrace_GPU(RayInfo, CoreInfo, phis, PhiAngIn, xst, src, jout, iz, mygb, myge, ljout)
! OpenACC-offloaded ray sweep over the rays listed in GPUControl(1)%RayList.
! NOTE(review): the physical reading (flux, current, ...) is inferred from names
! and the MOC_MOD module name only; the comments below describe what the code does.
!
! Overall flow, as visible in this routine:
!   1. Move xst, src and PhiAngIn to the device; create phis and Jout there and
!      zero phis (Jout is zeroed only when ljout is true).
!   2. One gang iteration per entry of GPUControl(1)%RayList:
!        a. "Tracking": flatten the ray -> core ray -> assembly ray -> cell ray ->
!           segment hierarchy into per-core-ray lists (FsrIdx, OptLenList,
!           ExpAppIdx, ExpApp, PinIdx, SurfIdx, CellRayIdxSt).
!        b. Sweep: for irot = 1, 2 walk the core rays (forward for irot = 1,
!           reversed for irot = 2), update PhiAngOut segment by segment and
!           accumulate into phis (and into Jout when ljout is true) with atomics.
!           The last boundary value is written back to PhiAngIn(:, :, PhiAngOutSvIdx).
!   3. Post-process: phis = phis / (xst * vol) + src for every region of every pin.
!   4. Copy phis, Jout and PhiAngIn back to the host; delete xst and src on the device.
!
! Arguments:
!   RayInfo, CoreInfo : ray / core description; must already be present on the
!                       device (see the ACC DECLARE PRESENT below).
!   phis(ig, ireg)    : zeroed, accumulated and post-processed here for ig in
!                       mygb:myge; copied out at the end.
!   PhiAngIn(:, :, :) : read at index PhiAngInSvIdx and overwritten at index
!                       PhiAngOutSvIdx (third dimension); copied in and out.
!   xst, src          : read-only in this routine; copied in, deleted at exit.
!   jout(ig, :, :, :) : accumulated only when ljout is true; copied out at the end.
!   iz                : index used in CoreInfo%Pin(ipin)%Cell(iz).
!   mygb, myge        : first / last value of the ig index processed.
!   ljout             : enables zeroing of and accumulation into jout.
USE PARAM
USE TYPEDEF, ONLY : RayInfo_Type, Coreinfo_type
USE MOC_MOD, ONLY : nMaxRaySeg,     nMaxCellRay,    nMaxAsyRay,     nMaxCoreRay,    &
                    EXPAPolar,      EXPBPolar,      wtangP0
USE PE_MOD,  ONLY : PE, GPUControl
IMPLICIT NONE
TYPE(RayInfo_Type) :: RayInfo
TYPE(CoreInfo_Type) :: CoreInfo
! These objects are asserted to be on the device already; this routine does not
! transfer them.
!$ACC DECLARE PRESENT(RayInfo, CoreInfo, GPUControl, EXPAPolar, EXPBPolar, wtangP0)
REAL(8), POINTER :: phis(:, :), PhiAngIn(:, :, :), xst(:, :), src(:, :), jout(:, :, :, :)
INTEGER :: iz, mygb, myge
LOGICAL :: ljout

INTEGER :: iRay
INTEGER :: i, j, k, l, m, jbeg, jend, jinc, irw, irw1, irv, ig

! wt     : per-ipol weights copied from wtangP0(:, iazi) for the current core ray
! phiobd : boundary value carried from one core ray to the next during the sweep
! phia   : per-ig accumulator for one segment; column irot keeps the two irot
!          iterations separate
! NOTE(review): wttemp and m are declared but not referenced in this routine;
! nAziAngle and nPhiAngSv are assigned below but never read.
REAL(8) :: wttemp, wt(RayInfo%nPolarAngle), tau
REAL(8) :: phiobd(Rayinfo%nPolarAngle, mygb : myge), phia(mygb : myge, 2), phid, phiocel1, phiocel2
INTEGER :: nPolarAngle, nAziAngle, nPhiAngSv
INTEGER :: iazi, ipol, PhiAnginSvIdx, PhiAngOutSvIdx
INTEGER :: nCoreRay, nAsyRay, nPinRay, nRaySeg
INTEGER :: irotray, icoreray, iasyray, iceray, irayseg
INTEGER :: ipin, icelg, icelw, icelv, iasy, ireg, isurf1, isurf2, irot, idir, ifsr
INTEGER :: irsegidx, icellrayidx, FsrIdxSt

! Swaps direction 1 <-> 2; applied to idir when irot == 2.
! NOTE(review): initialising in the declaration gives mp the implicit SAVE
! attribute; it is never modified here, so PARAMETER may be intended - confirm.
INTEGER :: mp(2) = (/ 2, 1 /)

! Tracking Data Storages
! Filled in the tracking stage and read in the sweep stage. The index j is the
! position of the core ray within the current ray.
!   nTotRaySeg(j)         : number of segments stored for core ray j
!   nTotCellRay(j)        : number of cell rays stored for core ray j
!   CellRayIdxSt(c, j, 2) : index of the first segment of cell ray c
!   CellRayIdxSt(c, j, 1) : index of the last segment of cell ray c
!   PinIdx(c, j)          : global pin index of cell ray c
!   SurfIdx(c, j, 1:2)    : PinRaySurf(2, .) and PinRaySurf(1, .) of cell ray c
!   FsrIdx(s, j)          : region index (second index of xst/src/phis) of segment s
!   OptLenList(ig, s, j)  : tau = -LenSeg * xst(ig, region) of segment s
!   ExpAppIdx(ig, s, j)   : INT(tau) clamped to [-40000, 0]; second index into
!                           EXPAPolar / EXPBPolar
!   ExpApp(ipol,ig,s,j)   : EXPAPolar * tau + EXPBPolar for segment s
!   PhiAngOut(ipol,ig,s)  : per-segment values along one core ray during the sweep
INTEGER :: nTotRaySeg(nMaxCoreRay), nTotCellRay(nMaxCoreRay)
INTEGER :: CellRayIdxSt(nMaxCellRay, nMaxCoreRay, 2)
INTEGER :: PinIdx(nMaxCellRay, nMaxCoreRay)
INTEGER :: SurfIdx(nMaxCellRay, nMaxCoreRay, 2)
INTEGER :: ExpAppIdx(mygb : myge, nMaxRaySeg, nMaxCoreRay)
INTEGER :: FsrIdx(nMaxRaySeg, nMaxCoreRay)
REAL(8) :: ExpApp(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg, nMaxCoreRay)
REAL(8) :: OptLenList(mygb : myge, nMaxRaySeg, nMaxCoreRay)
REAL(8) :: PhiAngOut(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg + 2)
    
nAziAngle = RayInfo%nAziAngle
nPolarAngle = RayInfo%nPolarAngle
nPhiAngSv = RayInfo%nPhiAngSv

! Host -> device: inputs are copied, outputs are only allocated on the device.
!$ACC ENTER DATA COPYIN(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :))
!$ACC ENTER DATA CREATE(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))

! Zero the accumulators on the device.
! NOTE(review): Jout is zeroed only when ljout is true, yet it is copied out
! unconditionally at the end of the routine; with ljout false the host copy of
! jout(mygb:myge, ...) appears to receive uninitialised device data - confirm
! that callers do not rely on jout in that case.
!$ACC DATA PRESENT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
!$ACC KERNELS
  phis(mygb : myge, :) = 0._8
!$ACC END KERNELS
IF (ljout) THEN
  !$ACC KERNELS
    jout(mygb : myge, :, :, :) = 0._8
  !$ACC END KERNELS
ENDIF
!$ACC END DATA
! Main compute region: one gang-loop iteration per ray. Gang count and vector
! length come from GPUControl(1); the worker count is fixed at 2, matching the
! two-iteration irot worker loop below. The tracking arrays and phia are
! private to each gang-loop iteration.
! NOTE(review): scalars assigned inside the region (nCoreRay, iRotRay, tau, ...)
! are not listed in any PRIVATE clause; this relies on the implementation's
! default treatment of scalars - confirm.
!$ACC DATA PRESENT(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :),                        &
!$ACC              phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
!$ACC PARALLEL NUM_GANGS(GPUControl(1)%nGang) NUM_WORKERS(2) VECTOR_LENGTH(GPUControl(1)%nVector)
!$ACC LOOP INDEPENDENT GANG PRIVATE(phia, irsegidx, icellrayidx, nTotRaySeg, nTotCellRay, CellRayIdxSt,          &
!$ACC                               PinIdx, SurfIdx, ExpAppIdx, FsrIdx, ExpApp, OptLenList)
DO iRay = 1, GPUControl(1)%nRay
  !$ACC CACHE(phia)
  iRotRay = GPUControl(1)%RayList(iRay)

  !!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!
      
  ! ---- Stage a: build the flattened per-core-ray segment lists ----
  nCoreRay = RayInfo%RotRay(iRotRay)%nRay
  !$ACC LOOP SEQ
  DO j = 1, nCoreRay         
    ! Running counters of segments / cell rays stored so far for core ray j.
    irsegidx = 0; icellrayidx = 0
    iCoreRay = RayInfo%RotRay(iRotRay)%RayIdx(j)
    nAsyRay = RayInfo%CoreRay(iCoreRay)%nRay
    !$ACC LOOP SEQ
    DO k = 1, nAsyRay 
      iasyray = RayInfo%CoreRay(iCoreRay)%AsyRayIdx(k)
      iasy = RayInfo%CoreRay(iCoreRay)%AsyIdx(k)
      ! Entries with assembly index 0 contribute nothing and are skipped.
      IF(iasy .EQ. 0) CYCLE
      nPinRay = RayInfo%AsyRay(iAsyRay)%nCellRay
      !$ACC LOOP SEQ
      DO l = 1, nPinRay
        ipin = RayInfo%AsyRay(iAsyRay)%PinIdx(l)
        iceray = RayInfo%AsyRay(iAsyRay)%PinRayIdx(l)
        ! Convert the assembly-local pin index into the global pin index.
        ipin = CoreInfo%Asy(iAsy)%GlobalPinIdx(ipin)
        icelw = CoreInfo%Pin(ipin)%Cell(iz)
        FsrIdxSt = CoreInfo%Pin(ipin)%FsrIdxSt
        ! irw: position of this cell ray in the flattened list of core ray j.
        irw = icellrayidx + l
        PinIdx(irw, j) = ipin
        ! First segment index belonging to this cell ray.
        CellRayIdxSt(irw, j, 2) = irsegidx + 1
        nRaySeg = CoreInfo%CellInfo(icelw)%CellRay(iceray)%nSeg
        ! Segments of one cell ray are independent: each iteration writes its own
        ! irv slot, so the loop is spread over workers and vector lanes.
        !$ACC LOOP INDEPENDENT WORKER VECTOR
        DO iRaySeg = 1, nRaySeg
          irv = irsegidx + iRaySeg
          ireg = FsrIdxSt + CoreInfo%CellInfo(icelw)%CellRay(iceray)%LocalFsrIdx(iRaySeg) - 1
          FsrIdx(irv, j) = ireg
          !$ACC LOOP SEQ
          DO ig = mygb, myge
            ! tau is the negated product of segment length and xst.
            tau = - CoreInfo%CellInfo(icelw)%CellRay(iceray)%LenSeg(iRaySeg) * xst(ig, ireg)
            OptLenList(ig, irv, j) = tau
            ! Table index: INT(tau) clamped to the range [-40000, 0].
            ExpAppIdx(ig, irv, j) = min(0, max(INT(tau), -40000))
          ENDDO
        ENDDO
        irsegidx = irsegidx + nRaySeg
        ! Last segment index belonging to this cell ray.
        CellRayIdxSt(irw, j, 1) = irsegidx
        ! Note the swap: slot 1 takes PinRaySurf(2, l), slot 2 takes PinRaySurf(1, l).
        SurfIdx(irw, j, 1) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(2, l)
        SurfIdx(irw, j, 2) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(1, l)
      ENDDO
      icellrayidx = icellrayidx + nPinRay
    ENDDO
    nTotRaySeg(j) = irsegidx
    nTotCellRay(j) = icellRayIdx
    ! Evaluate the linear table EXPAPolar * tau + EXPBPolar for every stored
    ! segment of core ray j, for every ig and ipol.
    !$ACC LOOP INDEPENDENT WORKER
    DO iRaySeg = 1, nTotRaySeg(j)
      !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
      DO ig = mygb, myge
        DO ipol = 1, nPolarAngle
          ExpApp(ipol, ig, iRaySeg, j) &
          = EXPAPolar(ipol, ExpAppIdx(ig, iRaySeg, j)) * OptLenList(ig, iRaySeg, j) + EXPBPolar(ipol, ExpAppIdx(ig, iRaySeg, j))
        ENDDO
      ENDDO
    ENDDO
  ENDDO

  ! ---- Stage b: sweep the ray twice (irot = 1, 2), one worker iteration each ----
  ! wt, phiobd and PhiAngOut are private per irot iteration; phia is shared
  ! within the gang iteration but each irot uses its own column phia(:, irot).
  !$ACC LOOP INDEPENDENT WORKER PRIVATE(wt, phiobd, PhiAngOut) 
  DO irot = 1, 2
    !$ACC CACHE(wt)
    PhiAnginSvIdx = RayInfo%PhiAngInSvIdx(iRotRay, irot)
    PhiAngOutSvIdx = RayInfo%PhiangOutSvIdx(iRotRay, irot)
    ! NOTE(review): this whole-section assignment needs the second extent of
    ! PhiAngIn to equal that of phiobd (mygb:myge); only PhiAngIn(:, mygb:myge, :)
    ! was placed on the device - confirm the bounds agree.
    phiobd(:, :) = PhiAngIn(:, :, PhiAnginSvIdx)
    ! irot = 1 visits core rays 1..nCoreRay; irot = 2 visits them in reverse.
    jinc = 1; jbeg = 1; jend = nCoreRay
    IF(irot .EQ. 2) THEN
      jinc = -1; jbeg = nCoreRay; jend = 1
    ENDIF
    !$ACC LOOP SEQ
    DO j = jbeg, jend, jinc
      idir = RayInfo%RotRay(iRotRay)%DIR(j);
      iazi = RayInfo%CoreRay(RayInfo%RotRay(iRotRay)%RayIdx(j))%iang
      wt(1 : nPolarAngle) = wtangP0(1 : nPolarAngle, iazi)
      ! The reversed pass traverses each core ray in the opposite direction.
      IF(irot .EQ. 2) idir = mp(idir)
      nRaySeg = nTotRaySeg(j)   
      IF(idir .EQ. 1) THEN
        ! Forward traversal: slot irw holds the value entering segment irw,
        ! slot irw + 1 the value leaving it.
        PhiAngOut(:, :, 1) = phiobd(:, :)
        ! Segments must be processed in order: each one reads the slot written
        ! by the previous one.
        !$ACC LOOP SEQ
        DO irw = 1, nRaySeg
          ifsr = FsrIdx(irw, j)
          phia(:, irot) = 0._8
          !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
          DO ig = mygb, myge
            DO ipol = 1, nPolarAngle
              phid = (PhiAngOut(ipol, ig, irw) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
              PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw) - phid
              ! The collapsed loop runs several ipol values for the same ig in
              ! parallel, and all of them add into phia(ig, irot).
              !$ACC ATOMIC UPDATE
              phia(ig, irot) = phia(ig, irot) + wt(ipol) * phid
              !$ACC END ATOMIC
            ENDDO
          ENDDO
          !$ACC LOOP INDEPENDENT VECTOR
          DO ig = mygb, myge
            ! phis is shared by the whole parallel region; presumably other gang
            ! iterations or the other irot iteration can hit the same ifsr.
            !$ACC ATOMIC UPDATE
            phis(ig, ifsr) = phis(ig, ifsr) + phia(ig, irot)
            !$ACC END ATOMIC
          ENDDO
        ENDDO
        ! The value leaving the last segment becomes the input of the next core ray.
        phiobd(:, :) = PhiAngOut(:, :, nRaySeg + 1)
        IF(ljout) THEN
          !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
          DO irv = 1, nTotCellRay(j)
            DO ig = mygb, myge
              icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
              phiocel1 = 0._8; phiocel2 = 0._8
              ! Weighted sums over ipol: phiocel1 uses the slot after the cell
              ! ray's last segment, phiocel2 the slot of its first segment.
              !$ACC LOOP SEQ
              DO ipol = 1, nPolarAngle
                phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 1)
                phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2))
              ENDDO
              !$ACC ATOMIC UPDATE
              Jout(ig, 2, isurf1, icelv) = Jout(ig, 2, isurf1, icelv) + phiocel1
              !$ACC END ATOMIC
              !$ACC ATOMIC UPDATE
              Jout(ig, 1, isurf2, icelv) = Jout(ig, 1, isurf2, icelv) + phiocel2
              !$ACC END ATOMIC
            ENDDO
          ENDDO
        ENDIF
      ELSE
        ! Backward traversal: segments are visited from nRaySeg down to 1. Slot
        ! irw + 2 holds the value entering segment irw, slot irw + 1 the value
        ! leaving it (hence PhiAngOut's extent of nMaxRaySeg + 2).
        PhiAngOut(:, :, nRaySeg + 2) = phiobd(:, :)
        irw = nRaySeg + 1
        !$ACC LOOP SEQ
        DO irw1 = 1, nRaySeg
          ! irw1 only counts iterations; irw is the descending segment index.
          irw = irw - 1
          ifsr = FsrIdx(irw, j)
          phia(:, irot) = 0._8
          !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
          DO ig = mygb, myge
            DO ipol = 1, nPolarAngle
              phid = (PhiAngOut(ipol, ig, irw + 2) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
              PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw + 2) - phid
              ! Same reason as in the forward branch: several ipol lanes add
              ! into one phia(ig, irot).
              !$ACC ATOMIC UPDATE
              phia(ig, irot) = phia(ig, irot) + wt(ipol) * phid
              !$ACC END ATOMIC
            ENDDO
          ENDDO
          !$ACC LOOP INDEPENDENT VECTOR
          DO ig = mygb, myge
            !$ACC ATOMIC UPDATE
            phis(ig, ifsr) = phis(ig, ifsr) + phia(ig, irot)
            !$ACC END ATOMIC
          ENDDO
        ENDDO
        ! Slot 2 holds the value leaving segment 1, i.e. the end of this traversal.
        phiobd(:, :) = PhiAngOut(:, :, 2)
        IF(lJout) THEN
          !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
          DO irv = 1, nTotCellRay(j)
            DO ig = mygb, myge
              icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
              phiocel1 = 0._8; phiocel2 = 0._8
              ! Slot offsets are shifted by one relative to the forward branch,
              ! and the second index of Jout (1 / 2) is swapped.
              !$ACC LOOP SEQ
              DO ipol = 1, nPolarAngle
                phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 2)
                phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2) + 1)
              ENDDO
              !$ACC ATOMIC UPDATE
              Jout(ig, 1, isurf1, icelv) = Jout(ig, 1, isurf1, icelv) + phiocel1
              !$ACC END ATOMIC
              !$ACC ATOMIC UPDATE
              Jout(ig, 2, isurf2, icelv) = Jout(ig, 2, isurf2, icelv) + phiocel2
              !$ACC END ATOMIC
            ENDDO
          ENDDO
        ENDIF
      ENDIF
    ENDDO
    ! Store the final boundary value of this pass back into PhiAngIn.
    PhiAngIn(:, :, PhiAngOutSvIdx) = phiobd(:, :)
  ENDDO

  !!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!

ENDDO  
!$ACC END PARALLEL
!$ACC END DATA

! Post-processing kernel: one gang iteration per pin (1..CoreInfo%nxy); the
! regions of the pin's cell and the ig range are collapsed over workers/vectors.
!$ACC DATA PRESENT(xst(mygb : myge, :), src(mygb : myge, :), phis(mygb : myge, :))
!$ACC PARALLEL
!$ACC LOOP INDEPENDENT GANG
DO j = 1, CoreInfo%nxy
  FsrIdxSt = CoreInfo%Pin(j)%FsrIdxSt; icelg = CoreInfo%Pin(j)%Cell(iz)
  !$ACC LOOP INDEPENDENT COLLAPSE(2) WORKER VECTOR
  DO i = 1, CoreInfo%CellInfo(icelg)%nFsr
    DO ig = mygb, myge
      ireg = FsrIdxSt + i - 1
      ! Divide the accumulated sum by xst * vol and add src.
      phis(ig, ireg) = phis(ig, ireg) / (xst(ig, ireg) * CoreInfo%CellInfo(icelg)%vol(i)) + src(ig, ireg)
    ENDDO
  ENDDO
ENDDO
!$ACC END PARALLEL
!$ACC END DATA

! Device -> host: inputs are released, results (including the updated PhiAngIn)
! are copied back.
!$ACC EXIT DATA DELETE(xst(mygb : myge, :), src(mygb : myge, :))
!$ACC EXIT DATA COPYOUT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :), PhiAngIn(:, mygb : myge, :))

END SUBROUTINE

It appears you are using the Nsight Tool. Try using pgprof, which
is in the OpenACC Toolkit.

dave