Hi. I’m trying to see what is happening on GPU.
My code is a bit slow. I suspect that the many atomic operations are slowing it down, but I want to make sure.
This is what I get from NVVP.
And my code is written below.
I know this code is quite complicated, but I don’t think the complexity of the code is the reason for NVVP not providing information about compute occupancy.
How can I see ‘inside’ the cuStreamSynchronize?
#include <defines.h>
! RayTrace_GPU
!
! Sweeps every ray listed in GPUControl(1)%RayList on the accelerator with
! OpenACC and accumulates the result into phis and, when ljout is true, jout.
!
! Steps performed, in order:
!   1. xst, src and PhiAngIn are copied to the device; phis and jout are
!      created there. phis is zeroed; jout is zeroed only when ljout is true.
!   2. One gang iteration per entry of RayList:
!        a. the nested RotRay / CoreRay / AsyRay / CellRay description is
!           flattened into gang-private tables (FsrIdx, OptLenList, ExpAppIdx,
!           PinIdx, SurfIdx, CellRayIdxSt) and ExpApp is evaluated from the
!           EXPAPolar / EXPBPolar tables;
!        b. the ray is swept twice (irot = 1, 2), the second time with the
!           core rays visited in reverse order and the direction flag
!           flipped. Each sweep starts from PhiAngIn(:, :, PhiAngInSvIdx) and
!           stores its final value in PhiAngIn(:, :, PhiAngOutSvIdx).
!   3. phis is divided by xst * vol and src is added, region by region.
!   4. phis, jout and PhiAngIn are copied back; xst and src are deleted from
!      the device.
!
! Arguments (no INTENT is declared; the roles below are what the body does):
!   RayInfo, CoreInfo : ray and geometry descriptions, expected to be present
!                       on the device already (ACC DECLARE PRESENT).
!   phis(ig, ireg)    : overwritten with the result of steps 1-3.
!   PhiAngIn(ipol, ig, isv) : read and updated for groups mygb:myge.
!   xst(ig, ireg), src(ig, ireg) : read only.
!   jout(ig, 1:2, isurf, ipin)   : accumulated only when ljout is true.
!   iz                : index passed to CoreInfo%Pin(:)%Cell(iz).
!   mygb, myge        : first and last index of the first (ig) dimension
!                       that is processed and transferred.
!   ljout             : enables the jout accumulation.
!
! NOTE(review): jout is created on the device and copied out even when ljout
! is false, in which case it was never initialised on the device. Confirm
! that callers ignore jout in that case.
SUBROUTINE RayTrace_GPU(RayInfo, CoreInfo, phis, PhiAngIn, xst, src, jout, iz, mygb, myge, ljout)
  USE PARAM
  USE TYPEDEF, ONLY : RayInfo_Type, Coreinfo_type
  USE MOC_MOD, ONLY : nMaxRaySeg, nMaxCellRay, nMaxAsyRay, nMaxCoreRay, &
                      EXPAPolar, EXPBPolar, wtangP0
  USE PE_MOD, ONLY : PE, GPUControl
  IMPLICIT NONE

  TYPE(RayInfo_Type) :: RayInfo
  TYPE(CoreInfo_Type) :: CoreInfo
  !$ACC DECLARE PRESENT(RayInfo, CoreInfo, GPUControl, EXPAPolar, EXPBPolar, wtangP0)
  REAL(8), POINTER :: phis(:, :), PhiAngIn(:, :, :), xst(:, :), src(:, :), jout(:, :, :, :)
  INTEGER :: iz, mygb, myge
  LOGICAL :: ljout

  INTEGER :: iRay
  INTEGER :: i, j, k, l, jbeg, jend, jinc, irw, irw1, irv, ig
  REAL(8) :: wt(RayInfo%nPolarAngle), tau
  REAL(8) :: phiobd(Rayinfo%nPolarAngle, mygb : myge), phia(mygb : myge, 2), phid, phiocel1, phiocel2
  INTEGER :: nPolarAngle
  INTEGER :: iazi, ipol, PhiAnginSvIdx, PhiAngOutSvIdx
  INTEGER :: nCoreRay, nAsyRay, nPinRay, nRaySeg
  INTEGER :: irotray, icoreray, iasyray, iceray, irayseg
  INTEGER :: ipin, icelg, icelw, icelv, iasy, ireg, isurf1, isurf2, irot, idir, ifsr
  INTEGER :: irsegidx, icellrayidx, FsrIdxSt
  ! Direction flip table (1 -> 2, 2 -> 1). A named constant, so it carries no
  ! implicit SAVE state into the compute region.
  INTEGER, PARAMETER :: mp(2) = (/ 2, 1 /)

  ! Tracking Data Storages (private to each gang iteration, see the LOOP GANG clause)
  INTEGER :: nTotRaySeg(nMaxCoreRay), nTotCellRay(nMaxCoreRay)
  INTEGER :: CellRayIdxSt(nMaxCellRay, nMaxCoreRay, 2)
  INTEGER :: PinIdx(nMaxCellRay, nMaxCoreRay)
  INTEGER :: SurfIdx(nMaxCellRay, nMaxCoreRay, 2)
  INTEGER :: ExpAppIdx(mygb : myge, nMaxRaySeg, nMaxCoreRay)
  INTEGER :: FsrIdx(nMaxRaySeg, nMaxCoreRay)
  REAL(8) :: ExpApp(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg, nMaxCoreRay)
  REAL(8) :: OptLenList(mygb : myge, nMaxRaySeg, nMaxCoreRay)
  REAL(8) :: PhiAngOut(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg + 2)

  nPolarAngle = RayInfo%nPolarAngle

  ! ---- Step 1: device data setup and zeroing of the accumulators ----
  !$ACC ENTER DATA COPYIN(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :))
  !$ACC ENTER DATA CREATE(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))

  !$ACC DATA PRESENT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
  !$ACC KERNELS
  phis(mygb : myge, :) = 0._8
  !$ACC END KERNELS
  IF (ljout) THEN
    !$ACC KERNELS
    jout(mygb : myge, :, :, :) = 0._8
    !$ACC END KERNELS
  ENDIF
  !$ACC END DATA

  ! ---- Step 2: ray sweep, one gang iteration per entry of RayList ----
  !$ACC DATA PRESENT(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :), &
  !$ACC              phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
  !$ACC PARALLEL NUM_GANGS(GPUControl(1)%nGang) NUM_WORKERS(2) VECTOR_LENGTH(GPUControl(1)%nVector)
  !$ACC LOOP INDEPENDENT GANG PRIVATE(phia, irsegidx, icellrayidx, nTotRaySeg, nTotCellRay, CellRayIdxSt, &
  !$ACC                               PinIdx, SurfIdx, ExpAppIdx, FsrIdx, ExpApp, OptLenList)
  DO iRay = 1, GPUControl(1)%nRay
    !$ACC CACHE(phia)
    iRotRay = GPUControl(1)%RayList(iRay)

    !!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!
    ! Step 2a: flatten the ray description into the per-core-ray tables.
    nCoreRay = RayInfo%RotRay(iRotRay)%nRay
    !$ACC LOOP SEQ
    DO j = 1, nCoreRay
      ! Running counts of segments and cell rays already stored for core ray j.
      irsegidx = 0
      icellrayidx = 0
      iCoreRay = RayInfo%RotRay(iRotRay)%RayIdx(j)
      nAsyRay = RayInfo%CoreRay(iCoreRay)%nRay
      !$ACC LOOP SEQ
      DO k = 1, nAsyRay
        iasyray = RayInfo%CoreRay(iCoreRay)%AsyRayIdx(k)
        iasy = RayInfo%CoreRay(iCoreRay)%AsyIdx(k)
        ! Entries with AsyIdx = 0 contribute nothing and are skipped.
        IF(iasy .EQ. 0) CYCLE
        nPinRay = RayInfo%AsyRay(iAsyRay)%nCellRay
        !$ACC LOOP SEQ
        DO l = 1, nPinRay
          ipin = RayInfo%AsyRay(iAsyRay)%PinIdx(l)
          iceray = RayInfo%AsyRay(iAsyRay)%PinRayIdx(l)
          ! Convert the local pin index into the global one.
          ipin = CoreInfo%Asy(iAsy)%GlobalPinIdx(ipin)
          icelw = CoreInfo%Pin(ipin)%Cell(iz)
          FsrIdxSt = CoreInfo%Pin(ipin)%FsrIdxSt
          irw = icellrayidx + l
          PinIdx(irw, j) = ipin
          ! CellRayIdxSt(:, :, 2) holds the first flattened segment of the cell ray ...
          CellRayIdxSt(irw, j, 2) = irsegidx + 1
          nRaySeg = CoreInfo%CellInfo(icelw)%CellRay(iceray)%nSeg
          !$ACC LOOP INDEPENDENT WORKER VECTOR
          DO iRaySeg = 1, nRaySeg
            irv = irsegidx + iRaySeg
            ireg = FsrIdxSt + CoreInfo%CellInfo(icelw)%CellRay(iceray)%LocalFsrIdx(iRaySeg) - 1
            FsrIdx(irv, j) = ireg
            !$ACC LOOP SEQ
            DO ig = mygb, myge
              tau = - CoreInfo%CellInfo(icelw)%CellRay(iceray)%LenSeg(iRaySeg) * xst(ig, ireg)
              OptLenList(ig, irv, j) = tau
              ! Truncate tau and clamp it to [-40000, 0]; the result is used as
              ! the second index of EXPAPolar / EXPBPolar below.
              ExpAppIdx(ig, irv, j) = min(0, max(INT(tau), -40000))
            ENDDO
          ENDDO
          irsegidx = irsegidx + nRaySeg
          ! ... and CellRayIdxSt(:, :, 1) its last flattened segment.
          CellRayIdxSt(irw, j, 1) = irsegidx
          SurfIdx(irw, j, 1) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(2, l)
          SurfIdx(irw, j, 2) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(1, l)
        ENDDO
        icellrayidx = icellrayidx + nPinRay
      ENDDO
      nTotRaySeg(j) = irsegidx
      nTotCellRay(j) = icellRayIdx

      ! Linear table evaluation: ExpApp = A(idx) * tau + B(idx).
      !$ACC LOOP INDEPENDENT WORKER
      DO iRaySeg = 1, nTotRaySeg(j)
        !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
        DO ig = mygb, myge
          DO ipol = 1, nPolarAngle
            ExpApp(ipol, ig, iRaySeg, j) = EXPAPolar(ipol, ExpAppIdx(ig, iRaySeg, j)) * OptLenList(ig, iRaySeg, j) &
                                         + EXPBPolar(ipol, ExpAppIdx(ig, iRaySeg, j))
          ENDDO
        ENDDO
      ENDDO
    ENDDO

    ! Step 2b: sweep the ray in both senses; each value of irot is handled by
    ! one worker with its own wt, phiobd and PhiAngOut, and its own column of phia.
    !$ACC LOOP INDEPENDENT WORKER PRIVATE(wt, phiobd, PhiAngOut)
    DO irot = 1, 2
      !$ACC CACHE(wt)
      PhiAnginSvIdx = RayInfo%PhiAngInSvIdx(iRotRay, irot)
      PhiAngOutSvIdx = RayInfo%PhiangOutSvIdx(iRotRay, irot)
      ! Only groups mygb:myge of PhiAngIn are mapped to the device, and phiobd
      ! has exactly that extent, so the section is written out explicitly.
      phiobd(:, :) = PhiAngIn(:, mygb : myge, PhiAnginSvIdx)
      ! irot = 1 visits the core rays in stored order, irot = 2 in reverse.
      jinc = 1; jbeg = 1; jend = nCoreRay
      IF(irot .EQ. 2) THEN
        jinc = -1; jbeg = nCoreRay; jend = 1
      ENDIF
      !$ACC LOOP SEQ
      DO j = jbeg, jend, jinc
        idir = RayInfo%RotRay(iRotRay)%DIR(j)
        iazi = RayInfo%CoreRay(RayInfo%RotRay(iRotRay)%RayIdx(j))%iang
        wt(1 : nPolarAngle) = wtangP0(1 : nPolarAngle, iazi)
        IF(irot .EQ. 2) idir = mp(idir)
        nRaySeg = nTotRaySeg(j)
        IF(idir .EQ. 1) THEN
          ! Forward sweep: PhiAngOut(:, :, irw) enters segment irw and
          ! PhiAngOut(:, :, irw + 1) leaves it.
          PhiAngOut(:, :, 1) = phiobd(:, :)
          !$ACC LOOP SEQ
          DO irw = 1, nRaySeg
            ifsr = FsrIdx(irw, j)
            phia(:, irot) = 0._8
            ! phia(ig, irot) is shared by the vector lanes that differ only in
            ! ipol, hence the atomic update.
            ! NOTE(review): a per-ig sequential sum over ipol would avoid this
            ! atomic; left unchanged here.
            !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
            DO ig = mygb, myge
              DO ipol = 1, nPolarAngle
                phid = (PhiAngOut(ipol, ig, irw) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
                PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw) - phid
                !$ACC ATOMIC UPDATE
                phia(ig, irot) = phia(ig, irot) + wt(ipol) * phid
                !$ACC END ATOMIC
              ENDDO
            ENDDO
            ! phis is not private: other gangs and the other worker may hit the
            ! same (ig, ifsr) entry, so this update is atomic as well.
            !$ACC LOOP INDEPENDENT VECTOR
            DO ig = mygb, myge
              !$ACC ATOMIC UPDATE
              phis(ig, ifsr) = phis(ig, ifsr) + phia(ig, irot)
              !$ACC END ATOMIC
            ENDDO
          ENDDO
          phiobd(:, :) = PhiAngOut(:, :, nRaySeg + 1)
          IF(ljout) THEN
            ! Accumulate the weighted values at both ends of every cell ray into Jout.
            !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
            DO irv = 1, nTotCellRay(j)
              DO ig = mygb, myge
                icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
                phiocel1 = 0._8; phiocel2 = 0._8
                !$ACC LOOP SEQ
                DO ipol = 1, nPolarAngle
                  phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 1)
                  phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2))
                ENDDO
                !$ACC ATOMIC UPDATE
                Jout(ig, 2, isurf1, icelv) = Jout(ig, 2, isurf1, icelv) + phiocel1
                !$ACC END ATOMIC
                !$ACC ATOMIC UPDATE
                Jout(ig, 1, isurf2, icelv) = Jout(ig, 1, isurf2, icelv) + phiocel2
                !$ACC END ATOMIC
              ENDDO
            ENDDO
          ENDIF
        ELSE
          ! Backward sweep: segments are visited from nRaySeg down to 1;
          ! PhiAngOut(:, :, irw + 2) enters segment irw and
          ! PhiAngOut(:, :, irw + 1) leaves it.
          PhiAngOut(:, :, nRaySeg + 2) = phiobd(:, :)
          irw = nRaySeg + 1
          !$ACC LOOP SEQ
          DO irw1 = 1, nRaySeg
            irw = irw - 1
            ifsr = FsrIdx(irw, j)
            phia(:, irot) = 0._8
            !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
            DO ig = mygb, myge
              DO ipol = 1, nPolarAngle
                phid = (PhiAngOut(ipol, ig, irw + 2) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
                PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw + 2) - phid
                !$ACC ATOMIC UPDATE
                phia(ig, irot) = phia(ig, irot) + wt(ipol) * phid
                !$ACC END ATOMIC
              ENDDO
            ENDDO
            !$ACC LOOP INDEPENDENT VECTOR
            DO ig = mygb, myge
              !$ACC ATOMIC UPDATE
              phis(ig, ifsr) = phis(ig, ifsr) + phia(ig, irot)
              !$ACC END ATOMIC
            ENDDO
          ENDDO
          phiobd(:, :) = PhiAngOut(:, :, 2)
          IF(lJout) THEN
            ! Same accumulation as in the forward branch, with the PhiAngOut
            ! offsets and the second Jout index adapted to the reversed sweep.
            !$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
            DO irv = 1, nTotCellRay(j)
              DO ig = mygb, myge
                icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
                phiocel1 = 0._8; phiocel2 = 0._8
                !$ACC LOOP SEQ
                DO ipol = 1, nPolarAngle
                  phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 2)
                  phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2) + 1)
                ENDDO
                !$ACC ATOMIC UPDATE
                Jout(ig, 1, isurf1, icelv) = Jout(ig, 1, isurf1, icelv) + phiocel1
                !$ACC END ATOMIC
                !$ACC ATOMIC UPDATE
                Jout(ig, 2, isurf2, icelv) = Jout(ig, 2, isurf2, icelv) + phiocel2
                !$ACC END ATOMIC
              ENDDO
            ENDDO
          ENDIF
        ENDIF
      ENDDO
      ! Store the value leaving the ray; same explicit group section as on input.
      PhiAngIn(:, mygb : myge, PhiAngOutSvIdx) = phiobd(:, :)
    ENDDO
    !!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!
  ENDDO
  !$ACC END PARALLEL
  !$ACC END DATA

  ! ---- Step 3: phis = phis / (xst * vol) + src for every region of every pin ----
  !$ACC DATA PRESENT(xst(mygb : myge, :), src(mygb : myge, :), phis(mygb : myge, :))
  !$ACC PARALLEL
  !$ACC LOOP INDEPENDENT GANG
  DO j = 1, CoreInfo%nxy
    FsrIdxSt = CoreInfo%Pin(j)%FsrIdxSt; icelg = CoreInfo%Pin(j)%Cell(iz)
    !$ACC LOOP INDEPENDENT COLLAPSE(2) WORKER VECTOR
    DO i = 1, CoreInfo%CellInfo(icelg)%nFsr
      DO ig = mygb, myge
        ireg = FsrIdxSt + i - 1
        phis(ig, ireg) = phis(ig, ireg) / (xst(ig, ireg) * CoreInfo%CellInfo(icelg)%vol(i)) + src(ig, ireg)
      ENDDO
    ENDDO
  ENDDO
  !$ACC END PARALLEL
  !$ACC END DATA

  ! ---- Step 4: release the inputs and bring the results back to the host ----
  !$ACC EXIT DATA DELETE(xst(mygb : myge, :), src(mygb : myge, :))
  !$ACC EXIT DATA COPYOUT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :), PhiAngIn(:, mygb : myge, :))
END SUBROUTINE RayTrace_GPU