Okay. I will try removing private and let you know the result.
However, as you can see, there are many scalars that don’t need to be allocated for each vector unit. If they use up the register, isn’t the number of threads I can launch limited?
And Dmalloc doesn’t initialize values.
What I suspect is the optimization process that you called dead line elimination. Compiler is making wrong decisions, that is, if I remove (A), compiler may think that GPUWorkerDat = ireg line is not needed and thus removing this line. Thereby ifsr becomes uninitialized, and cause illegal address error at reading Src(ig, ifsr). How do you think?
FYI, when I tested this code with OpenACC directives disabled, the result was fine.
#include <defines.h>
SUBROUTINE RayTrace_GPU(RayInfo, CoreInfo, phis, PhiAngIn, xst, src, jout, iz, mygb, myge, ljout)
USE PARAM
USE TYPEDEF, ONLY : RayInfo_Type, Coreinfo_type
USE MOC_MOD, ONLY : nMaxRaySeg, nMaxCellRay, nMaxAsyRay, nMaxCoreRay, &
EXPAPolar, EXPBPolar, wtangP0
USE PE_MOD, ONLY : PE, GPUControl
IMPLICIT NONE
TYPE(RayInfo_Type) :: RayInfo
TYPE(CoreInfo_Type) :: CoreInfo
!$ACC DECLARE PRESENT(RayInfo, CoreInfo, GPUControl, EXPAPolar, EXPBPolar, wtangP0)
REAL(8), POINTER :: phis(:, :), PhiAngIn(:, :, :), xst(:, :), src(:, :), jout(:, :, :, :)
INTEGER :: iz, mygb, myge
LOGICAL :: ljout
INTEGER :: iGang, iWorker, iRay
INTEGER :: i, j, k, l, m, jbeg, jend, jinc, irw, irw1, irv, ig
REAL(8) :: wttemp, wt(RayInfo%nPolarAngle), tau
REAL(8) :: phiobd(Rayinfo%nPolarAngle, mygb : myge), phia(mygb : myge), phid, phiocel1, phiocel2
INTEGER :: nPolarAngle, nAziAngle, nPhiAngSv
INTEGER :: iazi, ipol, PhiAnginSvIdx, PhiAngOutSvIdx
INTEGER :: nRotRay, nCoreRay, nAsyRay, nPinRay, nRaySeg
INTEGER :: irotray, icoreray, iasyray, iceray, irayseg
INTEGER :: ipin, icelg, icelw, icelv, iasy, ireg, isurf1, isurf2, irot, idir, ifsr
INTEGER :: irsegidx, icellrayidx, FsrIdxSt
INTEGER :: mp(2) = (/ 2, 1 /)
! Tracking Data Storages
INTEGER :: nTotRaySeg(nMaxCoreRay), nTotCellRay(nMaxCoreRay)
INTEGER :: CellRayIdxSt(nMaxCellRay, nMaxCoreRay, 2)
INTEGER :: PinIdx(nMaxCellRay, nMaxCoreRay)
INTEGER :: SurfIdx(nMaxCellRay, nMaxCoreRay, 2)
INTEGER :: ExpAppIdx(mygb : myge, nMaxRaySeg, nMaxCoreRay)
INTEGER :: FsrIdx(nMaxRaySeg, nMaxCoreRay)
REAL(8) :: ExpApp(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg, nMaxCoreRay)
REAL(8) :: OptLenList(mygb : myge, nMaxRaySeg, nMaxCoreRay)
REAL(8) :: PhiAngOut(RayInfo%nPolarAngle, mygb : myge, nMaxRaySeg + 2)
nAziAngle = RayInfo%nAziAngle
nPolarAngle = RayInfo%nPolarAngle
nPhiAngSv = RayInfo%nPhiAngSv
!$ACC ENTER DATA COPYIN(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :))
!$ACC ENTER DATA COPYIN(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
!$ACC DATA PRESENT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
!$ACC KERNELS
phis(mygb : myge, :) = 0._8
!$ACC END KERNELS
IF (ljout) THEN
!$ACC KERNELS
jout(mygb : myge, :, :, :) = 0._8
!$ACC END KERNELS
ENDIF
!$ACC END DATA
!$ACC DATA PRESENT(xst(mygb : myge, :), src(mygb : myge, :), PhiAngIn(:, mygb : myge, :), &
!$ACC phis(mygb : myge, :), Jout(mygb : myge, :, :, :))
!$ACC PARALLEL &
!$ACC NUM_GANGS(GPUControl(1)%nGang) NUM_WORKERS(GPUControl(1)%nWorker) VECTOR_LENGTH(GPUControl(1)%nVector)
!$ACC LOOP INDEPENDENT GANG PRIVATE(iGang)
DO iGang = 1, GPUControl(1)%nGang
!$ACC LOOP INDEPENDENT WORKER PRIVATE(wt, phiobd, phia, FsrIdxSt, nTotRaySeg, nTotCellRay, &
!$ACC CellRayIdxSt, PinIdx, SurfIdx, ExpAppIdx, FsrIdx, ExpApp, OptLenList, PhiAngOut)
DO iWorker = 1, GPUControl(1)%nWorker
!$ACC CACHE(wt, phia)
!$ACC LOOP SEQ
DO iRay = 1, GPUControl(1)%OpenACCRayList(0, iWorker, iGang)
iRotRay = GPUControl(1)%OpenACCRayList(iRay, iWorker, iGang)
!!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!
nCoreRay = RayInfo%RotRay(iRotRay)%nRay
PRINT *, '1st', iRotRay, nCoreRay
!$ACC LOOP SEQ
DO j = 1, nCoreRay
irsegidx = 0; icellrayidx = 0
iCoreRay = RayInfo%RotRay(iRotRay)%RayIdx(j)
nAsyRay = RayInfo%CoreRay(iCoreRay)%nRay
PRINT *, '2nd', iCoreRay, nAsyRay
!$ACC LOOP SEQ
DO k = 1, nAsyRay
iasyray = RayInfo%CoreRay(iCoreRay)%AsyRayIdx(k)
iasy = RayInfo%CoreRay(iCoreRay)%AsyIdx(k)
IF(iasy .EQ. 0) CYCLE !Skip Dummy Assembly
nPinRay = RayInfo%AsyRay(iAsyRay)%nCellRay
PRINT *, '3rd', iasyray, iasy, nPinRay
!$ACC LOOP SEQ
DO l = 1, nPinRay
ipin = RayInfo%AsyRay(iAsyRay)%PinIdx(l)
iceray = RayInfo%AsyRay(iAsyRay)%PinRayIdx(l)
ipin = CoreInfo%Asy(iAsy)%GlobalPinIdx(ipin)
icelw = CoreInfo%Pin(ipin)%Cell(iz)
FsrIdxSt = CoreInfo%Pin(ipin)%FsrIdxSt
icellrayidx = icellrayidx + 1
PRINT *, '4th', ipin, iceray, icelw, FsrIdxSt, icellrayidx
PinIdx(icellrayidx, j) = ipin
CellRayIdxSt(icellrayidx, j, 2) = irsegidx + 1
nRaySeg = CoreInfo%CellInfo(icelw)%CellRay(iceray)%nSeg
!$ACC LOOP INDEPENDENT VECTOR
DO iRaySeg = 1, nRaySeg
irv = irsegidx + iRaySeg
ireg = FsrIdxSt + CoreInfo%CellInfo(icelw)%CellRay(iceray)%LocalFsrIdx(iRaySeg) - 1
FsrIdx(irv, j) = ireg
!$ACC LOOP SEQ
DO ig = mygb, myge
tau = - CoreInfo%CellInfo(icelw)%CellRay(iceray)%LenSeg(iRaySeg) * xst(ig, ireg)
OptLenList(ig, irv, j) = tau
ExpAppIdx(ig, irv, j) = min(0, max(INT(tau), -40000))
ENDDO
ENDDO
irsegidx = irsegidx + nRaySeg
CellRayIdxSt(icellrayidx, j, 1) = irsegidx
SurfIdx(icellRayIdx, j, 1) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(2, l)
SurfIdx(icellRayIdx, j, 2) = RayInfo%AsyRay(iAsyRay)%PinRaySurf(1, l)
ENDDO
ENDDO
nTotRaySeg(j) = irsegidx
nTotCellRay(j) = icellRayIdx
!$ACC LOOP INDEPENDENT COLLAPSE(3) VECTOR
DO iRaySeg = 1, nTotRaySeg(j)
DO ig = mygb, myge
DO ipol = 1, nPolarAngle
ExpApp(ipol, ig, iRaySeg, j) &
= EXPAPolar(ipol, ExpAppIdx(ig, iRaySeg, j)) * OptLenList(ig, iRaySeg, j) + EXPBPolar(ipol, ExpAppIdx(ig, iRaySeg, j))
ENDDO
ENDDO
ENDDO
ENDDO
!$ACC LOOP SEQ
DO irot = 1, 2
PhiAnginSvIdx = RayInfo%PhiAngInSvIdx(iRotRay, irot)
PhiAngOutSvIdx = RayInfo%PhiangOutSvIdx(iRotRay, irot)
phiobd(:, :) = PhiAngIn(:, :, PhiAnginSvIdx)
jinc = 1; jbeg = 1; jend = nCoreRay
IF(irot .EQ. 2) THEN
jinc = -1; jbeg = nCoreRay; jend = 1
ENDIF
!$ACC LOOP SEQ
DO j = jbeg, jend, jinc
idir = RayInfo%RotRay(iRotRay)%DIR(j);
iazi = RayInfo%CoreRay(RayInfo%RotRay(iRotRay)%RayIdx(j))%iang
wt(:) = wtangP0(:, iazi)
IF(irot .EQ. 2) idir = mp(idir)
nRaySeg = nTotRaySeg(j)
IF(idir .EQ. 1) THEN
PhiAngOut(:, :, 1) = phiobd(:, :)
!$ACC LOOP SEQ
DO irw = 1, nRaySeg
ifsr = FsrIdx(irw, j)
phia(:) = 0._8
!$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
DO ig = mygb, myge
DO ipol = 1, nPolarAngle
phid = (PhiAngOut(ipol, ig, irw) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw) - phid
!$ACC ATOMIC UPDATE
! phis(ig, ifsr) = phis(ig, ifsr) + wt(ipol) * phid
phia(ig) = phia(ig) + wt(ipol) * phid
!$ACC END ATOMIC
ENDDO
ENDDO
!$ACC LOOP INDEPENDENT VECTOR
DO ig = mygb, myge
!$ACC ATOMIC UPDATE
phis(ig, ifsr) = phis(ig, ifsr) + phia(ig)
!$ACC END ATOMIC
ENDDO
ENDDO
phiobd(:, :) = PhiAngOut(:, :, nRaySeg + 1)
IF(ljout) THEN
!$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
DO irv = 1, nTotCellRay(j)
DO ig = mygb, myge
icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
phiocel1 = 0._8; phiocel2 = 0._8
!$ACC LOOP SEQ
DO ipol = 1, nPolarAngle
phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 1)
phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2))
ENDDO
!$ACC ATOMIC UPDATE
Jout(ig, 2, isurf1, icelv) = Jout(ig, 2, isurf1, icelv) + phiocel1
Jout(ig, 1, isurf2, icelv) = Jout(ig, 1, isurf2, icelv) + phiocel2
!$ACC END ATOMIC
ENDDO
ENDDO
ENDIF
ELSE
PhiAngOut(:, :, nRaySeg + 2) = phiobd(:, :)
irw = nRaySeg + 1
!$ACC LOOP SEQ
DO irw1 = 1, nRaySeg
irw = irw - 1
ifsr = FsrIdx(irw, j)
phia(:) = 0._8
!$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
DO ig = mygb, myge
DO ipol = 1, nPolarAngle
phid = (PhiAngOut(ipol, ig, irw + 2) - src(ig, ifsr)) * ExpApp(ipol, ig, irw, j)
PhiAngOut(ipol, ig, irw + 1) = PhiAngOut(ipol, ig, irw + 2) - phid
!$ACC ATOMIC UPDATE
! phis(ig, ifsr) = phis(ig, ifsr) + wt(ipol) * phid
phia(ig) = phia(ig) + wt(ipol) * phid
!$ACC END ATOMIC
ENDDO
ENDDO
!$ACC LOOP INDEPENDENT VECTOR
DO ig = mygb, myge
!$ACC ATOMIC UPDATE
phis(ig, ifsr) = phis(ig, ifsr) + phia(ig)
!$ACC END ATOMIC
ENDDO
ENDDO
phiobd(:, :) = PhiAngOut(:, :, 2)
IF(lJout) THEN
!$ACC LOOP INDEPENDENT COLLAPSE(2) VECTOR
DO irv = 1, nTotCellRay(j)
DO ig = mygb, myge
icelv = PinIdx(irv, j); isurf1 = SurfIdx(irv, j, 1); isurf2 = SurfIdx(irv, j, 2)
phiocel1 = 0._8; phiocel2 = 0._8
!$ACC LOOP SEQ
DO ipol = 1, nPolarAngle
phiocel1 = phiocel1 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 1) + 2)
phiocel2 = phiocel2 + wt(ipol) * PhiAngOut(ipol, ig, CellRayIdxSt(irv, j, 2) + 1)
ENDDO
!$ACC ATOMIC UPDATE
Jout(ig, 1, isurf1, icelv) = Jout(ig, 1, isurf1, icelv) + phiocel1
Jout(ig, 2, isurf2, icelv) = Jout(ig, 2, isurf2, icelv) + phiocel2
!$ACC END ATOMIC
ENDDO
ENDDO
ENDIF
ENDIF
ENDDO
PhiAngIn(:, :, PhiAngOutSvIdx) = phiobd(:, :)
ENDDO
!!!!!!!!!!!!!!!! Inlined Tracking Subroutine !!!!!!!!!!!!!!!!
ENDDO
ENDDO
ENDDO
!$ACC LOOP INDEPENDENT GANG PRIVATE(j, FsrIdxSt, icelg)
DO j = 1, CoreInfo%nxy
FsrIdxSt = CoreInfo%Pin(j)%FsrIdxSt; icelg = CoreInfo%Pin(j)%Cell(iz)
!$ACC LOOP INDEPENDENT WORKER PRIVATE(i, ireg)
DO i = 1, CoreInfo%CellInfo(icelg)%nFsr
ireg = FsrIdxSt + i - 1
!$ACC LOOP INDEPENDENT VECTOR
DO ig = mygb, myge
phis(ig, ireg) = phis(ig, ireg) / (xst(ig, ireg) * CoreInfo%CellInfo(icelg)%vol(i)) + src(ig, ireg)
ENDDO
ENDDO
ENDDO
!$ACC END PARALLEL
!$ACC END DATA
!$ACC EXIT DATA DELETE(xst(mygb : myge, :), src(mygb : myge, :))
!$ACC EXIT DATA COPYOUT(phis(mygb : myge, :), Jout(mygb : myge, :, :, :), PhiAngIn(:, mygb : myge, :))
PAUSE
END SUBROUTINE
I eliminated the derived type variable. I thought this would be safer. Also, I tidied up the code a little bit.
The problem is not fixed yet, though.
Removing private declarations on scalars didn’t make any change.
While I was examining some features, I found an odd thing.
The value of icellrayidx is not properly incremented in “Release mode” only.
icellrayidx has some values like 272043665.
Why?
And do you have any other suggestions about strange behavior of FsrIdx?
I’m getting pessimistic about when I will be able to finish this code. I hope there is a solution.