The only thing I’m seeing is that there’s about 2us when it’s entering and exiting the implicit data region (add “openacc” to nsys trace options to see the OpenACC runtime calls). I can get rid of this by adding “default(present)” on the parallel loop construct so the compiler knows all data is present on the device.
All remaining time seems to be the loop overhead and some of the call overhead. This can be measured by instrumenting the code with NVTX.
Try using this version to see if the gaps remain and if so, the NVTX ranges will give us an idea where in the host code that’s causing it.
Note that I also added a “wait” directive so the CUDA Fortran calls didn’t start before the OpenACC section is finished.
module util
implicit none
integer :: ne, dof, irk
real, allocatable, dimension(:,:,:) :: ze,qx,qy,rhs_ze,rhs_qx,rhs_qy
contains
subroutine init(n, d, r)
implicit none
integer, value :: n, d, r
integer :: k, j
ne = n
dof = d
irk = r
if (.not. allocated(ze)) then
allocate (ze(dof,ne,irk), qx(dof, ne,irk), qy(dof, ne,irk))
allocate (rhs_ze(dof,ne,irk), rhs_qx(dof,ne,irk), rhs_qy(dof,ne,irk))
endif
! some arbitrary initialization on host
DO J = 1,NE
DO K = 1,DOF
ze(k,j,1) = j/ne
qx(k,j,1) = ze(k,j,1) + k/ne
qy(k,j,1) = qx(k,j,1) + j/dof
rhs_ze(k,j,1) = j/ne
rhs_qx(k,j,1) = ze(k,j,1) + k/ne
rhs_qy(k,j,1) = qx(k,j,1) + j/dof
enddo
enddo
end subroutine init
subroutine rk_update()
implicit none
integer :: k, j, i
!$acc parallel loop gang vector async(1) default(present)
DO J = 1,NE
do i = 1,IRK-1
DO K = 1,DOF
ZE(K,J,i+1) = ZE(K,J,i) + RHS_ZE(K,J,i)
QX(K,J,i+1) = QX(K,J,i) + RHS_QX(K,J,i)
QY(K,J,i+1) = QY(K,J,i) + RHS_QY(K,J,i)
enddo
enddo
enddo
end subroutine rk_update
attributes(global) subroutine rk_update_cuda()
implicit none
integer :: k, j, i
j = (blockidx%x-1)*blockdim%x + threadidx%x
if (j <= ne) then
do i = 1,IRK-1
DO K = 1,DOF
ZE(K,J,i+1) = ZE(K,J,i) + RHS_ZE(K,J,i)
QX(K,J,i+1) = QX(K,J,i) + RHS_QX(K,J,i)
QY(K,J,i+1) = QY(K,J,i) + RHS_QY(K,J,i)
enddo
enddo
endif
end subroutine rk_update_cuda
end module util
program main
use util
use nvtx
implicit none
integer :: i, N, nn
nn = 300000
N = 1000
call init(n=nn, d=3, r=2)
call nvtxStartRange("Start ACC")
do i = 1,N
call nvtxStartRange("rk_update")
call rk_update()
call nvtxEndRange
enddo
!$acc wait
call nvtxEndRange
! cuda version
call init(n=nn, d=3, r=2)
call nvtxStartRange("Start CUF")
do i = 1,N
call nvtxStartRange("rk_update_cuda")
call rk_update_cuda<<<nn/128+1, 128>>>()
call nvtxEndRange
enddo
call nvtxEndRange
end program main