Hello,
Thank you for your previous replies. I could get pretty good speed-ups in my MPI-GPU hybrid code. It is very promising. Moving forward, I’m thinking of using async and routine directives to hide memory latency.
Based on documentation and other forum discussions, I could write the following code structure, but I’m a bit confused about how to use ACC routine directives in Fortran. Could you comment on this reproducing example? I declared the arrays to be used in the device subroutine AAA and used default(present) cluase to notify in the outer loop.
program routine_example
use openacc
implicit none
integer,parameter :: &
NPmax=100000, &
NPNmax = 300, &
NSD = 3 &
TIMESTEPS = 10000
integer,allocatable :: npnl(:), pair(:,:)
real(8),allocatable :: mass(:), at(:,:), p(:), rho(:), dwdx(:,:,:)
!$acc declare create(npnl,pair,mass,at,p,rho,dwdx)
integer :: ii,i,err
!$acc routine(AAA)
allocate(npnl(NPmax), pair(NPNmax,NPmax), mass(NPmax), at(NSD,NPmax), &
p(NPmax), rho(NPmax), dwdx(NSD,NPNmax,NPmax), stat=err)
if(err/=0) then
print '(A)', ' DYNAMIC ALLOCATION ERROR '
end if
!$acc data copy(npnl,pair,mass,at,p,rho,dwdx)
do ii=1,TIMESTEPS
! ... other prallel constructs for async ...
!$acc parallel loop independent gang vector default(present) async(7)
do i=1,NPmax
call AAA
!call BBB
!call CCC
end do
! ... other parallel constructs for async ...
!$acc wait
end do
!$acc end data
end program
subroutine AAA
!$acc routine
use openacc
implicit none
integer :: i,j,k
real(8) :: sr,vr(3)
!$acc loop seq private(j,sr,vr)
do k=1,npnl(i)
j = pair(k,i)
sr = (p(i)+p(j))/(rho(i)*rho(j))
vr = sr*dwdx(:,k,i)
at(:,i) = at(:,i) + mass(j)*vr
end do
end subroutine
Thank you