I am trying to run multiple loops in parallel. I used the async clause on the compute directives, but the kernels are not launching asynchronously. I confirmed this by looking at the generated nsys profile, and I am also attaching the .qdrep file for reference. I cannot figure out what mistake I am making. Did I put the async clause in the correct location?
Those four segments should all have started at once.
Code:
!> Driver program: allocates the fields, then runs a fixed number of
!> iterations. Each iteration launches the asynchronous kernels in
!> Compute_Prims and Compute_Conservs and synchronises once with
!> `!$acc wait` before advancing `time`.
!>
!> Timing uses system_clock (elapsed wall-clock time) rather than cpu_time
!> (processor time), so that the printed value matches its
!> "wall clock time" label.
!>
!> NOTE(review): `iter` and `time` are not initialised in this program unit;
!> presumably module declare_variables initialises them - confirm.
program Test
  use declare_variables
  implicit none

  ! Integer kind wide enough for high-resolution clock counts.
  integer, parameter :: clock_kind = selected_int_kind(18)
  integer(clock_kind) :: clock_start, clock_now, clock_rate

  ! Reference point for all elapsed-time measurements; start_time is kept at
  ! zero so that end_time - start_time is the elapsed time in seconds.
  call system_clock(clock_start, clock_rate)
  start_time = 0

  call ALLOCATE_VARIABLES()

  ! ------------------------------
  ! do while (time < t_end)
  do while (iter < 3)
    ! Both routines only enqueue work on async queues 1-4 and return.
    call Compute_Prims()
    call Compute_Conservs()

    ! Block the host until every async queue has drained.
    !$acc wait

    time = time + time_step
    iter = iter + 1

    call system_clock(clock_now)
    end_time = real(clock_now - clock_start, kind(end_time)) &
             / real(clock_rate, kind(end_time))
    print*, time, end_time-start_time
  end do
  ! ------------------------------

  call system_clock(clock_now)
  end_time = real(clock_now - clock_start, kind(end_time)) &
           / real(clock_rate, kind(end_time))
  print*, 'Total wall clock time taken = ', end_time-start_time, 'secs'
end program Test
!> Sets the run parameters, allocates the per-block extent arrays and the
!> field arrays on the host, and then creates the matching device data.
!> All variables assigned here come from module declare_variables.
subroutine ALLOCATE_VARIABLES()
  use declare_variables
  implicit none

  ! Scalar run parameters.
  nblocks   = 1
  nprims    = 5
  nconserv  = 5
  time_Step = 0.001d0
  t_end     = 0.5d0

  ! Per-block grid extents; every block is 64 x 64 x 64.
  allocate(NI(nblocks), NJ(nblocks), NK(nblocks))
  NI = 64
  NJ = 64
  NK = 64

  ! Field arrays are sized by the largest extent over all blocks.
  NImax = maxval(NI)
  NJmax = maxval(NJ)
  NKmax = maxval(NK)

  ! Fields indexed as (i, j, k, block, component).
  allocate(Px(NImax,NJmax,NKmax,nblocks,nprims), &
           Py(NImax,NJmax,NKmax,nblocks,nprims), &
           Pz(NImax,NJmax,NKmax,nblocks,nprims))
  allocate(Cx(NImax,NJmax,NKmax,nblocks,nconserv), &
           Cy(NImax,NJmax,NKmax,nblocks,nconserv), &
           Cz(NImax,NJmax,NKmax,nblocks,nconserv))

  ! Extents are copied to the device; the fields are only created there
  ! (their host contents are not transferred).
  !$acc enter data copyin(NI,NJ,NK), create(Px,Py,Pz,Cx,Cy,Cz)
end subroutine ALLOCATE_VARIABLES
!> Fills the five components of Px with three OpenACC kernels, each placed
!> on its own async queue (1, 2 and 3).
!>
!> The kernels write disjoint slices of the last index of Px (components
!> 1-3, 4 and 5 respectively), so they do not depend on one another.
!> This routine does not wait on any queue; the caller must synchronise
!> (e.g. `!$acc wait`) before using Px.
!>
!> The loop indices nbl, k, j, i are not declared here, so under
!> `implicit none` they come from module declare_variables.
!>
!> NOTE(review): `async` only permits the kernels to overlap; whether they
!> actually execute concurrently is up to the runtime and the device.
subroutine Compute_Prims()
  use declare_variables
  implicit none

  ! Queue 1: components 1-3.
  !$acc parallel loop gang vector collapse(4) async(1)
  do nbl = 1, nblocks
    do k = 1, NKmax
      do j = 1, NJmax
        do i = 1, NImax
          ! Loops run to the maximum extents; skip points outside this block.
          if (k <= NK(nbl) .and. j <= NJ(nbl) .and. i <= NI(nbl)) then
            Px(i,j,k,nbl,1) = i*j
            Px(i,j,k,nbl,2) = j*k
            Px(i,j,k,nbl,3) = k*i
          end if
        end do
      end do
    end do
  end do

  ! Queue 2: component 4.
  !$acc parallel loop gang vector collapse(4) async(2)
  do nbl = 1, nblocks
    do k = 1, NKmax
      do j = 1, NJmax
        do i = 1, NImax
          if (k <= NK(nbl) .and. j <= NJ(nbl) .and. i <= NI(nbl)) then
            Px(i,j,k,nbl,4) = i*i
          end if
        end do
      end do
    end do
  end do

  ! Queue 3: component 5.
  !$acc parallel loop gang vector collapse(4) async(3)
  do nbl = 1, nblocks
    do k = 1, NKmax
      do j = 1, NJmax
        do i = 1, NImax
          if (k <= NK(nbl) .and. j <= NJ(nbl) .and. i <= NI(nbl)) then
            Px(i,j,k,nbl,5) = j*j
          end if
        end do
      end do
    end do
  end do
end subroutine Compute_Prims
!> Fills all five components of Cx in a single OpenACC kernel placed on
!> async queue 4.
!>
!> This routine does not wait on the queue; the caller must synchronise
!> (e.g. `!$acc wait`) before using Cx.
!>
!> The loop indices nbl, k, j, i are not declared here, so under
!> `implicit none` they come from module declare_variables.
subroutine Compute_Conservs()
  use declare_variables
  implicit none

  !$acc parallel loop gang vector collapse(4) async(4)
  do nbl = 1, nblocks
    do k = 1, NKmax
      do j = 1, NJmax
        do i = 1, NImax
          ! Loops run to the maximum extents; skip points outside this block.
          if (k <= NK(nbl) .and. j <= NJ(nbl) .and. i <= NI(nbl)) then
            Cx(i,j,k,nbl,1) = i*j
            Cx(i,j,k,nbl,2) = j*k
            Cx(i,j,k,nbl,3) = k*i
            Cx(i,j,k,nbl,4) = i*i
            Cx(i,j,k,nbl,5) = j*j
          end if
        end do
      end do
    end do
  end do
end subroutine Compute_Conservs
I am also attaching the full code (both test_Async.f90 and Module.f90 should be run). Thanks.
Module.f90 (745 Bytes)
test_async.f90 (2.8 KB)
Makefile (514 Bytes)
test_async1.qdrep (382.2 KB)