Hi, Mat,
If I declare the arrays as private before entering the collapsed loops with the routine call, and declare them as arguments to the subroutine, the code runs, but only some of these private arrays are used correctly, the other ones contain data messed up.
Here is the main program (nvect=128 is a parameter in the module):
!
!$acc kernels present(domain)
!
c *** Main compute routine inside 2 nested OMP loops
c *** *****************************************************************
C
!$OMP PARALLEL DO DEFAULT(PRIVATE)
!$OMP1 SCHEDULE(STATIC,1)
!$OMP1 SHARED(domain,nparts,nsub,
!$OMP2 istep,isweep,istage,icomput)
!$acc loop gang collapse(2)
!$acc1 private(rtbl(nvect),utbl(nvect),vtbl(nvect),wtbl(nvect),
!$acc2 etbl(nvect),sabl(nvect),eibl(nvect),ekbl(nvect),
!$acc3 htbl(nvect),prbl(nvect),
!$acc4 cph3g(nvect),cph1g(nvect),cph3d(nvect),cph1d(nvect),
!$acc5 cgpx(nvect), cgpy(nvect), cgpz(nvect), cddd(nvect) )
do ipart=1,nparts
!$OMP PARALLEL DO DEFAULT(PRIVATE)
!$OMP1 SCHEDULE(STATIC,1)
!$OMP1 SHARED(domain,ipart,nparts,nsub,
!$OMP2 istep,isweep,istage,icomput)
do isub=1,nsub
call Nxo_compute(istep,isweep,istage,icomput,
1 ipart,isub,nparts,nsub,
2 domain,domain(ipart)%subdomain(isub),
3 domain(ipart)%subdomain(isub)%problem,
4 rtbl,utbl,vtbl,wtbl,etbl,sabl,eibl,ekbl,htbl,prbl,
5 cph3g,cph1g,cph3d,cph1d,cgpx,cgpy,cgpz,cddd )
enddo
!$OMP END PARALLEL DO
enddo
!$OMP END PARALLEL DO
c
!$acc end kernels
The subroutine header is:
c ******************************************************************
subroutine Nxo_compute(istep,isweep,istage,icomput,
1 ipart,isub,nparts,nsub,
2 domain,subdomain,problem,
4 rtbl,utbl,vtbl,wtbl,etbl,sabl,eibl,ekbl,htbl,prbl,
5 cph3g,cph1g,cph3d,cph1d,cgpx,cgpy,cgpz,cddd )
c
!$acc routine vector
c
USE Nxo_commun
USE omp_lib
#ifdef _OPENACC
use openacc
use cudafor
c use wmma
#endif
use ieee_arithmetic
c
integer istep,isweep,istage,icomput,ipart,nparts,nsub
type(domain_type) domain(nparts)
type(subdomain_type) subdomain
type(problem_type) problem
c
type(vf_type) blob_vf_rcv
type(coeffs_type) coeffs
real rtbl(nvect),utbl(nvect),vtbl(nvect),wtbl(nvect)
real etbl(nvect),sabl(nvect)
real eibl(nvect),ekbl(nvect),htbl(nvect),prbl(nvect)
c
real*4 cph3g(nvect),cph1g(nvect),cph3d(nvect),cph1d(nvect)
real*4 cgpx(nvect), cgpy(nvect), cgpz(nvect), cddd(nvect)
c
c *** Sub-domain Solver
c
c *** data for the sequential loop (not vectorized, how to place it in shared memory ?)
c
sa_cb1 = 0.1355
sa_sigma = 2./3.
…
Here is how the private arrays are used:
c *** Gather the coefficients of the spatial scheme
c
#ifdef _OPENACC
c *** Coalesced data access on GPU
!$acc loop vector
do iste=1,nelint
icoel = icoel_strt +iste
cph3g(iste) = subdomain%coe_gpu(icoel,1)
cph1g(iste) = subdomain%coe_gpu(icoel,2)
cph3d(iste) = subdomain%coe_gpu(icoel,3)
cph1d(iste) = subdomain%coe_gpu(icoel,4)
cgpx(iste) = subdomain%coe_gpu(icoel,5)
cgpy(iste) = subdomain%coe_gpu(icoel,6)
cgpz(iste) = subdomain%coe_gpu(icoel,7)
cddd(iste) = subdomain%coe_gpu(icoel,8)
enddo
#else
do iste=1,nelint
icoel = icoel_strt +iste
coeffs = subdomain%coeint(icoel)
cph3g(iste) = coeffs%cph3g
cph1g(iste) = coeffs%cph1g
cph3d(iste) = coeffs%cph3d
cph1d(iste) = coeffs%cph1d
cgpx(iste) = coeffs%cgpx
cgpy(iste) = coeffs%cgpy
cgpz(iste) = coeffs%cgpz
cddd(iste) = coeffs%cddd
enddo
#endif
c
c *** Gather the conservative variable fields in the stencil FVs
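c *** (indirect, non-coalesced access through subdomain%intcel)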
c
!$acc loop vector
do iste=1,nelint
icoel = icoel_strt +iste
c
iblob = subdomain%intcel(icoel)
if(iblob.gt.0 .and. iblob.le.nblob_vf) then
blob_vf_rcv = subdomain%blob_vf(iblob)
else if(iblob.lt.0) then
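c *** negative iblob: the FV belongs to another sub-domain; decode the
c *** packed index into (iprt_snd, isub_snd, nod_snd)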
index_snd = -iblob
iprt_snd = mod(index_snd,nparts)
index_snd = (index_snd-iprt_snd) / nparts
isub_snd = mod(index_snd,nsub)
nod_snd = (index_snd-isub_snd) / nsub
iprt_snd = iprt_snd+1
isub_snd = isub_snd+1
blob_vf_rcv = domain(iprt_snd)%subdomain(isub_snd)
1 %blob_vf(nod_snd)
endif
rtbl(iste) = blob_vf_rcv%ro
utbl(iste) = blob_vf_rcv%u
vtbl(iste) = blob_vf_rcv%v
wtbl(iste) = blob_vf_rcv%w
etbl(iste) = blob_vf_rcv%eto
sabl(iste) = blob_vf_rcv%samu
if(iste.eq.1) then
vol_lft = blob_vf_rcv%vol
disw_lft = blob_vf_rcv%disw
ichunk_lft = subdomain%mtlev(ivf_lft)
nbgh_lft = mod(ichunk_lft,100)
endif
if(iste.eq.2) then
vol_rgh = blob_vf_rcv%vol
disw_rgh = blob_vf_rcv%disw
if(iblob.gt.0) then
ichunk_rgh = subdomain%mtlev(ivf_rgh)
else
ichunk_rgh = domain(iprt_snd)%subdomain(isub_snd)
1 %mtlev(nod_snd)
endif
nbgh_rgh = mod(ichunk_rgh,100)
endif
c
enddo
!
These private arrays (vectors) should be filled by vector loops so that the data transfers are coalesced (first phase: gathering the coefficients of the spatial scheme).
The second phase cannot use coalesced data transfers because of an indirection (unstructured grid in this sub-domain and/or access to data in other sub-domains).
Is adding the private vectors to the argument list correct? Some prints in the parallel routine show that the first phase is correct.
For the second phase, however, the first 32 values of vector rtbl, for example, are all identical, then values 33 to 64, and so on.
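For comparison, here is roughly what the variant without the extra arguments would look like (just a sketch of the declarations; the work arrays become local to the routine, nvect being the parameter from the module, and the body is unchanged):
c ******************************************************************
subroutine Nxo_compute(istep,isweep,istage,icomput,
1 ipart,isub,nparts,nsub,
2 domain,subdomain,problem)
c
!$acc routine vector
c
USE Nxo_commun
c
integer istep,isweep,istage,icomput,ipart,isub,nparts,nsub
type(domain_type) domain(nparts)
type(subdomain_type) subdomain
type(problem_type) problem
c
c *** work arrays declared locally (fixed size nvect from the module)
c *** instead of being received as private dummy arguments
real rtbl(nvect),utbl(nvect),vtbl(nvect),wtbl(nvect)
real etbl(nvect),sabl(nvect)
real eibl(nvect),ekbl(nvect),htbl(nvect),prbl(nvect)
real*4 cph3g(nvect),cph1g(nvect),cph3d(nvect),cph1d(nvect)
real*4 cgpx(nvect), cgpy(nvect), cgpz(nvect), cddd(nvect)
c
c *** ... same body as above ...
c
end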
Thanks for your help,
Jean-Marie