Hi, Mat,

I am back on this project.

If I declare the arrays as private before entering the collapsed loops with the routine call, and declare them as arguments to the subroutine, the code runs, but only some of these private arrays are used correctly, the other ones contain data messed up.

Here is the main program (nvect=128 is a parameter in the module):

!

!$acc kernels present(domain)

!

c *** Main compute routine inside 2 nested OMP loops

c *** *****************************************************************

C

!$OMP PARALLEL DO DEFAULT(PRIVATE)

!$OMP1 SCHEDULE(STATIC,1)

!$OMP1 SHARED(domain,nparts,nsub,

!$OMP2 istep,isweep,istage,icomput)

!$acc loop gang collapse(2)

!$acc1 private(rtbl(nvect),utbl(nvect),vtbl(nvect),wtbl(nvect),

!$acc2 etbl(nvect),sabl(nvect),eibl(nvect),ekbl(nvect),

!$acc3 htbl(nvect),prbl(nvect),

!$acc4 cph3g(nvect),cph1g(nvect),cph3d(nvect),cph1d(nvect),

!$acc5 cgpx(nvect), cgpy(nvect), cgpz(nvect), cddd(nvect) )

```
do ipart=1,nparts
```

!$OMP PARALLEL DO DEFAULT(PRIVATE)

!$OMP1 SCHEDULE(STATIC,1)

!$OMP1 SHARED(domain,ipart,nparts,nsub,

!$OMP2 istep,isweep,istage,icomput)

```
do isub=1,nsub
call Nxo_compute(istep,isweep,istage,icomput,
1 ipart,isub,nparts,nsub,
2 domain,domain(ipart)%subdomain(isub),
3 domain(ipart)%subdomain(isub)%problem,
4 rtbl,utbl,vtbl,wtbl,etbl,sabl,eibl,ekbl,htbl,prbl,
5 cph3g,cph1g,cph3d,cph1d,cgpx,cgpy,cgpz,cddd )
enddo
```

!$OMP END PARALLEL DO

enddo

!$OMP END PARALLEL DO

c

!$acc end kernels

The subroutine header is :

c ******************************************************************

subroutine Nxo_compute(istep,isweep,istage,icomput,

1 ipart,isub,nparts,nsub,

2 domain,subdomain,problem,

4 rtbl,utbl,vtbl,wtbl,etbl,sabl,eibl,ekbl,htbl,prbl,

5 cph3g,cph1g,cph3d,cph1d,cgpx,cgpy,cgpz,cddd )

c

!$acc routine vector

c

USE Nxo_commun

USE omp_lib

#ifdef _OPENACC

use openacc

use cudafor

c use wmma

#endif

use ieee_arithmetic

c

integer istep,isweep,istage,icomput,ipart,nparts,nsub

type(domain_type) domain(nparts)

type(subdomain_type) subdomain

type(problem_type) problem

c

type(vf_type) blob_vf_rcv

type(coeffs_type) coeffs

```
real rtbl(nvect),utbl(nvect),vtbl(nvect),wtbl(nvect)
real etbl(nvect),sabl(nvect)
real eibl(nvect),ekbl(nvect),htbl(nvect),prbl(nvect)
```

c

real*4 cph3g(nvect),cph1g(nvect),cph3d(nvect),cph1d(nvect)

real*4 cgpx(nvect), cgpy(nvect), cgpz(nvect), cddd(nvect)

c

c *** Sub-domain Solver

c

c *** data for the sequential loop (not vectorized, how to place it in shared memory ?)

c

sa_cb1 = 0.1355

sa_sigma = 2./3.

…

…Here is how the private arrays are used

…

c *** Gather the coefficients of the spatial scheme

c

#ifdef _OPENACC

c *** Coalesced data access on GPU

!$acc loop vector

do iste=1,nelint

icoel = icoel_strt +iste

cph3g(iste) = subdomain%coe_gpu(icoel,1)

cph1g(iste) = subdomain%coe_gpu(icoel,2)

cph3d(iste) = subdomain%coe_gpu(icoel,3)

cph1d(iste) = subdomain%coe_gpu(icoel,4)

cgpx(iste) = subdomain%coe_gpu(icoel,5)

cgpy(iste) = subdomain%coe_gpu(icoel,6)

cgpz(iste) = subdomain%coe_gpu(icoel,7)

cddd(iste) = subdomain%coe_gpu(icoel,8)

enddo

#else

do iste=1,nelint

icoel = icoel_strt +iste

coeffs = subdomain%coeint(icoel)

cph3g(iste) = coeffs%cph3g

cph1g(iste) = coeffs%cph1g

cph3d(iste) = coeffs%cph3d

cph1d(iste) = coeffs%cph1d

cgpx(iste) = coeffs%cgpx

cgpy(iste) = coeffs%cgpy

cgpz(iste) = coeffs%cgpz

cddd(iste) = coeffs%cddd

enddo

#endif

c

c *** Gather the conservative variable fields in the stencil FVs

c

!$acc loop vector

do iste=1,nelint

icoel = icoel_strt +iste

c

iblob = subdomain%intcel(icoel)

if(iblob.gt.0 .and. iblob.le.nblob_vf) then

blob_vf_rcv = subdomain%blob_vf(iblob)

else if(iblob.lt.0) then

index_snd = -iblob

iprt_snd = mod(index_snd,nparts)

index_snd = (index_snd-iprt_snd) / nparts

isub_snd = mod(index_snd,nsub)

nod_snd = (index_snd-isub_snd) / nsub

iprt_snd = iprt_snd+1

isub_snd = isub_snd+1

blob_vf_rcv = domain(iprt_snd)%subdomain(isub_snd)

1 %blob_vf(nod_snd)

endif

rtbl(iste) = blob_vf_rcv%ro

utbl(iste) = blob_vf_rcv%u

vtbl(iste) = blob_vf_rcv%v

wtbl(iste) = blob_vf_rcv%w

etbl(iste) = blob_vf_rcv%eto

sabl(iste) = blob_vf_rcv%samu

if(iste.eq.1) then

vol_lft = blob_vf_rcv%vol

disw_lft = blob_vf_rcv%disw

ichunk_lft = subdomain%mtlev(ivf_lft)

nbgh_lft = mod(ichunk_lft,100)

endif

if(iste.eq.2) then

vol_rgh = blob_vf_rcv%vol

disw_rgh = blob_vf_rcv%disw

if(iblob.gt.0) then

ichunk_rgh = subdomain%mtlev(ivf_rgh)

else

ichunk_rgh = domain(iprt_snd)%subdomain(isub_snd)

1 %mtlev(nod_snd)

endif

nbgh_rgh = mod(ichunk_rgh,100)

endif

c

enddo

!

These private arrays (vectors) should be vectorized for the coalesced data transfer (1st phase, gather the coefficients of the spatial scheme).

The second phase cannot use coalesced data transfer, since there is an indirection (unstructured grid in this subdomain and/or access to data in other subdomains).

Is adding the private vectors to the argument list correct? Some prints in the parallel routine show that the first phase is correct.

For the second phase, however, all of the first 32 values in vector rtbl, for example, are identical; the same holds for values 33 to 64, and so on.

Thanks for your help,

Jean-Marie