Mat,

h and h_edge are pointers:

real (kind=RKIND), dimension(:,:), pointer :: h_edge, h

integer, dimension(:,:), pointer :: cellsOnEdge

Since this is not my code and it is part of a big model, I won’t be able to switch the order of the dimension indices.

I tried the following code:

4027 !

4028 !$acc data copyin(cellsOnEdge,h), copyout(h_edge)

4029 !$acc kernels

4030 !$acc loop gang worker(4) independent, &

4031 !$acc private(cell1, cell2)

4032 do iEdge=1,nEdges

4033 cell1 = cellsOnEdge(1,iEdge)

4034 cell2 = cellsOnEdge(2,iEdge)

4035 !$acc loop vector(64) independent

4036 do k=1,nVertLevels

4037 h_edge(k,iEdge) = 0.5 * (h(k,cell1) + h(k,cell2))

4038 end do

4039 end do

4040 !$acc end kernels

4041 !$acc end data

4042

Now, it compiled:

4028, Generating copyin(cellsonedge(:,:),h(:,:))

Generating copyout(h_edge(:,:))

4032, Loop is parallelizable

Accelerator kernel generated

Generating Tesla code

4032, !$acc loop gang, worker(4) ! blockidx%x threadidx%y

4036, !$acc loop vector(64) ! threadidx%x

Loop is parallelizable

Here are the run-time messages:

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_init_coupled_diagnostics line=4452 device=0 threadid=1 num_gangs=4660 num_workers=1 vector_length=128 grid=4660 block=128

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_init_coupled_diagnostics line=4461 device=0 threadid=1 num_gangs=4660 num_workers=1 vector_length=128 grid=4660 block=128

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_init_coupled_diagnostics line=4470 device=0 threadid=1 num_gangs=4660 num_workers=1 vector_length=128 grid=4660 block=128

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_init_coupled_diagnostics line=4480 device=0 threadid=1 num_gangs=4660 num_workers=1 vector_length=128 grid=4660 block=128

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_init_coupled_diagnostics line=4489 device=0 threadid=1 num_gangs=4660 num_workers=1 vector_length=128 grid=4660 block=128

launch CUDA kernel file=/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F function=atm_compute_solve_diagnostics line=4032 device=0 threadid=1 num_gangs=11021 num_workers=4 vector_length=64 grid=11021 block=64x4 shared memory=64

Accelerator Kernel Timing data

/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F

atm_compute_solve_diagnostics NVIDIA devicenum=0

time(us): 4,123

4028: data region reached 1 time

4028: data copyin transfers: 5

device time(us): total=4,123 max=2,328 min=15 avg=824

4029: compute region reached 1 time

4032: kernel launched 1 time

grid: [11021] block: [64x4]

device time(us): total=0 max=0 min=0 avg=0

/home/whuang/pgi/srcs/MPAS-Release-4.0/src/core_atmosphere/dynamics/mpas_atm_time_integration.F

atm_init_coupled_diagnostics NVIDIA devicenum=0

time(us): 19,726

4447: data region reached 1 time

4447: data copyin transfers: 19

device time(us): total=11,301 max=1,699 min=7 avg=594

4449: compute region reached 1 time

4452: kernel launched 1 time

grid: [4660] block: [128]

elapsed time(us): total=577 max=577 min=577 avg=577

4458: compute region reached 1 time

4461: kernel launched 1 time

grid: [4660] block: [128]

elapsed time(us): total=375 max=375 min=375 avg=375

4467: compute region reached 1 time

4470: kernel launched 1 time

grid: [4660] block: [128]

elapsed time(us): total=2,029 max=2,029 min=2,029 avg=2,029

4477: compute region reached 1 time

4480: kernel launched 1 time

grid: [4660] block: [128]

elapsed time(us): total=4,620 max=4,620 min=4,620 avg=4,620

4486: compute region reached 1 time

4489: kernel launched 1 time

grid: [4660] block: [128]

elapsed time(us): total=719 max=719 min=719 avg=719

4496: data region reached 1 time

4496: data copyout transfers: 5

device time(us): total=8,425 max=1,774 min=1,443 avg=1,685

call to cuMemFreeHost returned error 700: Illegal address during kernel execution

Wei