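There are a couple of ways to do this. First, the “collapse(2) gang vector” clauses, when used with the “kernels loop” construct, will merge the “k” and “j” loops into a 2-D gang and 2-D vector schedule. You would then add a “loop seq” directive on each of the “i” loops to force them to be run sequentially. Alternatively, with the “parallel” construct you can put “loop gang” on the “k” loop and “loop vector” on the “j” loop, again marking each “i” loop with “loop seq”. Both approaches are shown in the example below.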
% cat test3.f90
!> Small OpenACC scheduling example.
!!
!! Shows two ways of mapping a triply nested loop onto an accelerator
!! while keeping the innermost "i" loops sequential:
!!   1. a "kernels loop" construct with collapse(2) gang vector on k and j;
!!   2. a "parallel loop" construct with gang on k and vector on j.
program testbla
  use openacc
  implicit none

  integer :: i, j, k, n
  real, allocatable, dimension(:,:,:) :: a, b, c

  n = 10
  allocate(a(n,n,n), b(n,n,n), c(n,n,n))

  ! Approach 1: collapse the k and j loops into one gang/vector schedule
  ! and force the i loop to run sequentially.
  !$acc kernels loop collapse(2) gang vector
  do k = 1, n
    do j = 1, n
      !$acc loop seq
      do i = 1, n
        a(i,j,k) = 0.0
        b(i,j,k) = 1.0
      end do
    end do
  end do

  ! Approach 2: gang on the k loop, vector on the j loop, and every
  ! i loop nested inside them marked sequential.
  !$acc parallel loop gang
  do k = 1, n
    !$acc loop vector
    do j = 1, n
      !$acc loop seq
      do i = 1, n
        c(i,j,k) = a(i,j,k) + b(i,j,k)
      end do
      ! NOTE(review): this second i loop repeats the assignment above;
      ! presumably it stands in for a distinct inner loop so the example
      ! shows "loop seq" applied to more than one i loop - confirm.
      !$acc loop seq
      do i = 1, n
        c(i,j,k) = a(i,j,k) + b(i,j,k)
      end do
    end do
  end do

  ! The result must be printed; otherwise dead-code elimination
  ! would remove the loops above.
  print *, c(1,1,1), c(n,n,n)

  deallocate(a, b, c)
end program testbla
% pgf90 -acc -Minfo=accel test3.f90 -V13.5
testbla:
10, Generating present_or_copyout(b(1:10,1:10,1:10))
Generating present_or_copyout(a(1:10,1:10,1:10))
Generating NVIDIA code
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
Generating compute capability 3.0 binary
11, Loop is parallelizable
12, Loop is parallelizable
14, Loop is parallelizable
Accelerator kernel generated
11, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
12, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
21, Accelerator kernel generated
22, !$acc loop gang ! blockidx%x
24, !$acc loop vector(256) ! threadidx%x
21, Generating present_or_copyin(b(1:10,1:10,1:10))
Generating present_or_copyin(a(1:10,1:10,1:10))
Generating present_or_copyout(c(1:10,1:10,1:10))
Generating NVIDIA code
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
Generating compute capability 3.0 binary
24, Loop is parallelizable
26, Loop is parallelizable
30, Loop is parallelizable