Hello,
I try to implement window slicing in a Fortran application, in order to hide PCIe cost.
I have a four dimensional array, Q, and I compute an other four dimensional array F on a GPU using openACC.
I cut the array Q in eight parts. I send a slice of Q to the device, I compute a kernel on the device and I get back on the host the newly computed slice of F. I try to use the ACC_MAP_DATA but I may miss something.
Here is my example
PROGRAM PIPELINE
!
IMPLICIT NONE
!
INTEGER, PARAMETER :: rp = kind(1.0D0)
!
! Sizes of the problem
INTEGER, PARAMETER :: NX = 100
INTEGER, PARAMETER :: NY = 150
INTEGER, PARAMETER :: NZ = 128
!
REAL(rp), PARAMETER :: one = 1.0_rp
REAL(rp), PARAMETER :: zero = 0.0_rp
REAL(rp), PARAMETER :: half = 0.5_rp
!
! Number of slices and temporary indexes
INTEGER, PARAMETER :: nbslice_z = 8
INTEGER :: kMinz, kMaxz
!
! Global dimensions for computing loops
INTEGER :: Imin, Imax, Jmin, Jmax, Kmin, Kmax
!
! Needed for ghost cells of Q
INTEGER, PARAMETER :: i1add = 3
INTEGER, PARAMETER :: j1add = 3
INTEGER, PARAMETER :: k1add = 3
INTEGER, PARAMETER :: i2add = 3
INTEGER, PARAMETER :: j2add = 3
INTEGER, PARAMETER :: k2add = 3
!
! temporary scalar variables like lopp indexes
INTEGER :: i, j, k, L, ik, length
INTEGER :: icode
!
! Arrays of data
REAL(rp), DIMENSION(:,:,:,:), ALLOCATABLE, TARGET :: Q, F
REAL(rp), POINTER, CONTIGUOUS :: PQ(:,:,:,:)
REAL(rp), POINTER, CONTIGUOUS :: PF(:,:,:,:)
INTEGER, DIMENSION(:), ALLOCATABLE :: kBeg, kEnd
!
!!!
!
! Debut du programme
!
!!!
!
! Initialization for loop indexes
kMin = 1 ; kMax = Nz
jMin = 1 ; jMax = Ny
iMin = 1 ; iMax = Nx
WRITE (6,'(A,6I8)') 'iMin, jMin, kMin, iMax, jMax, kMax : ', iMin, jMin, kMin, iMax, jMax, kMax
!
! Dynamic Allocation (the final program will use ghost cells)
ALLOCATE (Q (5, Imin-i1add:Imax+i2add, Jmin-j1add:Jmax+j2add, Kmin-k1add:Kmax+k2add),
& F (5, Imin-i1add:Imax+i2add, Jmin-j1add:Jmax+j2add, Kmin-K1add:Kmax+k2add) )
WRITE (6,'(A,8I4)') 'Q : ', LBOUND(Q ), UBOUND(Q )
WRITE (6,'(A,8I4)') 'F : ', LBOUND(F ), UBOUND(F )
!
! Initialization of the data
DO k = Kmin-k1add, Kmax+k2add
DO j = Jmin - j1add, Jmax + j2add
DO i = Imin - i1add, Imax + i2add
Q(1,i,j,k) = one
Q(2,i,j,k) = one
Q(3,i,j,k) = one
Q(4,i,j,k) = one
Q(5,i,j,k) = one
!
F(1,i,j,k) = zero
F(2,i,j,k) = zero
F(3,i,j,k) = zero
F(4,i,j,k) = zero
F(5,i,j,k) = zero
END DO
END DO
END DO
!
! Output of Q
WRITE (6,'(A)') 'Part of array Q :'
DO L = 1, 5
WRITE (6,'(4I4,ES22.15)') L, L, L, L, Q(L,L,L,L)
END DO
CALL Flush (6)
!
! Filling of the slice cutting array
! example : 8 slices
ALLOCATE (kBeg(nbslice_z), kEnd(nbslice_z) )
DO ik = 0, nbslice_z-1
kBeg(1+ik) = 1 + (ik *Nz) / nbslice_z
kEnd(1+ik) = ( (ik+1)*Nz) / nbslice_z
END DO
!
! Output of the slicing
WRITE (6,'(A,I4)') 'nbslice_z = ', nbslice_z
DO k = 1, nbslice_z
WRITE (6,'(3(A,I4) )') 'slice ', k, ' : ', kBeg(k), ' to ', kEnd(k)
CALL Flush (6)
END DO
!
!!!
! Loop on the slices
!!!
!
DO ik = 1, nbslice_z
!
! Current slice
kminz = kBeg(ik)
kmaxz = kEnd(ik)
WRITE (6,'(A,3I4)') 'Beginning slice ', ik, kminz, kmaxz
CALL Flush (6)
!
PQ => Q(1:5,1-i1add:Nx+i2add, 1-j1add:Ny+j2add, kminz-k1add:kmaxz+k2add)
PF => F(1:5,1-i1add:Nx+i2add, 1-j1add:Ny+j2add, kminz:kmaxz)
WRITE (6,'(A,8I4)') 'Q : ', LBOUND(Q ), UBOUND(Q )
WRITE (6,'(A,8I4)') 'F : ', LBOUND(F ), UBOUND(F )
WRITE (6,'(A,8I4)') 'PQ : ', LBOUND(PQ), UBOUND(PQ)
WRITE (6,'(A,8I4)') 'PF : ', LBOUND(PF), UBOUND(PF)
CALL Flush (6)
!$ACC ENTER DATA CREATE (PF) COPYIN(PQ)
!
WRITE (6,'(A)') 'Association on the device'
CALL FLUSH(6)
!
! Association on the device
length = 8 * SIZE(Q(1:5,1-i1add:Nx+i2add, 1-j1add:Ny+j2add, kminz-k1add:kmaxz+k2add) )
WRITE (6,'(A,I9)') 'length(Q) : ', length
CALL FLUSH(6)
CALL ACC_MAP_DATA (Q(1,1-i1add, 1-j1add, kminz-k1add), PQ(1,1-i1add, 1-j1add, kminz-k1add), length)
!
length = 8 * SIZE(F(1:5,1-i1add:Nx+i2add, 1-j1add:Ny+j2add, kminz:kmaxz) )
WRITE (6,'(A,I9)') 'length(F) : ', length
CALL FLUSH(6)
CALL ACC_MAP_DATA (F(1,1-i1add, 1-j1add, kminz), PF(1,1-i1add, 1-j1add, kminz), length)
!
WRITE (6,'(A)') 'Computing on the device'
CALL FLUSH(6)
! Computing of F's slice
!$ACC PARALLEL LOOP COLLAPSE(4) PRIVATE (i,j,k,L) PRESENT(PF,PQ)
DO k = kminz, kmaxz
DO j = 1, Ny+j2add+j1add
DO i = 1, Nx+i2add+i1add
DO L = 1, 5
PF(L,i,j,k) = PQ(L,i,j,k) * half + one
END DO
END DO
END DO
END DO
!$ACC END PARALLEL LOOP
!
! Device To Host of PF
!$ACC UPDATE SELF (PF)
!
! End of association
CALL ACC_UNMAP_DATA (Q)
CALL ACC_UNMAP_DATA (F)
!
!$ACC EXIT DATA
!
WRITE (6,'(A,3I4)') 'End of slice ', ik, kminz, kmaxz
CALL Flush (6)
END DO
!
!!!
!
! Output of F
WRITE (6,'(A)') 'Output of F :'
DO L = 1, 5
WRITE (6,'(4I4,2ES22.15)') L, L, L, L, F(L,L,L,L)
END DO
CALL Flush (6)
!
! Deallocation
DEALLOCATE (Q, F)
!
!!!
!
!
!!!
!
STOP
END PROGRAM PIPELINE
I compile this example with
make
pgf90 -c -Minfo=all -O0 -g -traceback -Mbounds -Mfixed -Mextend ./src/main.F90 -o ./obj_O0/main.o
pipeline:
144, Conflict or overlap between pf and pq
pgf90 -O0 ./obj_O0/main.o -o ./run/a_acc.out
When I run the code the get the following output :
./run/a_acc.out
iMin, jMin, kMin, iMax, jMax, kMax : 1 1 1 100 150 128
Q : 1 -2 -2 -2 5 103 153 131
F : 1 -2 -2 -2 5 103 153 131
Part of array Q :
1 1 1 1 1.000000000000000E+00
2 2 2 2 1.000000000000000E+00
3 3 3 3 1.000000000000000E+00
4 4 4 4 1.000000000000000E+00
5 5 5 5 1.000000000000000E+00
nbslice_z = 8
slice 1 : 1 to 16
slice 2 : 17 to 32
slice 3 : 33 to 48
slice 4 : 49 to 64
slice 5 : 65 to 80
slice 6 : 81 to 96
slice 7 : 97 to 112
slice 8 : 113 to 128
Beginning slice 1 1 16
Q : 1 -2 -2 -2 5 103 153 131
F : 1 -2 -2 -2 5 103 153 131
PQ : 1 1 1 1 5 106 156 22
PF : 1 1 1 1 5 106 156 16
Association on the device
length(Q) : 14551680
0: Subscript out of range for array pq (./src/main.F90: 128)
subscript=-2, lower bound=1, upper bound=106, dimension=2
If I look at the values of LBOUND and UBOUND for PF and PQ, I do not get those of F and Q. How can I preserve them ? I’d like to have the same indexes on the host and and the device.
Is the use ACC_MAP_DATA correct ?
The next step will be the use of multiple queues and asynchronous execution.
Thank you for your comments.
Regards,
Guy.