Hi Matt,
I went ahead and wrote a small matmul program using MPI and accelerator regions. The main thing to note is that each process should set its own device number to avoid too much contention (though multiple processes can share a device):
#ifdef _ACCEL
numdevices = acc_get_num_devices(acc_device_nvidia)
mydevice = mod(myRank,numdevices)
call acc_set_device_num(mydevice,acc_device_nvidia)
#endif
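A caveat on the mod() mapping: it works cleanly when all the ranks sit on one node, but across several nodes it depends on how ranks are distributed. If your MPI library supports MPI-3's MPI_Comm_split_type, something like the sketch below (untested; nodeComm and nodeRank are just names I made up, declared as INTEGERs alongside mydevice and numdevices) would map each rank to a device by its node-local rank instead:
#ifdef _ACCEL
! sketch only: derive a node-local rank so the device mapping
! also holds up across multiple nodes (assumes an MPI-3 library)
call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, &
                         MPI_INFO_NULL, nodeComm, ierr)
call MPI_Comm_rank(nodeComm, nodeRank, ierr)
numdevices = acc_get_num_devices(acc_device_nvidia)
mydevice = mod(nodeRank, numdevices)
call acc_set_device_num(mydevice, acc_device_nvidia)
call MPI_Comm_free(nodeComm, ierr)
#endif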
Here’s the full source. Please let me know if you spot any bugs; I haven’t done MPI programming since grad school. ;-)
% cat mpi_mm_acc.f90
! Simple matmul example
module mymm
contains
! compute rows start:end of a = b * c on the accelerator
subroutine mm_acc( a, b, c, m, start, end )
implicit none
real, dimension(:,:), intent(out) :: a
real, dimension(:,:), intent(in) :: b, c
integer, intent(in) :: m, start, end
integer :: i, j, k
!$acc region copyin(b(:m,:m), c(:m,:m)), &
!$acc& copyout(a(start:end,:m))
do j = 1, m
do i = start,end
a(i,j) = 0.0
enddo
do i = start,end
do k = 1,m
a(i,j) = a(i,j) + b(i,k) * c(k,j)
enddo
enddo
enddo
!$acc end region
end subroutine
end module
PROGRAM mpi_matmul_acc
use mymm
use mpi
#ifdef _ACCEL
use accel_lib
#endif
IMPLICIT NONE
INTEGER :: myRank, ierr, status(MPI_STATUS_SIZE)
INTEGER :: count, dest, source, tag, size, icode
INTEGER :: nn, chunk, start, end, a_chunk_type
INTEGER :: i, j, n, mydevice, numdevices
REAL,allocatable :: a(:,:)
REAL,allocatable :: b(:,:), c(:,:)
! set-up MPI
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK( MPI_COMM_WORLD, myRank, ierr )
CALL MPI_COMM_SIZE( MPI_COMM_WORLD, size, ierr )
! have task 0 get the size of the array from the user
IF (myRank.EQ.0) THEN
print *, 'enter array size to run'
read *, nn
END IF
! send the size to the processes
call MPI_BCAST(nn,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
! set the number of rows each process should work on
! (assumes size divides nn evenly)
chunk = nn / size
#ifdef _ACCEL
! set the device for each process
numdevices = acc_get_num_devices(acc_device_nvidia)
mydevice = mod(myRank,numdevices)
call acc_set_device_num(mydevice,acc_device_nvidia)
#endif
! set up a datatype holding one row's worth of data (nn reals)
CALL MPI_TYPE_VECTOR(1, nn, nn, MPI_REAL, a_chunk_type, ierr)
CALL MPI_TYPE_COMMIT(a_chunk_type,ierr)
! allocate our arrays
allocate(a(nn,nn))
allocate(b(nn,nn))
allocate(c(nn,nn))
tag=99
a=0
! have task 0 initialize the b and c arrays with random numbers
IF (myRank.EQ.0) THEN
call RANDOM_NUMBER(b)
call RANDOM_NUMBER(c)
end if
! sync the b and c arrays
call MPI_BCAST(b,nn*nn,MPI_REAL,0,MPI_COMM_WORLD,ierr)
call MPI_BCAST(c,nn*nn,MPI_REAL,0,MPI_COMM_WORLD,ierr)
! determine the starting row for this process's block
start = (myRank*chunk)+1
! call the MM routine
call mm_acc( a, b, c, nn, start, start+chunk-1 )
! have process 0 gather the results from the other processes
IF (size .gt. 1 .and. myRank.EQ.0) THEN
do source=1,size-1
start = (source*chunk)+1
do i=start,start+chunk-1
CALL MPI_RECV(a(i,:), 1, a_chunk_type, source, tag, &
MPI_COMM_WORLD, status, ierr)
end do
end do
END IF
! have the other processes send their data to process 0
IF (size .gt. 1 .and. myRank.ne.0) THEN
dest = 0
do i=start,start+chunk-1
CALL MPI_SEND(a(i,:), 1, a_chunk_type, dest, tag, &
MPI_COMM_WORLD, ierr)
end do
END IF
! have task 0 print out a few of the values to make sure it got the results back
IF (myRank.EQ.0) THEN
do i=1,size
PRINT*, "TASK #", myRank, " ", a((i*chunk)-1,34)
end do
end if
! Clean-up
deallocate(a)
deallocate(b)
deallocate(c)
CALL MPI_FINALIZE(ierr)
END
% mpif90 -Mpreprocess -o mmacc.out mpi_mm_acc.f90 -fastsse -Minfo=accel -V9.0-3 -ta=nvidia
mm_acc:
12, Generating copyout(a(start:end,:m))
Generating copyin(c(:m,:m))
Generating copyin(b(:m,:m))
15, Loop is parallelizable
16, Loop is parallelizable
Accelerator kernel generated
15, !$acc do parallel, vector(16)
16, !$acc do parallel, vector(16)
20, Loop is parallelizable
21, Complex loop carried dependence of a prevents parallelization
Loop carried dependence of a prevents parallelization
Loop carried backward dependence of a prevents vectorization
Inner sequential loop scheduled on accelerator
Accelerator kernel generated
15, !$acc do parallel, vector(16)
20, !$acc do parallel, vector(16)
Using register for 'a'
21, !$acc do seq
Cached references to size [16x16] block of 'b'
Cached references to size [16x16] block of 'c'
% mpirun -np 4 mmacc.out
enter array size to run
1024
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
TASK # 0 264.2788
TASK # 0 267.8382
TASK # 0 268.5576
TASK # 0 263.5367
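One more thought: the spot-check at the end only prints four entries. For a modest nn, rank 0 could compare the gathered result against the matmul intrinsic right after the receive loop; here's a rough sketch of what I mean (not in the listing above):
! optional sanity check on rank 0, after the gather
! (only reasonable for small nn; expect tiny rounding differences
! between the GPU and host results)
IF (myRank.EQ.0) THEN
PRINT*, 'max abs difference vs. intrinsic matmul: ', &
maxval(abs(a - matmul(b, c)))
END IF
Since b and c are broadcast to every rank, rank 0 already has everything it needs for the comparison.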