Hi Mat,
here is a simplified test case that performs a vector addition:
main.F90
program vector_add_test
#ifdef _ACCEL
  use accel_lib
#endif
  use mpi
  use storage
  implicit none

  call mpi_init(ierr)
  call mpi_comm_rank(mpi_comm_world,n_rank,ierr)
  call mpi_comm_size(mpi_comm_world,n_proc,ierr)

  GPU = .false.
#ifdef _ACCEL
  print*,'NEW n_rank,NCOREXNODE: ',n_rank,NCOREXNODE
  if (mod(n_rank,NCOREXNODE) .lt. NGPUXNODE) then
     call acc_set_device(acc_device_nvidia)
     device_kind = acc_get_device()
     print*,'Selected device kind: ',device_kind
     num_devices = acc_get_num_devices(acc_device_nvidia)
     print*,'Number of devices available: ',num_devices
     call acc_set_device_num(mod(n_rank,NCOREXNODE),acc_device_nvidia)
     print*,'n_rank: ',n_rank,' tries to set GPU: ',mod(n_rank,NCOREXNODE)
     my_device = acc_get_device_num(acc_device_nvidia)
     print*,'n_rank: ',n_rank,' is using device: ',my_device
     print*,'Set GPU to true for rank: ',n_rank
     GPU = .true.
  else
     call acc_set_device(0)
  endif
#endif

  call MPI_BARRIER(MPI_COMM_WORLD,ierr)
  print*,'ciao'

  if (GPU) then ; n = n_long ; else ; n = n_short ; endif
  call allocate_storage()
  !allocate(a(n),b(n),c(n))
  call random_number(a) ; call random_number(b)

  if (GPU) then
     print*,'updating device'
     !$acc update device(a,b)
  endif

  call vector_add()

  if (GPU) then
     print*,'updating host'
     !$acc update host(c)
  endif

  print*,'GPU?: ',GPU,' c(5): ',c(5)
  call mpi_finalize(ierr)
end program vector_add_test
allocate_storage.f90
subroutine allocate_storage()
  use storage
  implicit none
  integer :: i
  allocate(a(n),b(n),c(n))
end subroutine allocate_storage
storage.f90
module storage
  use accel_lib
  integer, parameter :: myk = kind(1.d0)
  real(myk), allocatable, dimension(:) :: a,b,c
  integer, parameter :: NCOREXNODE=12
  integer, parameter :: NGPUXNODE=2
  integer :: num_devices
  integer(acc_device_kind) :: my_device,device_kind
  integer :: n_long=1000,n_short=150,n
  integer :: ierr,n_rank,n_proc
  logical :: GPU
  !$acc mirror(a,b,c)
end module storage
vector_add.f90
subroutine vector_add()
  use storage
  implicit none
  integer :: i
  !$acc region
  do i=1,n
     c(i) = a(i) + b(i)
  enddo
  !$acc end region
end subroutine vector_add
I compile using:
mpif90 -ta=nvidia,cc20,cuda4.0,host storage.f90 allocate_storage.f90 main.F90 vector_add.f90
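and run it with a standard MPI launch, for example (12 ranks to fill one 12-core node; the exact launcher syntax depends on the MPI installation):
mpirun -np 12 ./a.out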
Every MPI process should select its device type (host or nvidia) and, if it gets a GPU, also the device number. The test is currently set up for a node with 12 cores and 2 GPUs: the first 2 ranks on each node use the GPUs, while the other 10 run on the host CPUs.
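To make the intended mapping explicit, here is a small standalone sketch (not part of the test case) that just prints the rank-to-device assignment I expect, assuming the same NCOREXNODE=12 and NGPUXNODE=2 as in module storage:

program rank_mapping_sketch
  ! sketch only: prints which ranks should get a GPU and which device number
  implicit none
  integer, parameter :: NCOREXNODE = 12, NGPUXNODE = 2
  integer :: n_rank
  do n_rank = 0, 2*NCOREXNODE-1   ! e.g. two 12-core nodes
     if (mod(n_rank,NCOREXNODE) .lt. NGPUXNODE) then
        print*,'rank ',n_rank,' -> nvidia device ',mod(n_rank,NCOREXNODE)
     else
        print*,'rank ',n_rank,' -> host'
     endif
  enddo
end program rank_mapping_sketch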
The error at runtime is:
Usage error: multiple calls to acc_set_device with different device types
There is another curious issue: if I allocate the arrays directly in the main program with allocate(a(n),b(n),c(n)) instead of calling the allocate_storage() subroutine, I cannot use the second GPU (even in GPU-only runs), because the first GPU seems to get initialized automatically and I do not know how to avoid that.
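For clarity, the variant that shows this second problem just replaces the call to allocate_storage() with the allocate that is commented out in main.F90, roughly like this (only the relevant lines):

  if (GPU) then ; n = n_long ; else ; n = n_short ; endif
  allocate(a(n),b(n),c(n))          ! instead of: call allocate_storage()
  call random_number(a) ; call random_number(b)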
Thanks a lot,
Francesco