Dear NVIDIA experts,
I am testing CUDA-aware MPI with a very simple case, but it fails with a segmentation fault and I am not sure what is happening. Could you please take a look? Thank you very much!
The errors:
Currently Loaded Modulefiles:
  1) nvhpc/20.11
  2) cudatoolkit/11.0
  3) openmpi/cuda-11.0/nvhpc-20.11/4.0.4/64
[tiger-i21g2:25229] *** Process received signal ***
[tiger-i21g2:25229] Signal: Segmentation fault (11)
[tiger-i21g2:25229] Signal code: Invalid permissions (2)
[tiger-i21g2:25229] Failing at address: 0x2b5b7c800200
[tiger-i21g2:25229] *** End of error message ***
[tiger-i21g2:25228] *** Process received signal ***
[tiger-i21g2:25228] Signal: Segmentation fault (11)
[tiger-i21g2:25228] Signal code: Invalid permissions (2)
[tiger-i21g2:25228] Failing at address: 0x2b3d5c800200
[tiger-i21g2:25228] *** End of error message ***
srun: error: tiger-i21g2: tasks 0-1: Segmentation fault
srun: launch/slurm: _step_signal: Terminating StepId=6608445.0
The code:
program GPUdirect
  use cudafor
  use mpi
  use mpiDeviceUtil
  implicit none
  !include 'mpif.h'
  integer :: direct
  character(len=255) :: env_var
  integer :: rank, size, ierror, rank2
  integer, dimension(:), allocatable :: h_buff
  integer, device :: d_rank
  integer, dimension(:), allocatable, device :: d_buff
  integer :: i
  integer :: deviceID

  !call getenv("MPICH_RDMA_ENABLED_CUDA", env_var)
  !read( env_var, '(i10)' ) direct
  !if (direct .NE. 1) then
  !  print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!'
  !  call exit(1)
  !endif

  call MPI_INIT(ierror)

  ! Get MPI rank and size
  call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierror)
  call assignDevice(deviceID)

  ! Initialize host and device buffers
  allocate(h_buff(size))
  allocate(d_buff(size))

  ! Implicitly copy rank to the device
  d_rank = rank
  rank2 = 0
  print *, rank2, '1'
  rank2 = d_rank
  print *, rank2

  !! Perform allgather using device buffers
  call MPI_ALLGATHER(d_rank, 1, MPI_INTEGER, d_buff, 1, MPI_INTEGER, MPI_COMM_WORLD, ierror)

  !! Check that the buffer is correct
  !h_buff = d_buff(1:size)
  !do i=1,size
  !  if (h_buff(i) .NE. i-1) then
  !    print *, 'Allgather Failed!'
  !    call exit(1)
  !  endif
  !enddo
  !if (rank .EQ. 0) then
  !  print *, 'Success!'
  !endif

  !! Clean up
  !deallocate(h_buff)
  !deallocate(d_buff)

  call MPI_FINALIZE(ierror)
end program GPUdirect
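For reference, here is a host-staged variant of the same allgather that I would compare against (a minimal sketch, not a fix: the rank value is copied back to the host and only host buffers are passed to MPI, so it should not depend on CUDA-aware support; the program name HostStaged and the buffer names are mine, and device selection is omitted for brevity):

program HostStaged
  use cudafor
  use mpi
  implicit none
  integer :: rank, nranks, ierror, h_rank
  integer, device :: d_rank
  integer, dimension(:), allocatable :: h_buff

  call MPI_INIT(ierror)
  call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks, ierror)

  allocate(h_buff(nranks))

  d_rank = rank       ! implicit host-to-device copy
  h_rank = d_rank     ! stage the value back to the host before the MPI call

  ! Only host memory is handed to MPI here, so this path should work
  ! even when the MPI library was not built with CUDA support.
  call MPI_ALLGATHER(h_rank, 1, MPI_INTEGER, h_buff, 1, MPI_INTEGER, &
                     MPI_COMM_WORLD, ierror)

  print *, 'rank', rank, 'gathered', h_buff

  deallocate(h_buff)
  call MPI_FINALIZE(ierror)
end program HostStaged

And the mpiDeviceUtil module used by the program above: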
module mpiDeviceUtil
  interface
     subroutine quicksort(base, nmemb, elemsize, compar) bind(C,name='qsort')
       use iso_c_binding
       implicit none
       !pgi$ ignore_tkr base, nmemb, elemsize, compar
       type(C_PTR), value :: base
       integer(C_SIZE_T), value :: nmemb, elemsize
       type(C_FUNPTR), value :: compar
     end subroutine quicksort

     integer function strcmp(a, b) bind(C,name='strcmp')
       use iso_c_binding
       implicit none
       !pgi$ ignore_tkr a,b
       type(C_PTR), value :: a, b
     end function strcmp
  end interface

contains

  subroutine assignDevice(dev)
    use mpi
    use cudafor
    implicit none
    integer :: dev
    character(len=MPI_MAX_PROCESSOR_NAME), allocatable :: hosts(:)
    character(len=MPI_MAX_PROCESSOR_NAME) :: hostname
    integer :: namelength, color, i
    integer :: nProcs, myrank, newComm, newRank, ierr

    call MPI_COMM_SIZE(MPI_COMM_WORLD, nProcs, ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)

    ! allocate array of hostnames
    allocate(hosts(0:nProcs-1))

    ! Every process collects the hostname of all the nodes
    call MPI_GET_PROCESSOR_NAME(hostname, namelength, ierr)
    hosts(myrank) = hostname(1:namelength)
    do i = 0, nProcs-1
       call MPI_BCAST(hosts(i), MPI_MAX_PROCESSOR_NAME, &
            MPI_CHARACTER, i, MPI_COMM_WORLD, ierr)
    end do

    ! sort the list of names
    call quicksort(hosts, nProcs, MPI_MAX_PROCESSOR_NAME, strcmp)

    ! assign the same color to the same node
    color = 0
    do i = 0, nProcs-1
       if (i > 0) then
          if (lne(hosts(i-1), hosts(i))) color = color + 1
       end if
       if (leq(hostname, hosts(i))) exit
    end do

    call MPI_COMM_SPLIT(MPI_COMM_WORLD, color, 0, newComm, ierr)
    call MPI_COMM_RANK(newComm, newRank, ierr)

    dev = newRank
    ierr = cudaSetDevice(dev)

    deallocate(hosts)
  end subroutine assignDevice

  ! lexical .eq.
  function leq(s1, s2) result(res)
    implicit none
    character(len=*) :: s1, s2
    logical :: res
    res = .false.
    if (lle(s1,s2) .and. lge(s1,s2)) res = .true.
  end function leq

  ! lexical .ne.
  function lne(s1, s2) result(res)
    implicit none
    character(len=*) :: s1, s2
    logical :: res
    res = .not. leq(s1, s2)
  end function lne
end module mpiDeviceUtil
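One more thing I am unsure about: if a node ever gets more ranks than visible GPUs, cudaSetDevice(newRank) in assignDevice would receive an out-of-range index. Below is a small sketch of a guarded assignment I could swap in (assumption: wrapping the per-node rank with mod over cudaGetDeviceCount is acceptable for this test; the subroutine name is mine):

! Sketch: same idea as assignDevice, but wrap the per-node rank by the
! number of visible devices so cudaSetDevice never gets an invalid index.
subroutine assignDeviceGuarded(newRank, dev)
  use cudafor
  implicit none
  integer, intent(in)  :: newRank   ! rank within the per-node communicator
  integer, intent(out) :: dev
  integer :: nDevices, ierr

  ierr = cudaGetDeviceCount(nDevices)
  if (ierr == cudaSuccess .and. nDevices > 0) then
     dev = mod(newRank, nDevices)
  else
     dev = 0                        ! fall back to the default device
  end if
  ierr = cudaSetDevice(dev)
end subroutine assignDeviceGuarded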