CUDA-aware MPI segfault and "Invalid permissions"

Dear NVIDIA experts,
I am testing CUDA-aware MPI with a very simple case, but it fails and I am not sure what happened. Could you please take a look? Thank you very much!
Errors:

Currently Loaded Modulefiles:
  1) nvhpc/20.11
  2) cudatoolkit/11.0
  3) openmpi/cuda-11.0/nvhpc-20.11/4.0.4/64
[tiger-i21g2:25229] *** Process received signal ***
[tiger-i21g2:25229] Signal: Segmentation fault (11)
[tiger-i21g2:25229] Signal code: Invalid permissions (2)
[tiger-i21g2:25229] Failing at address: 0x2b5b7c800200
[tiger-i21g2:25229] *** End of error message ***
[tiger-i21g2:25228] *** Process received signal ***
[tiger-i21g2:25228] Signal: Segmentation fault (11)
[tiger-i21g2:25228] Signal code: Invalid permissions (2)
[tiger-i21g2:25228] Failing at address: 0x2b3d5c800200
[tiger-i21g2:25228] *** End of error message ***
srun: error: tiger-i21g2: tasks 0-1: Segmentation fault
srun: launch/slurm: _step_signal: Terminating StepId=6608445.0

The code:

program GPUdirect
    use cudafor
    use mpi
    use mpiDeviceUtil
    implicit none
    !include 'mpif.h'

    integer :: direct
    character(len=255) :: env_var
    integer :: rank, size, ierror, rank2
    integer,dimension(:),allocatable :: h_buff
    integer,device :: d_rank
    integer,dimension(:),allocatable,device :: d_buff
    integer :: i
    integer:: deviceID

    !call getenv("MPICH_RDMA_ENABLED_CUDA", env_var)
    !read( env_var, '(i10)' ) direct
    !if (direct .NE. 1) then
    !  print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!'
    !  call exit(1)
    !endif

    call MPI_INIT(ierror)

    ! Get MPI rank and size
    call MPI_COMM_RANK (MPI_COMM_WORLD, rank, ierror)
    call MPI_COMM_SIZE (MPI_COMM_WORLD, size, ierror)

    call assignDevice(deviceID)

    ! Initialize host and device buffers
    allocate(h_buff(size))
    allocate(d_buff(size))
    ! Implicitly copy rank to device
    d_rank = rank
    rank2 = 0
    print *, rank2,'1'
    rank2 = d_rank

    print *, rank2

    !! Perform allgather using device buffers
    call MPI_ALLGATHER(d_rank, 1, MPI_INTEGER, d_buff, 1, MPI_INTEGER, MPI_COMM_WORLD, ierror)
!
    !! Check that buffer is correct
    !h_buff = d_buff(1:size)
    !do i=1,size
    !    if (h_buff(i) .NE. i-1) then
    !        print *, 'Alltoall Failed!'
    !        call exit(1)
    !    endif
    !enddo
    !if (rank .EQ. 0) then
    !    print *, 'Success!'
    !endif
!
    !! Clean up
    !deallocate(h_buff)
    !deallocate(d_buff)
    call MPI_FINALIZE(ierror)

end program GPUdirect

module mpiDeviceUtil
  interface
    subroutine quicksort(base, nmemb, elemsize, compar) &
        bind(C, name='qsort')
      use iso_c_binding
      implicit none
      !pgi$ ignore_tkr base, nmemb, elemsize, compar
      type(C_PTR), value :: base
      integer(C_SIZE_T), value :: nmemb, elemsize
      type(C_FUNPTR), value :: compar
    end subroutine quicksort

    integer function strcmp(a, b) bind(C, name='strcmp')
      use iso_c_binding
      implicit none
      !pgi$ ignore_tkr a, b
      type(C_PTR), value :: a, b
    end function strcmp
  end interface

contains
  subroutine assignDevice(dev)
    use mpi
    use cudafor
    implicit none
    integer :: dev
    character(len=MPI_MAX_PROCESSOR_NAME), allocatable :: hosts(:)
    character(len=MPI_MAX_PROCESSOR_NAME) :: hostname
    integer :: namelength, color, i
    integer :: nProcs, myrank, newComm, newRank, ierr

    call MPI_COMM_SIZE(MPI_COMM_WORLD, nProcs, ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)

    ! allocate array of hostnames
    allocate(hosts(0:nProcs-1))

    ! Every process collects the hostname of all the nodes
    call MPI_GET_PROCESSOR_NAME(hostname, namelength, ierr)
    hosts(myrank) = hostname(1:namelength)

    do i = 0, nProcs-1
      call MPI_BCAST(hosts(i), MPI_MAX_PROCESSOR_NAME, &
        MPI_CHARACTER, i, MPI_COMM_WORLD, ierr)
    end do

    ! sort the list of names
    call quicksort(hosts, nProcs, MPI_MAX_PROCESSOR_NAME, strcmp)

    ! assign the same color to the same node
    color = 0
    do i = 0, nProcs-1
      if (i > 0) then
        if (lne(hosts(i-1), hosts(i))) color = color + 1
      end if
      if (leq(hostname, hosts(i))) exit
    end do

    call MPI_COMM_SPLIT(MPI_COMM_WORLD, color, 0, newComm, ierr)
    call MPI_COMM_RANK(newComm, newRank, ierr)

    dev = newRank
    ierr = cudaSetDevice(dev)

    deallocate(hosts)
  end subroutine assignDevice

  ! lexical .eq.
  function leq(s1, s2) result(res)
    implicit none
    character(len=*) :: s1, s2
    logical :: res
    res = .false.
    if (lle(s1, s2) .and. lge(s1, s2)) res = .true.
  end function leq

  ! lexical .ne.
  function lne(s1, s2) result(res)
    implicit none
    character(len=*) :: s1, s2
    logical :: res
    res = .not. leq(s1, s2)
  end function lne
end module mpiDeviceUtil

A common cause of segfaults with MPI and CUDA is that the MPI library you are using is not CUDA-aware. Passing a CUDA device pointer to a non-CUDA-aware MPI library will result in a segfault.
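To check whether the Open MPI build you loaded really has CUDA support, Open MPI provides the extension function MPIX_Query_cuda_support() (a C function declared in mpi-ext.h), which returns 1 when CUDA-aware support is compiled in and enabled at run time. Below is a minimal sketch, assuming Open MPI, that calls it from Fortran through a bind(C) interface; if the symbol is missing at link time, that by itself means the library was built without the CUDA extension.

program checkCudaAwareMPI
    use mpi
    implicit none

    interface
        ! Open MPI-specific extension (declared in mpi-ext.h for C); returns 1
        ! if the library was built with CUDA support and it is enabled at run time.
        function MPIX_Query_cuda_support() result(res) &
            bind(C, name='MPIX_Query_cuda_support')
            use iso_c_binding
            implicit none
            integer(c_int) :: res
        end function MPIX_Query_cuda_support
    end interface

    integer :: rank, ierror

    call MPI_INIT(ierror)
    call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror)
    if (rank == 0) then
        if (MPIX_Query_cuda_support() == 1) then
            print *, 'This MPI reports CUDA-aware support: enabled'
        else
            print *, 'This MPI reports CUDA-aware support: NOT enabled'
        end if
    end if
    call MPI_FINALIZE(ierror)
end program checkCudaAwareMPI

From the command line, ompi_info --parsable --all | grep mpi_built_with_cuda_support:value answers the compile-time half of the same question, and the OSU micro-benchmarks (for example osu_bw run with device buffers, "D D") are a common end-to-end exercise of CUDA-aware transfers.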

Thank you, Robert!
But the computing center says CUDA-aware MPI is enabled on this machine.
How can I tell whether it is indeed enabled? Is there a benchmark case to test it?
Now I am not sure whether it is an environment issue or a code issue, but my code is simple enough and it was downloaded from an ORNL test case.
Could you give me more ideas?
Thank you very much!
Chen
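One way to separate an environment problem from a code problem is to run the same allgather through host buffers only, which works with any MPI and does not need CUDA-aware support: if the host-buffer version runs but the device-buffer version segfaults, the program logic is fine and the failure is specific to handing device memory to MPI. Below is a minimal sketch of that check (the program and variable names are illustrative, not from the code above).

program hostStagedAllgather
    use cudafor
    use mpi
    implicit none
    integer :: rank, nranks, ierror, h_rank
    integer, device :: d_rank
    integer, dimension(:), allocatable :: h_gather

    call MPI_INIT(ierror)
    call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks, ierror)

    d_rank = rank                  ! value lives on the device, as in the code above
    h_rank = d_rank                ! stage it back to the host

    allocate(h_gather(nranks))
    ! Host buffers only: this call does not require CUDA-aware MPI.
    call MPI_ALLGATHER(h_rank, 1, MPI_INTEGER, h_gather, 1, MPI_INTEGER, &
                       MPI_COMM_WORLD, ierror)
    print *, 'rank', rank, 'gathered', h_gather

    deallocate(h_gather)
    call MPI_FINALIZE(ierror)
end program hostStagedAllgather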