Transferring data between GPUs on different processes with MPI

I want to make a test that transfers data between GPUs on different processes with CUDA Fortran, but it always prints Signal: Segmentation fault (11).
The code follows, and I run it with two processes and two GPUs:

program main
  use mpi
  use cudafor
  implicit none
  real,device,allocatable :: d_0(:), d_1(:)
  real,allocatable :: h_0(:), h_1(:)
  integer :: STATUS(MPI_STATUS_SIZE), numprocs
  call mpi_init(ierr)
  call mpi_comm_size(mpi_comm_world, numprocs, ierr)
  call mpi_comm_rank(mpi_comm_world, myrank, ierr)
  do i = 0, numprocs-1
    istat = cudaSetDevice(i)
    istat = cudaGetDevice(i)
  end do
  allocate(d_0(1:100))
  allocate(d_1(1:100))
  allocate(h_0(1:100))
  allocate(h_1(1:100))
  if (myrank .eq. 0) then
    d_0 = 0.0
    call MPI_SENDRECV(d_0,100,mpi_real,1,sendtag,d_1,100,mpi_real,0,recvtag,mpi_comm_world,status,ierror)
    h_0 = d_0
    write(0,*) '0 myrank h_0 is', h_0
  end if
  if (myrank .eq. 1) then
    d_1 = 1.0
    call MPI_SENDRECV(d_0,100,mpi_real,0,sendtag,d_1,100,mpi_real,1,recvtag,mpi_comm_world,status,ierror)
    h_1 = d_1
    write(0,*) '1 myrank h_1 is', h_1
  end if
  !call cpy1()
  deallocate(d)
  call mpi_finalize(rc)

end program main

Hi liuzheliuhao,

Is your MPI CUDA Aware? If not, it will be expecting host pointers. Passing device pointers would cause a seg fault.
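If it's not CUDA-aware, the usual workaround is to stage the transfer through host arrays so that MPI only ever sees host pointers, at the cost of extra copies. A rough sketch of what rank 0 would do, using the array names from your code (note that the source and destination are both rank 1 here, and the tag, status, and ierror variables still need to be declared):

h_0 = d_0                                      ! device-to-host copy
call MPI_SENDRECV(h_0, 100, MPI_REAL, 1, sendtag, &
                  h_1, 100, MPI_REAL, 1, recvtag, &
                  MPI_COMM_WORLD, status, ierror)
d_1 = h_1                                      ! copy the received data back to the device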

Note that your example had several syntax errors, so I updated the following example to do a similar operation. I'm using the OpenMPI 3.1.5 that we ship with the NV HPC SDK, which does support CUDA-aware MPI.
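If you're not sure whether your own Open MPI build is CUDA-aware, you can query it with ompi_info (this parameter is available in Open MPI 2.0 and later):

% ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

A value of "true" means device pointers can be passed directly to the MPI calls.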

% cat mpi.cuf
PROGRAM mpiex
use mpi
IMPLICIT NONE

INTEGER:: err, nproc, myid
INTEGER,DIMENSION(MPI_STATUS_SIZE):: status
INTEGER, PARAMETER:: N=10000
REAL, allocatable :: h(:)
REAL, allocatable,  device :: d_0(:), d_1(:)

CALL MPI_INIT (err)
CALL MPI_COMM_SIZE (MPI_COMM_WORLD, nproc, err)
CALL MPI_COMM_RANK (MPI_COMM_WORLD, myid, err)

allocate(h(N))
allocate(d_0(N))
allocate(d_1(N))

IF (myid==0)  THEN
    d_0 = 0
    CALL MPI_SENDRECV (d_0, N, MPI_REAL, 1, 10, & ! Sent data
                       d_1, N, MPI_REAL, 1, 11, & ! Received data
                       MPI_COMM_WORLD, status, err)
    h = d_1
    WRITE(*,*) "Process", myid, "h=", h(1:3), "..."
ELSE
    d_0 = 1
    CALL MPI_SENDRECV (d_0, N, MPI_REAL, 0, 11, &
                       d_1, N, MPI_REAL, 0, 10, &
                       MPI_COMM_WORLD, status, err)
    h = d_1
    WRITE(*,*) "Process", myid, "h=", h(1:3), "..."
END IF

deallocate(d_0)
deallocate(d_1)
deallocate(h)
CALL MPI_FINALIZE (err)
END PROGRAM mpiex
% mpif90  mpi.cuf
% mpirun -np 2 ./a.out
 Process            0 h=    1.000000        1.000000        1.000000     ...
 Process            1 h=    0.000000        0.000000        0.000000     ...
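One other note on your original code: the loop over devices calls cudaSetDevice for every device, so both ranks end up on the last GPU. Typically each rank picks its own device from its rank before allocating any device arrays, something along these lines (a sketch, with istat and ndevices declared as integers):

istat = cudaGetDeviceCount(ndevices)       ! number of GPUs visible to this rank
istat = cudaSetDevice(mod(myid, ndevices)) ! bind this rank to one device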