I am writing a test that transfers data directly between two GPUs: one GPU is attached to process 0 and the other to process 1.
If I use MPI_SENDRECV, the processes do not write their per-process logs (e.g., process 0's log is 0.log); if I do not use MPI_SENDRECV, each process writes its own log.
The code is as follows:
mpicomm.f90:
module mpicomm
implicit none
contains
subroutine openfile(fh,myid)
integer,intent(inout) :: fh
integer,intent(in) :: myid
integer :: ierror
character(len=20) :: filename
character(len=100) :: errstring
write(filename, "(i0,'.log')") myid   ! i0 so rank 0 produces "0.log" with no leading blanks
open(fh, file=filename, status="replace", action="write", iostat=ierror)
end subroutine openfile
end module mpicomm
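As a standalone check of just the filename formatting (a minimal sketch; name_check is an illustrative name):

program name_check
   implicit none
   character(len=20) :: filename
   ! same edit descriptor as in openfile above
   write(filename, "(i0,'.log')") 0
   print *, trim(filename)   ! expected: 0.log
end program name_check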
test.f90:
module test
use cudafor
use mpi
implicit none
contains
subroutine transferData(myrank,fh)
integer,intent(in) ::myrank,fh
integer ::numprocs,ierror,sendtag,recvtag
real,device,allocatable::d_0(:),d_s(:),d_r(:)
real,allocatable::h_0(:),h_1(:)
integer :: STATUS(MPI_STATUS_SIZE)
allocate(d_0(1:100))
allocate(d_r(1:100))
allocate(d_s(1:100))
allocate(h_1(1:100))
sendtag= 77
recvtag =77
!call mpi_comm_size(mpi_comm_world,numprocs,ierror)
!call mpi_comm_rank(mpi_comm_world,myrank,ierror)
! I found that if I enable the two calls above, myrank is no longer the same as the myrank passed in at the beginning; I want to know why.
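! (A sketch of what I would try instead, assuming the cause is that myrank is an
!  intent(in) dummy here while mpi_comm_rank writes into its second argument:
!  query the rank into a separate local variable, e.g.
!      integer :: localrank
!      call mpi_comm_rank(mpi_comm_world, localrank, ierror)
!  localrank is a name I made up for this note.)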
if(myrank .eq. 0) then
d_0 = 0.0
d_s=d_0
d_r =d_0
call MPI_SENDRECV(d_s,100,mpi_real,1,sendtag,d_r,100,mpi_real,0,recvtag,mpi_comm_world,status,ierror)
!If I use the subroutine MPI_SENDRECV, the process does not write its own log and never ends; it appears to hang. If I do not use MPI_SENDRECV, the process is fine.
h_1=d_r
write(fh,*) 'rank_0 h_1 is ', h_1
end if
if(myrank .eq. 1) then
d_0 =1.0
h_1=d_0
write(fh,*) 'rank_1 h_1 before is ', h_1
d_s=d_0
d_r=d_0
call MPI_SENDRECV(d_s,100,mpi_real,1,sendtag,d_r,100,mpi_real,0,recvtag,mpi_comm_world,status,ierror)
! If I use this subroutine, the process does not write its own log.
h_1=d_r
write(fh,*) 'rank_1 h_1 after is ', h_1
end if
deallocate(d_0)
deallocate(d_s)
deallocate(d_r)
deallocate(h_1)
end subroutine transferData
end module test
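For comparison, this is the pattern I understand a two-rank exchange should follow: each rank names its partner (1 - myrank) as both dest and source, so the send and receive sides match. A minimal host-memory sketch (sendrecv_sketch and other are names I made up for the example):

program sendrecv_sketch
   use mpi
   implicit none
   integer :: myrank, other, ierror
   integer :: status(MPI_STATUS_SIZE)
   real :: sbuf(100), rbuf(100)
   call mpi_init(ierror)
   call mpi_comm_rank(mpi_comm_world, myrank, ierror)
   other = 1 - myrank            ! rank 0 pairs with rank 1 and vice versa
   sbuf = real(myrank)           ! fill the send buffer with this rank's id
   rbuf = -1.0
   ! dest and source are both the partner rank, so the two calls match each other
   call mpi_sendrecv(sbuf, 100, mpi_real, other, 77, &
                     rbuf, 100, mpi_real, other, 77, &
                     mpi_comm_world, status, ierror)
   print *, 'rank', myrank, 'received', rbuf(1)
   call mpi_finalize(ierror)
end program sendrecv_sketch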
main.f90:
program main
use mpi
use mpicomm
use cudafor
use test
implicit none
integer :: ierr, rc, numprocs, myrank, namelen, i, idev, istat, fh
character(len=MPI_MAX_PROCESSOR_NAME) :: hostname
character(len=MPI_MAX_PROCESSOR_NAME), allocatable :: hosts(:)   ! type assumed; hosts is only allocated below
call mpi_init(ierr)
call mpi_comm_size(mpi_comm_world, numprocs,ierr)
call mpi_comm_rank(mpi_comm_world,myrank,ierr)
call mpi_get_processor_name(hostname,namelen,ierr)
allocate(hosts(0:numprocs-1))
fh = 10   ! log unit number (value assumed)
call openfile(fh,myrank)
do i =0,numprocs-1
istat=cudaSetDevice(i)
istat=cudaGetDevice(idev)   ! read back into idev so the do index i is not overwritten
end do
call transferData(myrank,fh)
print *, 'transferData is over'
call mpi_finalize(rc)
print *, 'mpi is over'
close(fh)
end program main
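In case it matters: since I pass device arrays (d_s, d_r) straight to MPI_SENDRECV, I assume the MPI library must be CUDA-aware for that to work. My understanding of the fallback when it is not, staging through host buffers (a sketch; staged_exchange is a name I invented):

subroutine staged_exchange(myrank)
   use mpi
   use cudafor
   implicit none
   integer, intent(in) :: myrank
   integer :: other, ierror
   integer :: status(MPI_STATUS_SIZE)
   real, device :: d_s(100), d_r(100)
   real :: h_s(100), h_r(100)
   other = 1 - myrank
   d_s = real(myrank)
   h_s = d_s                     ! device -> host copy before the MPI call
   call mpi_sendrecv(h_s, 100, mpi_real, other, 77, &
                     h_r, 100, mpi_real, other, 77, &
                     mpi_comm_world, status, ierror)
   d_r = h_r                     ! host -> device copy after the MPI call
end subroutine staged_exchange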
The sbatch script is:
#!/bin/bash
#SBATCH -J test
#SBATCH --comment=test
#SBATCH -o out/%x-%j.out
#SBATCH -e out/%x-%j.err
#SBATCH -p GPU
#SBATCH --ntasks=2
#SBATCH --nodes=2
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=2
mpirun ./a.exe
#srun --mpi=pmi2 a.out