Hi, I am running a large MPI+OpenACC parallel multi-GPU program written in Fortran on 4 A100 (40 GB) GPUs on an NVIDIA DGX cluster. The inter-GPU communication is small and should account for only 5-10% of the total time per iteration, but for some reason it is consuming more than 50% of the time per iteration, which makes the overall efficiency of my code in multi-GPU runs very poor.
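For context, each MPI rank drives one GPU; the device selection at startup is roughly the following (a simplified sketch, the subroutine and variable names here are illustrative and differ in my real code):

! Simplified sketch of the per-rank GPU binding (names are illustrative)
subroutine bind_rank_to_gpu(rank)
  use openacc
  implicit none
  integer, intent(in) :: rank
  integer :: ngpus
  ! one MPI rank per GPU: rank 0 -> GPU 0, rank 1 -> GPU 1, ...
  ngpus = acc_get_num_devices(acc_device_nvidia)
  call acc_set_device_num(mod(rank, ngpus), acc_device_nvidia)
end subroutine bind_rank_to_gpu

My communication routine is quite basic and looks as follows: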
SUBROUTINE MPI_INTERFACE_on_TOP(block, Home_Block, Neigh_Block, home_block_rank, neigh_block_rank, comm_tag)
  use global_variables
  use mpi_variables
  use mpi
  implicit none
  integer :: home_block_rank, neigh_block_rank
  integer :: block, Home_Block, Neigh_Block
  double precision :: Qp_packed(NK(block)*NGh*NI(block)*nprims)
  integer :: SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, from
  integer :: package_size, comm_tag
  integer :: status(MPI_STATUS_SIZE)

  package_size = NK(block)*NGh*NI(block)*nprims

  IF (rank == neigh_block_rank) THEN
    ! pack from neigh_block
    nbl = Neigh_Block
    SPt_J = 1; EPt_J = NGh
    SPt_K = 1; EPt_K = NK(nbl)
    SPt_I = 1; EPt_I = NI(nbl)
    call PACK_DATA(Qp, Qp_packed, nbl, SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, NI, NJ, NK, nblocks, nprims, NGh)

    ! send
    dest_rank = home_block_rank; tag = comm_tag
#ifdef CUDA_AWARE_MPI
    !$acc host_data use_device(Qp_packed)
#else
    !$acc update self(Qp_packed)
#endif
    call MPI_SEND(Qp_packed, package_size, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
#ifdef CUDA_AWARE_MPI
    !$acc end host_data
#endif

  ELSEIF (rank == home_block_rank) THEN
    ! receive
    from = neigh_block_rank; tag = comm_tag
#ifdef CUDA_AWARE_MPI
    !$acc host_data use_device(Qp_packed)
#endif
    call MPI_RECV(Qp_packed, package_size, MPI_DOUBLE_PRECISION, from, tag, MPI_COMM_WORLD, status, ierr)
#ifdef CUDA_AWARE_MPI
    !$acc end host_data
#else
    !$acc update device(Qp_packed)
#endif

    ! unpack into home block
    nbl = Home_Block
    SPt_J = NJ(nbl)+1; EPt_J = NJ(nbl)+NGh
    SPt_K = 1; EPt_K = NK(nbl)
    SPt_I = 1; EPt_I = NI(nbl)
    call UNPACK_DATA(Qp, Qp_packed, nbl, SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, NI, NJ, NK, nblocks, nprims, NGh)
  ENDIF
END SUBROUTINE MPI_INTERFACE_on_TOP

SUBROUTINE PACK_DATA(Qp, Qp_packed, nbl, SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, NI, NJ, NK, nblocks, nprims, NGh)
  implicit none
  integer :: SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, nblocks, nprims, NGh
  integer, dimension(nblocks) :: NI, NJ, NK
  integer :: i, j, k, nbl, npr
  double precision, dimension(-NGh+1:MAXVAL(NI)+NGh, -NGh+1:MAXVAL(NJ)+NGh, -NGh+1:MAXVAL(NK)+NGh, nblocks, nprims) :: Qp
  double precision :: Qp_packed((EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(EPt_K-SPt_K+1)*nprims)

  ! Flatten the requested (i,j,k,npr) slab of block nbl into the contiguous 1-D buffer (runs on the device)
  !$acc parallel loop gang vector collapse(4) default(present)
  DO npr = 1, nprims
    DO k = SPt_K, EPt_K
      DO j = SPt_J, EPt_J
        DO i = SPt_I, EPt_I
          Qp_packed(1 &
            + (i-SPt_I) &
            + (EPt_I-SPt_I+1)*(j-SPt_J) &
            + (EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(k-SPt_K) &
            + (EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(EPt_K-SPt_K+1)*(npr-1)) = Qp(i,j,k,nbl,npr)
        ENDDO
      ENDDO
    ENDDO
  ENDDO
END SUBROUTINE PACK_DATA

SUBROUTINE UNPACK_DATA(Qp, Qp_packed, nbl, SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, NI, NJ, NK, nblocks, nprims, NGh)
  implicit none
  integer :: SPt_I, EPt_I, SPt_J, EPt_J, SPt_K, EPt_K, nblocks, nprims, NGh
  integer, dimension(nblocks) :: NI, NJ, NK
  integer :: i, j, k, nbl, npr
  double precision, dimension(-NGh+1:MAXVAL(NI)+NGh, -NGh+1:MAXVAL(NJ)+NGh, -NGh+1:MAXVAL(NK)+NGh, nblocks, nprims) :: Qp
  double precision :: Qp_packed((EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(EPt_K-SPt_K+1)*nprims)

  ! Scatter the contiguous 1-D buffer back into the requested (i,j,k,npr) slab of block nbl (runs on the device)
  !$acc parallel loop gang vector collapse(4) default(present)
  DO npr = 1, nprims
    DO k = SPt_K, EPt_K
      DO j = SPt_J, EPt_J
        DO i = SPt_I, EPt_I
          Qp(i,j,k,nbl,npr) = Qp_packed(1 &
            + (i-SPt_I) &
            + (EPt_I-SPt_I+1)*(j-SPt_J) &
            + (EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(k-SPt_K) &
            + (EPt_I-SPt_I+1)*(EPt_J-SPt_J+1)*(EPt_K-SPt_K+1)*(npr-1))
        ENDDO
      ENDDO
    ENDDO
  ENDDO
END SUBROUTINE UNPACK_DATA
As you can see, the communication routine (the first subroutine) is quite simple: the sending rank packs the interface data into a contiguous buffer and sends it, and the receiving rank receives and unpacks it.
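For reference, the communication share per iteration is measured with a simple wall-clock timer around each exchange, roughly as in the sketch below (t0 and t_comm are just illustrative double precision accumulators):

! minimal sketch of how the communication time per iteration is accumulated
! (t0 and t_comm are illustrative timer variables; MPI_WTIME is the MPI wall clock)
t0 = MPI_WTIME()
call MPI_INTERFACE_on_TOP(block, Home_Block, Neigh_Block, home_block_rank, neigh_block_rank, comm_tag)
t_comm = t_comm + (MPI_WTIME() - t0)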
I have used the same routines in a smaller toy program that transfers arrays of similar size, and there the communication speed is fine and I get perfect scaling. However, in my real program, which has many global variables and a lot of computation, something is slowing the communication down and I don't understand why. What are the possible reasons for this issue, and is there a way to fix it? Is there a better, more efficient way to transfer data from one GPU to another than the naive approach I have implemented?
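For completeness, the toy program mentioned above does little more than repeat the exchange in a loop, roughly along these lines (a simplified sketch; niters and the rank/tag choices here are illustrative):

! simplified sketch of the toy benchmark:
! rank 1 (neigh_block_rank) packs and sends, rank 0 (home_block_rank) receives and unpacks
do it = 1, niters
   call MPI_INTERFACE_on_TOP(block, Home_Block, Neigh_Block, 0, 1, it)
end do
call MPI_BARRIER(MPI_COMM_WORLD, ierr)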
Thanks