Efficiency of MPI transfer (in CUDA Fortran + MPI code)

Hi,
I developed a multi-GPU CUDA Fortran + MPI code that uses CUDA-aware MPI, but I don't think my MPI transfer efficiency is good enough.
Is there any way to improve the efficiency of the MPI transfers?
I'll attach part of my code below.

program main
use cudafor
use cuda_kernel
use mpi

implicit none

integer:: numgpus_per_node
integer:: myrank, nprocs, tag, ierr
integer:: kup, kdown
integer:: sendijp, sendijn, recvijp, recvijn   ! MPI request handles
integer:: istatus(MPI_STATUS_SIZE)
real(8), managed, allocatable, dimension(:,:,:) :: A, A_new, B, B_new, C
! and other parameter


! - - - - - MPI initialization - - - - -
call MPI_init(ierr)
call MPI_comm_rank( MPI_COMM_WORLD, myrank, ierr )
ierr = cudaGetDeviceCount(numgpus_per_node)
ierr = cudaSetDevice(mod(myrank,numgpus_per_node))
call MPI_comm_size( MPI_COMM_WORLD, nprocs, ierr )

kup = myrank + 1
kdown = myrank -1
if ( myrank .eq. nprocs -1 ) kup = 0
if ( myrank .eq. 0 ) kdown = nprocs-1


! - - - - - Setting some parameters - - - - -
nx = 1024
ny = 1024
nz = 1024

ilen = nx
jlen = ny
klen = nz / nprocs

allocate ( A(0:ilen+1 , 0:jlen+1 , 0:klen+1 ) )
allocate ( B(0:ilen+1 , 0:jlen+1 , 0:klen+1 ) )
allocate ( C(0:ilen+1 , 0:jlen+1 , 0:klen+1 ) )
allocate ( A_new(0:ilen+1 , 0:jlen+1 , 0:klen+1 ) )
allocate ( B_new(0:ilen+1 , 0:jlen+1 , 0:klen+1 ) )

blocks = dim3(ilen/16,jlen/16,1)
threads = dim3(16,16,1)

! and other parameters ... 


! - - - - - Initial condition of arrays - - - - -
! ... initial values assigned to A(i,j,k), B(i,j,k), C(i,j,k) ...
! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


! - - - - - - - - - - Main loop start - - - - - - - - - -
do time = 1, timemax


	! - - - - - - - - - - MPI transfer for array A and B - - - - - - - - - -
	call mpi_isend(A(0,0,klen),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,sendijp,ierr)
	call mpi_isend(A(0,0,1),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,sendijn,ierr)
	call mpi_irecv(A(0,0,0),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,recvijn,ierr)
	call mpi_irecv(A(0,0,klen+1),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,recvijp,ierr)
	call MPI_WAIT(sendijp,istatus,ierr)
	call MPI_WAIT(sendijn,istatus,ierr)
	call MPI_WAIT(recvijn,istatus,ierr)
	call MPI_WAIT(recvijp,istatus,ierr)
	
	call mpi_isend(B(0,0,klen),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,sendijp,ierr)
	call mpi_isend(B(0,0,1),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,sendijn,ierr)
	call mpi_irecv(B(0,0,0),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,recvijn,ierr)
	call mpi_irecv(B(0,0,klen+1),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,recvijp,ierr)
	call MPI_WAIT(sendijp,istatus,ierr)
	call MPI_WAIT(sendijn,istatus,ierr)
	call MPI_WAIT(recvijn,istatus,ierr)
	call MPI_WAIT(recvijp,istatus,ierr)
	!  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


	!  - - - - - - - - - - GPU computation for array A_new and C by A and B - - - - - - - - - -
	call calanew<<<blocks,threads>>>( some parameters )
	state = cudaDeviceSynchronize()



	! - - - - - - - - - - MPI transfer for array C - - - - - - - - - -
	call mpi_isend(C(0,0,klen),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,sendijp,ierr)
	call mpi_isend(C(0,0,1),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,sendijn,ierr)
	call mpi_irecv(C(0,0,0),(ilen+2)*(jlen+2),MPI_REAL8,kdown,1,&
								MPI_COMM_WORLD,recvijn,ierr)
	call mpi_irecv(C(0,0,klen+1),(ilen+2)*(jlen+2),MPI_REAL8,kup,1,&
								MPI_COMM_WORLD,recvijp,ierr)
	call MPI_WAIT(sendijp,istatus,ierr)
	call MPI_WAIT(sendijn,istatus,ierr)
	call MPI_WAIT(recvijn,istatus,ierr)
	call MPI_WAIT(recvijp,istatus,ierr)
	!  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


	!  - - - - - - - - - - GPU computation for array B_new by A, B and C - - - - - - - - - -
	call calbnew<<<blocks,threads>>>( some parameters )
	state = cudaDeviceSynchronize()


	! - - - - - - - - - - Update arrays - - - - - - - - - -
	A = A_new
	B = B_new
	! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


	! - - - - - - - - - - Graphic output part - - - - - - - - - -

	
	! - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


end do
! - - - - - - - - - - - - - - - Main loop end - - - - - - - - - - - - - - - - 


! deallocate releases the managed allocations; no separate cudaFree is needed
deallocate ( A )
deallocate ( B )
deallocate ( C )
deallocate ( A_new )
deallocate ( B_new )


call MPI_Finalize(ierr)
stop
end program main

I suspect the poor efficiency could come from the following:

  1. Is using managed memory bad for MPI transfers? (see the first sketch after this list)
  2. Do MPI_Isend and MPI_Irecv have poor efficiency?
  3. The structure of my code:
    in the main loop, the work proceeds in the following steps:
    (1) MPI transfer → (2) GPU computation → (3) MPI transfer
    → (4) GPU computation → (5) Update
    Is there any way to perform the MPI transfers and the GPU computation
    simultaneously? (see the second sketch after this list)
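
For item 1, one change I am considering (but have not tested) is declaring the exchanged arrays with the device attribute instead of managed, so that the CUDA-aware MPI library receives plain device pointers rather than unified-memory pointers. As far as I understand, the MPI calls themselves would not need to change. A minimal sketch, assuming everything else stays as in the code above:

! sketch for item 1: plain device arrays instead of managed ones
real(8), device, allocatable, dimension(:,:,:) :: A, A_new, B, B_new, C

! the halo-exchange calls stay the same; CUDA-aware MPI reads the device buffer directly, e.g.
call mpi_isend(A(0,0,klen), (ilen+2)*(jlen+2), MPI_REAL8, kup, 1, &
               MPI_COMM_WORLD, sendijp, ierr)

(The graphic output part would of course then need explicit device-to-host copies, since the arrays would no longer be accessible from host code.)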
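
For item 3, what I have in mind (but have not implemented yet) is to post all of the non-blocking halo transfers for A and B at once, compute the interior planes that do not read the halo while the messages are in flight, and only then wait and compute the two boundary planes. In the sketch below, calanew_interior and calanew_boundary are hypothetical names for a split version of my calanew kernel, and req/stats are new variables; the single MPI_Waitall also replaces the four separate MPI_WAIT calls:

	! new declarations for this sketch
	integer :: req(8)
	integer :: stats(MPI_STATUS_SIZE,8)

	! post all halo transfers for A and B (tags 1 and 2 keep them distinct)
	call mpi_isend(A(0,0,klen), (ilen+2)*(jlen+2), MPI_REAL8, kup, 1, MPI_COMM_WORLD, req(1), ierr)
	call mpi_isend(A(0,0,1), (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, MPI_COMM_WORLD, req(2), ierr)
	call mpi_irecv(A(0,0,0), (ilen+2)*(jlen+2), MPI_REAL8, kdown, 1, MPI_COMM_WORLD, req(3), ierr)
	call mpi_irecv(A(0,0,klen+1), (ilen+2)*(jlen+2), MPI_REAL8, kup, 1, MPI_COMM_WORLD, req(4), ierr)
	call mpi_isend(B(0,0,klen), (ilen+2)*(jlen+2), MPI_REAL8, kup, 2, MPI_COMM_WORLD, req(5), ierr)
	call mpi_isend(B(0,0,1), (ilen+2)*(jlen+2), MPI_REAL8, kdown, 2, MPI_COMM_WORLD, req(6), ierr)
	call mpi_irecv(B(0,0,0), (ilen+2)*(jlen+2), MPI_REAL8, kdown, 2, MPI_COMM_WORLD, req(7), ierr)
	call mpi_irecv(B(0,0,klen+1), (ilen+2)*(jlen+2), MPI_REAL8, kup, 2, MPI_COMM_WORLD, req(8), ierr)

	! compute interior planes k = 2 .. klen-1, which do not need the halo,
	! while the halo messages are still in flight
	call calanew_interior<<<blocks,threads>>>( some parameters )

	! finish the halo exchange, then compute the boundary planes k = 1 and k = klen
	call MPI_Waitall(8, req, stats, ierr)
	call calanew_boundary<<<blocks,threads>>>( some parameters )
	state = cudaDeviceSynchronize()

Is this kind of restructuring the right direction, and would the same idea also apply to the C exchange and calbnew?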

Any comments or advice would be appreciated!
Thank you