Consider the following demo, where I send some unrelated data over MPI while running an OpenMP target kernel and a cuFFT transform on the GPU:
program overlapping_cuda
   use mpi_f08
   use iso_fortran_env, only: wp => real64
   use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_int, c_double_complex
   use omp_lib
   use cufft
   use cudafor
   implicit none
   integer :: ierr, rank, nprocs, nlocal, i, dev, num_dev, j, k, iter
   integer, parameter :: n = 300000000 ! global data size
   integer :: Nfft = 512
   integer :: N2 = 1024*1024
   complex(wp), allocatable, dimension(:, :) :: hx
   complex(c_double_complex), pointer :: sendbuf(:), recvbuf(:)
   type(c_ptr) :: sendptr, recvptr
   type(MPI_Request) :: req
   integer(c_size_t) :: sendsize, recvsize
   integer(c_int) :: plan
   integer(kind=cuda_stream_kind) :: stream
   real(wp) :: tstart, tend

   call MPI_Init(ierr)
   call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
   call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

   ! one device per rank, round-robin
   num_dev = omp_get_num_devices()
   dev = mod(rank, num_dev)
   call omp_set_default_device(dev)
   print *, 'Rank', rank, 'of', nprocs, 'using device', dev, 'out of', num_dev

   allocate(hx(Nfft, N2))
   hx(:, :) = (1.0_wp, -1.0_wp)
   !$omp target enter data map(to: hx)

   ! cuFFT plan on its own CUDA stream
   ierr = cudaStreamCreate(stream)
   ierr = cufftPlan1d(plan, int(Nfft, c_int), CUFFT_Z2Z, int(N2, c_int))
   if (ierr /= 0) then
      print *, "cufftPlan1d failed on rank", rank, "err=", ierr
   end if
   ierr = cufftSetStream(plan, stream)

   ! device buffers for the unrelated all-to-all traffic
   nlocal = n / nprocs
   sendsize = nlocal
   recvsize = nlocal
   sendptr = omp_target_alloc(sendsize*int(16, c_size_t), omp_get_default_device())
   recvptr = omp_target_alloc(recvsize*int(16, c_size_t), omp_get_default_device())
   call c_f_pointer(sendptr, sendbuf, [sendsize])
   call c_f_pointer(recvptr, recvbuf, [recvsize])

   do iter = 1, 10
      tstart = MPI_Wtime()

      ! start the (unrelated) communication
      call MPI_IAlltoall(sendbuf, nlocal/nprocs, MPI_C_DOUBLE_COMPLEX, &
                         recvbuf, nlocal/nprocs, MPI_C_DOUBLE_COMPLEX, &
                         MPI_COMM_WORLD, req, ierr)

      ! first scaling kernel (asynchronous target task)
      !$omp target teams distribute parallel do collapse(2) nowait
      do j = 1, N2 / 2
         do i = 1, Nfft
            hx(i, j) = hx(i, j) * 2.0d0
         end do
      end do

      ! in-place FFT on the device copy of hx
      !$omp target data use_device_addr(hx)
      ierr = cufftExecZ2Z(plan, hx(1, 1), hx(1, 1), CUFFT_FORWARD)
      !$omp end target data
      if (ierr /= 0) then
         print *, "cufftExecZ2Z failed on rank", rank, "err=", ierr
      end if

      ! second scaling kernel (asynchronous target task)
      !$omp target teams distribute parallel do collapse(2) nowait
      do j = 1, N2 / 2
         do i = 1, Nfft
            hx(i, j) = hx(i, j) * 2.0d0
         end do
      end do

      call MPI_Wait(req, MPI_STATUS_IGNORE, ierr)
      !$omp taskwait

      tend = MPI_Wtime()
      write(*,'(A,I0,A,I0,A,F10.6)') 'Rank ', rank, ', iter ', iter, ': timestep duration (s) = ', tend - tstart
   end do

   ierr = cufftDestroy(plan)
   call omp_target_free(sendptr, omp_get_default_device())
   call omp_target_free(recvptr, omp_get_default_device())
   call MPI_Finalize(ierr)
end program overlapping_cuda
compiled with
mpif90 -mp -mp=gpu overlapping.f90 -o foo -isystem /opt/nvhpc/Linux_x86_64/25.5/cuda/12.9/targets/x86_64-linux/include /opt/nvhpc/Linux_x86_64/25.5/math_libs/12.9/lib64/libcufft.so /opt/nvhpc/Linux_x86_64/25.5/cuda/12.9/targets/x86_64-linux/lib/libcudart.so -Minfo=mp -Mcache_align -cuda
I see that the communication and computation indeed overlap (good). But the kernels run out of order, which is not good. With pure OpenMP I would express the ordering through task dependencies, i.e. depend(in) and depend(out) clauses (sketched below). With cufft in the mix, I was thinking of using an OpenMP detachable task, but detach does not seem to be supported by nvhpc (25.5 at least).
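For reference, this is roughly what I mean by "defining dependencies" if everything were plain OpenMP target tasks (just a sketch, not tested as-is; it leaves out the cufftExecZ2Z call, which is exactly the piece I don't know how to tie into the dependence chain):

! Sketch only: serialize the two device kernels with task dependences on hx.
!$omp target teams distribute parallel do collapse(2) nowait depend(inout: hx)
do j = 1, N2 / 2
   do i = 1, Nfft
      hx(i, j) = hx(i, j) * 2.0d0   ! first scaling kernel
   end do
end do

! ... the FFT would have to appear here as a task with depend(inout: hx) ...

!$omp target teams distribute parallel do collapse(2) nowait depend(inout: hx)
do j = 1, N2 / 2
   do i = 1, Nfft
      hx(i, j) = hx(i, j) * 2.0d0   ! second scaling kernel, must run after the FFT
   end do
end do

!$omp taskwait   ! wait for the deferred target tasks

With depend(inout: hx) (or a depend(out)/depend(in) pair) the runtime would serialize the two target tasks, but I don't see how to make the cuFFT call participate in that dependence graph without something like a detachable task.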
So, what do I do in this case to enforce the correct ordering between the OpenMP kernels and the cuFFT call?
Thanks in advance :)