Overlapping communication and computation - OpenMP detachable tasks

Consider the following demo, where I send some unrelated data over MPI while running OpenMP offload kernels and a cuFFT transform:

program overlapping_cuda
   use mpi_f08
   use iso_fortran_env, only: wp => real64
   use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_int, c_double_complex
   use omp_lib
   use cufft
   use cudafor
   implicit none

   integer :: ierr, rank, nprocs, nlocal, i, dev, num_dev, j, k, iter
   integer, parameter :: n = 300000000   ! global data size
   integer :: Nfft = 512
   integer :: N2 = 1024*1024
   complex(8), allocatable, dimension(:, :) :: hx
   complex(c_double_complex), pointer :: sendbuf(:), recvbuf(:)
   type(c_ptr) :: sendptr, recvptr
   type(MPI_Request) :: req
   integer(c_size_t) :: sendsize, recvsize
   integer(c_int) :: plan
   integer(cuda_stream_kind) :: stream   ! CUDA stream handle (kind from cudafor)
   real(wp) :: tstart, tend

   call MPI_Init(ierr)
   call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
   call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

   num_dev = omp_get_num_devices()
   dev = mod(rank, num_dev)
   call omp_set_default_device(dev)

   print *, 'Rank', rank, 'of', nprocs, 'using device', dev, 'out of', num_dev

   allocate(hx(Nfft, N2))
   hx(:, :) = (1, -1)

   !$omp target enter data map(to: hx)   ! map hx to the device once, up front

   ierr = cudaStreamCreate(stream)

   ierr = cufftPlan1d(plan, int(Nfft, c_int), CUFFT_Z2Z, int(N2, c_int))
   if (ierr /= 0) then
     print *, "cufftPlan1d failed on rank", rank, "err=", ierr
   end if

   ierr = cufftSetStream(plan, stream)

   nlocal = n / nprocs
   recvsize = nlocal
   sendsize = nlocal

   ! 16 bytes per double-complex element; buffers live in device memory
   sendptr = omp_target_alloc(sendsize*int(16, c_size_t), omp_get_default_device())
   recvptr = omp_target_alloc(recvsize*int(16, c_size_t), omp_get_default_device())
   call c_f_pointer(sendptr, sendbuf, [sendsize])
   call c_f_pointer(recvptr, recvbuf, [recvsize])

   do iter = 1, 10

      tstart = MPI_Wtime()

      call MPI_Ialltoall(sendbuf, nlocal/nprocs, MPI_C_DOUBLE_COMPLEX, &
                         recvbuf, nlocal/nprocs, MPI_C_DOUBLE_COMPLEX, &
                         MPI_COMM_WORLD, req, ierr)

   ! nowait: this target region becomes a deferred task and runs asynchronously
   !$omp target teams distribute parallel do collapse(2) nowait
      do j = 1, N2 / 2
         do i = 1, Nfft
            hx(i, j) = hx(i, j) * 2.0d0
         end do
      end do

   ! hand cuFFT the device address of the mapped array hx
   !$omp target data use_device_addr(hx)
      ierr = cufftExecZ2Z(plan, hx(1, 1), hx(1, 1), CUFFT_FORWARD)
      !$omp end target data
      if (ierr /= 0) then
        print *, "cufftExecZ2Z failed on rank", rank, "err=", ierr
      end if

      !$omp target teams distribute parallel do collapse(2) nowait
   do j = 1, N2 / 2
         do i = 1, Nfft
            hx(i, j) = hx(i, j) * 2.0d0
         end do
      end do

      call MPI_Wait(req, MPI_STATUS_IGNORE, ierr)

      !$omp taskwait   ! wait for the two deferred (nowait) target regions

      tend = MPI_Wtime()

      write(*,'(A,I0,A,I0,A,F8.6)') 'Rank ', rank, ', iter ', iter, ': timestep duration (s) = ', tend - tstart

   end do

   ierr = cufftDestroy(plan)

   call omp_target_free(sendptr, omp_get_default_device())
   call omp_target_free(recvptr, omp_get_default_device())
   call MPI_Finalize(ierr)

end program overlapping_cuda


Compiled with:

mpif90 -mp -mp=gpu overlapping.f90 -o foo -isystem /opt/nvhpc/Linux_x86_64/25.5/cuda/12.9/targets/x86_64-linux/include /opt/nvhpc/Linux_x86_64/25.5/math_libs/12.9/lib64/libcufft.so /opt/nvhpc/Linux_x86_64/25.5/cuda/12.9/targets/x86_64-linux/lib/libcudart.so  -Minfo=mp -Mcache_align -cuda

I see that the communication and computation indeed overlap (good).

But the kernels run out of order, which is not good. With pure OpenMP I would express the ordering with dependencies, i.e. depend(in) and depend(out) clauses, as in the sketch below. For the cuFFT call, I was thinking of using an OpenMP detachable task instead, but the detach clause doesn't seem to be supported by NVHPC (25.5 at least).
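For reference, this is the dependency pattern I mean for plain OpenMP offload kernels (just a sketch; I use depend(inout: hx) since both kernels read and write hx):

   !$omp target teams distribute parallel do collapse(2) nowait depend(inout: hx)
   do j = 1, N2 / 2
      do i = 1, Nfft
         hx(i, j) = hx(i, j) * 2.0d0
      end do
   end do

   ! this region may not start until the one above has completed
   !$omp target teams distribute parallel do collapse(2) nowait depend(inout: hx)
   do j = 1, N2 / 2
      do i = 1, Nfft
         hx(i, j) = hx(i, j) * 2.0d0
      end do
   end do

   !$omp taskwait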

So, what do I do in this case?

Thanks in advance :)

Hi Jonathan,

I think what's going on is that, since the cuFFT kernels are launched on a different stream, there's no guarantee that the two OpenMP offload kernels will execute in order with respect to cuFFT. When I ran your code the kernels happened to be in order, but they were still on different streams, so they could run out of order.

In OpenACC, what I've done for similar codes is set the OpenACC async queue's stream to the same one cuFFT uses, or vice versa, via the acc_[set|get]_cuda_stream API routines. For an example see: Examples — NVIDIA Fortran CUDA Interfaces 25.9 documentation
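Roughly, the relevant lines from that example look like this (a sketch, assuming the OpenACC runtime routines from the openacc module and NVHPC's cufft interfaces):

   use openacc
   use cufft
   ! ...
   ! point cuFFT at the stream backing OpenACC's synchronous queue
   ierr = cufftSetStream(plan, acc_get_cuda_stream(acc_async_sync))

   !$acc host_data use_device(hx)
   ierr = cufftExecZ2Z(plan, hx, hx, CUFFT_FORWARD)
   !$acc end host_data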

However, OpenMP doesn't provide a similar API routine. Instead, streams are assigned to an OpenMP task.

Now, I've not done this in OpenMP myself, but I believe the correct solution is to add a "depend(inout: stream)" clause on each of the two offload regions, along the lines of the sketch below.
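Applied to your demo, that would be something like this (untested sketch; the cufftExecZ2Z call in between stays as it is):

   !$omp target teams distribute parallel do collapse(2) nowait depend(inout: stream)
   do j = 1, N2 / 2
      do i = 1, Nfft
         hx(i, j) = hx(i, j) * 2.0d0
      end do
   end do

   ! ... cuFFT section unchanged ...

   !$omp target teams distribute parallel do collapse(2) nowait depend(inout: stream)
   do j = 1, N2 / 2
      do i = 1, Nfft
         hx(i, j) = hx(i, j) * 2.0d0
      end do
   end do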

Give that a try. If it doesn’t work, I’ll ask around to see if others have ideas.

-Mat