CUDA Graph in Cuda Fortran

Hi Phoon,

While we don’t have a CUDA Fortran interface module that ships with the compilers, you can write your own that will work with CUDA Graphs. Below is a basic interface module with a few of the most used features.

Also, I’ve included a simple program based on the C example shown in the article you pointed to.

cuda_graph_m.f90:

 module cuda_graph
  use iso_c_binding

  ! enums

  enum, bind(C) ! cudaStreamCaptureMode
     enumerator :: cudaStreamCaptureModeGlobal=0
     enumerator :: cudaStreamCaptureModeThreadLocal=1
     enumerator :: cudaStreamCaptureModeRelaxed=2
  end enum

  ! types

  type cudaGraph
    type(c_ptr) :: graph
  end type cudaGraph

  type cudaGraphExec
    type(c_ptr) :: graph_exec
  end type cudaGraphExec

  type cudaGraphNode
    type(c_ptr) :: graph_node
  end type cudaGraphNode

  ! ---------
  ! functions
  ! ---------

  !----------------------------------
  ! Additional cudaStream functions
  !----------------------------------
  interface
     integer(c_int) function cudaStreamBeginCapture(stream, mode) &
          bind(C,name='cudaStreamBeginCapture')
       use cudafor
       integer(cuda_stream_kind), value ::  stream
       integer(c_int), value :: mode
     end function cudaStreamBeginCapture
  end interface

  interface
     integer(c_int) function cudaStreamEndCapture(stream, pGraph) &
          bind(C,name='cudaStreamEndCapture')
       use cudafor
       import cudaGraph
       integer(cuda_stream_kind), value ::  stream
       type(cudaGraph) :: pGraph
     end function cudaStreamEndCapture
  end interface


  !----------------------------------
  ! new cudaGraph functions
  !----------------------------------
  interface
     integer(c_int) function cudaGraphCreate(pGraph, flags) &
          bind(C,name='cudaGraphCreate')
       import cudaGraph
       type(cudaGraph) :: graph
       integer :: flags
     end function cudaGraphCreate
  end interface

  interface
     integer(c_int) function cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize) &
          bind(C,name='cudaGraphInstantiate')
       use cudafor
       import cudaGraph
       import cudaGraphExec
       import cudaGraphNode
       type(cudaGraphExec) :: pGraphExec
       type(cudaGraph), value :: graph
       type(cudaGraphNode) :: pErrorNode
       character(kind=C_CHAR, len=*) :: pLogBuffer
       integer(c_size_t), value :: bufferSize
     end function cudaGraphInstantiate
  end interface

  interface
     integer(c_int) function cudaGraphLaunch(graphExec, stream) &
          bind(C,name='cudaGraphLaunch')
       use cudafor
       import cudaGraphExec
       type(cudaGraphExec), value :: graphExec
       integer(cuda_stream_kind), value ::  stream
     end function cudaGraphLaunch
  end interface

end module

main_graph.f90:

program main
  use cudafor
  use cuda_graph
  use iso_c_binding
  implicit none

  integer :: i, n, istat
  integer(kind=cuda_stream_kind) :: stream
  integer, device, allocatable :: a(:)
  type(cudaGraph) :: graph
  type(cudaGraphExec) :: graph_exec
  type(cudaGraphNode) :: error_node
  character(c_char) :: buffer
  integer(c_size_t) :: buffer_len

  istat = cudaStreamCreateWithFlags(stream, cudaStreamNonBlocking)
  print*, istat
  !istat = cudaGraphCreate(graph, 0)

  allocate(a(1024))

  istat = cudaStreamBeginCapture(stream, 0)
  print*, istat
  do n = 1,1000
    !$cuf kernel do <<<*,*,stream=stream>>>
    do i = 1, 1024
      a(i) = 1234
    end do
  end do
  istat = cudaStreamEndCapture(stream, graph)
  print*, istat

  buffer_len = 0
  istat = cudaGraphInstantiate(graph_exec, graph, error_node, buffer, buffer_len)
  print*, istat

  istat = cudaGraphLaunch(graph_exec, stream)
  print*, istat

  istat = cudaStreamSynchronize(stream)


end program

Hope this helps,
Mat