Hi Phoon,
While we don’t have a CUDA Fortran interface module that ships with the compilers, you can write your own that will work with CUDA Graphs. Below is a basic interface module with a few of the most used features.
Also, I’ve included a simple program based on the C example shown in the article you pointed to.
cuda_graph_m.f90:
module cuda_graph
use iso_c_binding
! enums
enum, bind(C) ! cudaStreamCaptureMode
enumerator :: cudaStreamCaptureModeGlobal=0
enumerator :: cudaStreamCaptureModeThreadLocal=1
enumerator :: cudaStreamCaptureModeRelaxed=2
end enum
! types
type cudaGraph
type(c_ptr) :: graph
end type cudaGraph
type cudaGraphExec
type(c_ptr) :: graph_exec
end type cudaGraphExec
type cudaGraphNode
type(c_ptr) :: graph_node
end type cudaGraphNode
! ---------
! functions
! ---------
!----------------------------------
! Additional cudaStream functions
!----------------------------------
interface
integer(c_int) function cudaStreamBeginCapture(stream, mode) &
bind(C,name='cudaStreamBeginCapture')
use cudafor
integer(cuda_stream_kind), value :: stream
integer(c_int), value :: mode
end function cudaStreamBeginCapture
end interface
interface
integer(c_int) function cudaStreamEndCapture(stream, pGraph) &
bind(C,name='cudaStreamEndCapture')
use cudafor
import cudaGraph
integer(cuda_stream_kind), value :: stream
type(cudaGraph) :: pGraph
end function cudaStreamEndCapture
end interface
!----------------------------------
! new cudaGraph functions
!----------------------------------
interface
integer(c_int) function cudaGraphCreate(pGraph, flags) &
bind(C,name='cudaGraphCreate')
import cudaGraph
type(cudaGraph) :: graph
integer :: flags
end function cudaGraphCreate
end interface
interface
integer(c_int) function cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize) &
bind(C,name='cudaGraphInstantiate')
use cudafor
import cudaGraph
import cudaGraphExec
import cudaGraphNode
type(cudaGraphExec) :: pGraphExec
type(cudaGraph), value :: graph
type(cudaGraphNode) :: pErrorNode
character(kind=C_CHAR, len=*) :: pLogBuffer
integer(c_size_t), value :: bufferSize
end function cudaGraphInstantiate
end interface
interface
integer(c_int) function cudaGraphLaunch(graphExec, stream) &
bind(C,name='cudaGraphLaunch')
use cudafor
import cudaGraphExec
type(cudaGraphExec), value :: graphExec
integer(cuda_stream_kind), value :: stream
end function cudaGraphLaunch
end interface
end module
main_graph.f90:
program main
use cudafor
use cuda_graph
use iso_c_binding
implicit none
integer :: i, n, istat
integer(kind=cuda_stream_kind) :: stream
integer, device, allocatable :: a(:)
type(cudaGraph) :: graph
type(cudaGraphExec) :: graph_exec
type(cudaGraphNode) :: error_node
character(c_char) :: buffer
integer(c_size_t) :: buffer_len
istat = cudaStreamCreateWithFlags(stream, cudaStreamNonBlocking)
print*, istat
!istat = cudaGraphCreate(graph, 0)
allocate(a(1024))
istat = cudaStreamBeginCapture(stream, 0)
print*, istat
do n = 1,1000
!$cuf kernel do <<<*,*,stream=stream>>>
do i = 1, 1024
a(i) = 1234
end do
end do
istat = cudaStreamEndCapture(stream, graph)
print*, istat
buffer_len = 0
istat = cudaGraphInstantiate(graph_exec, graph, error_node, buffer, buffer_len)
print*, istat
istat = cudaGraphLaunch(graph_exec, stream)
print*, istat
istat = cudaStreamSynchronize(stream)
end program
Hope this helps,
Mat