I'm checking the asynchronous execution of a simple SAXPY using CUDA Fortran. I got the profile shown in the attached picture, but I was expecting more overlap between the GPU and CPU work. Can you see if I'm missing something?
module mathOps
contains
  !> SAXPY kernel: y(i) = y(i) + a*x(i), one thread per element.
  !>
  !> x, y : device arrays of length N
  !> a    : scalar multiplier (passed by value, as required for host scalars)
  !> N    : element count (passed by value, as required for host scalars)
  attributes(global) subroutine saxpy(x, y, a, N)
    implicit none
    ! NOTE(fix): scalar dummies of a global kernel must have the VALUE
    ! attribute. The original declared a local "integer :: i, n"; Fortran is
    ! case-insensitive, so that "n" IS the dummy N -- it left N without VALUE
    ! (illegal for a host scalar) and "n = size(x)" clobbered the argument.
    integer, value :: N
    real :: x(N), y(N)
    real, value :: a
    integer :: i
    ! Global thread index (1-based); guard against the partial last block.
    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
    if (i <= N) y(i) = y(i) + a*x(i)
  end subroutine saxpy
end module mathOps
!> Driver: issue H2D copies, the SAXPY kernel, and the D2H copy asynchronously
!> on one stream, then do independent CPU work (on x) to overlap with the GPU.
program testSaxpy
  use mathOps
  use cudafor
  use nvtx                                  ! NVTX ranges for profiler annotation
  implicit none
  integer, parameter :: N = 400000
  ! Pinned host memory is required for cudaMemcpyAsync to be truly asynchronous.
  real, dimension(:), allocatable, pinned :: x, y
  real :: a
  real, device :: x_d(N), y_d(N)
  integer :: istat, i
  type(dim3) :: grid, tBlock
  integer(kind=cuda_stream_kind) :: stream(3), str   ! stream IDs
  !
  allocate(x(N), y(N))
  do i = 1, 3
     istat = cudaStreamCreate(stream(i))
     ! FIX: original "print ," with curly quotes was a syntax error.
     if (istat /= 0) print *, 'Error in Stream creation', i
  end do
  str = stream(1)
  !
  tBlock = dim3(256, 1, 1)
  grid   = dim3(ceiling(real(N)/tBlock%x), 1, 1)
  !
  x = 1.0; y = 2.0; a = 2.0
  !
  ! FIX: curly quotes in the NVTX labels replaced by plain double quotes.
  call nvtxStartRange("cpy", 1)
  istat = cudaMemcpyAsync(x_d, x, N, str)
  istat = cudaMemcpyAsync(y_d, y, N, str)
  call nvtxEndRange
  call nvtxStartRange("krn", 2)
  ! FIX: the kernel takes four arguments; N was missing from the launch.
  call saxpy<<<grid, tBlock, 0, str>>>(x_d, y_d, a, N)
  call nvtxEndRange
  call nvtxStartRange("cpyb", 3)
  istat = cudaMemcpyAsync(y, y_d, N, str)
  call nvtxEndRange
  call nvtxStartRange("cpu", 4)
  ! CPU work on x overlaps the stream-1 GPU work above: the GPU never writes x,
  ! so no synchronization is needed before reading it here.
  ! FIX: original "write(,*)" was a syntax error.
  write(*,*) 'Max error: ', maxval(abs(x-1))
  call nvtxEndRange
  !
  ! FIX: ensure the async D2H copy of y has completed before the pinned buffer
  ! is deallocated at program exit; then release the streams.
  istat = cudaStreamSynchronize(str)
  do i = 1, 3
     istat = cudaStreamDestroy(stream(i))
  end do
  deallocate(x, y)
end program testSaxpy