I was wondering if the following “algorithm” would work to asynchronously
transfer data to the device while other work (which will eventually supplement the data being transferred) is done on the CPU …
Two Questions:
(1) Is it OK to mix Fortran allocate, etc., with the CUDA runtime API?
(2) To transfer the 3D array as shown, can I use a cudaMemcpyAsync()
or do I need a cudaMemcpy3DAsync() whereby I need to set pitch
and depth parameters to the array prior to transferring?
Thanks
… program below …
module cuda_gen
  !> Holds a pinned (page-locked) host array and its device-resident
  !> mirror, plus an allocator that sizes both identically.
  use cudafor
  implicit none
  ! Pinned host memory is required for the copy to be truly asynchronous.
  real, pinned, allocatable :: a_cpu(:,:,:)
  real, device, allocatable :: a_dev(:,:,:)
contains
  !> Allocate matching host and device arrays of shape (ix, jx, kx).
  subroutine set_memory(ix, jx, kx)
    integer, intent(in) :: ix, jx, kx
    allocate(a_cpu(ix, jx, kx))
    allocate(a_dev(ix, jx, kx))
  end subroutine set_memory
end module cuda_gen
program main
  !> Fill the pinned host array, launch an asynchronous host-to-device
  !> copy on a dedicated stream, overlap unrelated CPU work with the
  !> transfer, then synchronize before the data is needed.
  use cuda_gen
  implicit none
  integer :: ix, jx, kx, istat
  ! Stream handles have their own kind in CUDA Fortran.
  integer(kind=cuda_stream_kind) :: id_stream

  ix = 90
  jx = 10
  kx = 13
  call set_memory(ix, jx, kx)

  a_cpu(:,:,:) = 0.0   ! ... fill with real data here ...

  istat = cudaStreamCreate(id_stream)
  ! A contiguous allocatable 3D array can be copied flat: pass the total
  ! element count (ix*jx*kx, not the undeclared name "ixjxkx"). No
  ! pitched cudaMemcpy3DAsync is needed for contiguous storage.
  ! The stream must be passed by keyword: the fourth positional argument
  ! of cudaMemcpyAsync is the copy-direction kind, not the stream.
  istat = cudaMemcpyAsync(a_dev, a_cpu, ix*jx*kx, stream=id_stream)

  ! ... do other CPU work here, but do not touch a_cpu until the
  !     stream has been synchronized ...

  istat = cudaStreamSynchronize(id_stream)
  istat = cudaStreamDestroy(id_stream)
end program main