Hi,
I am trying to accelerate FVCOM by cudaFortran.
But it is slower than CPU.
I found that data copying took up most of the time.
How can I improve it?
Host code:
SUBROUTINE N2E2D(NVAR,EVAR)
use cudafor
use cuda_N2E2D
IMPLICIT NONE
REAL(SP), DIMENSION(0:MT), INTENT(IN) :: NVAR
REAL(SP), DIMENSION(0:NT), INTENT(INOUT) :: EVAR
INTEGER I,K
!sub1
REAL(SP), device ,allocatable:: EVAR_d(:)
REAL(SP), device ,allocatable:: NVAR_d(:)
INTEGER, device ,allocatable:: NV_d(:,:)
INTEGER I,K
ALLOCATE(NV_d(0:NT,4))
ALLOCATE(NVAR_d(0:MT))
ALLOCATE(EVAR_d(0:NT))
NV_d=NV
NVAR_d=NVAR
EVAR_d=EVAR
blocksize=128
blocknum=(NT/blocksize)+1
call N2E2D1<<<blocknum,blocksize>>>(NT,ONE_THIRD,EVAR_d,NVAR_d,NV_d)
EVAR=EVAR_d
deallocate(NV_d)
deallocate(NVAR_d)
deallocate(EVAR_d)
RETURN
END SUBROUTINE N2E2D
Device code:
module cuda_N2E2D
USE MOD_PREC
contains
attributes(global) subroutine N2E2D1(NT,ONE_THIRD,EVAR,NVAR,NV_d)
integer ,value :: NT
REAL(DP),value :: ONE_THIRD
REAL(SP), device :: EVAR(0:)
REAL(SP), device :: NVAR(0:)
INTEGER, device :: NV_d(0:,:)
integer :: id
id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1
IF(id <=NT)THEN
EVAR(id) = ONE_THIRD*(NVAR(NV_d(id,1))+NVAR(NV_d(id,2))+NVAR(NV_d(id,3)))
end if
end subroutine N2E2D1
end module cuda_N2E2D
nvprof result:
Thanks,
wjx