Why CudaMemcpyH2D cost a lot of time?

Hi,

I am trying to accelerate FVCOM by cudaFortran.
But it is slower than CPU.
I found that data copying took up most of the time.
How can I improve it?

Host code:



   SUBROUTINE N2E2D(NVAR,EVAR)
   use cudafor
   use cuda_N2E2D

   IMPLICIT NONE
   REAL(SP), DIMENSION(0:MT), INTENT(IN)  :: NVAR
   REAL(SP), DIMENSION(0:NT), INTENT(INOUT) :: EVAR
   INTEGER I,K

   !sub1
   REAL(SP), device ,allocatable:: EVAR_d(:)
   REAL(SP), device ,allocatable:: NVAR_d(:)
   INTEGER, device ,allocatable:: NV_d(:,:)
   INTEGER I,K

   ALLOCATE(NV_d(0:NT,4))
   ALLOCATE(NVAR_d(0:MT))
   ALLOCATE(EVAR_d(0:NT))

   NV_d=NV
   NVAR_d=NVAR
   EVAR_d=EVAR

   blocksize=128
   blocknum=(NT/blocksize)+1
   call N2E2D1<<<blocknum,blocksize>>>(NT,ONE_THIRD,EVAR_d,NVAR_d,NV_d)
   EVAR=EVAR_d


   deallocate(NV_d)
   deallocate(NVAR_d)
   deallocate(EVAR_d)

   RETURN
   END SUBROUTINE N2E2D

Device code:

module cuda_N2E2D
    USE MOD_PREC
    contains
      attributes(global) subroutine N2E2D1(NT,ONE_THIRD,EVAR,NVAR,NV_d)
        integer ,value :: NT
        REAL(DP),value :: ONE_THIRD

        REAL(SP), device :: EVAR(0:)
        REAL(SP), device :: NVAR(0:)
        INTEGER, device :: NV_d(0:,:)

        integer :: id
        id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1

        IF(id <=NT)THEN
            EVAR(id) = ONE_THIRD*(NVAR(NV_d(id,1))+NVAR(NV_d(id,2))+NVAR(NV_d(id,3)))
        end if

      end subroutine N2E2D1
end module cuda_N2E2D

nvprof result:

Thanks,
wjx