The cudaMemcpy operation consumes as much as 50% of the time

Hi,

I am trying to accelerate FVCOM with CUDA Fortran.
However, I have noticed that the program runs much slower with CUDA acceleration than on a single CPU core. Profiling with nvprof shows that a significant amount of time is spent on data transfers. Could you please take a look at my code and see if there are any issues? How can I optimize this? The time spent on data transfers seems highly unreasonable to me.

Host code:

SUBROUTINE WET_JUDGE
   USE MOD_PREC
   USE ALL_VARS
   USE MOD_PAR
   use cuda_WET_JUDGE

   IMPLICIT NONE
   REAL(SP) :: DTMP
   INTEGER  :: ITA_TEMP
   INTEGER  :: I,IL,IA,IB,K1,K2,K3,K4,K5,K6

   integer :: KT

   ! blocksize and blocknum are used below but were not declared in the post
   integer :: blocksize, blocknum

   !sub1
   REAL(SP), ALLOCATABLE,device :: H_d(:)
   REAL(SP), ALLOCATABLE,device :: ELF_d(:)
   INTEGER , ALLOCATABLE,device :: ISWETN_d(:)

   !sub2
   INTEGER , device,ALLOCATABLE :: ISWETC_d(:)
   INTEGER, device,ALLOCATABLE :: NV_d(:,:)

   !sub3
   INTEGER, ALLOCATABLE,device :: NBVE_d(:,:)
   INTEGER, ALLOCATABLE,device :: NTVE_d(:)

   !sub4
   REAL(SP), ALLOCATABLE,device :: ELF1_d(:)


   !sub1
   ALLOCATE(H_d(0:MT))
   ALLOCATE(ELF_d(0:MT))
   ALLOCATE(ISWETN_d(0:MT))
   ISWETN = 1
   H_d=H
   ELF_d=ELF
   ISWETN_d=ISWETN
   blocksize=256            ! assumed; the post does not show where blocksize is set
   blocknum=(M/blocksize)+1
   call WET_JUDGE1<<<blocknum,blocksize>>>(M,MIN_DEPTH,H_d,ELF_d,ISWETN_d)
   ISWETN=ISWETN_d

   !sub2
   ALLOCATE(ISWETC_d(0:NT))
   ALLOCATE(NV_d(0:NT,4))
   ISWETC = 1
   ISWETC_d=ISWETC
   NV_d=NV
   blocknum=(N/blocksize)+1
   call WET_JUDGE2<<<blocknum,blocksize>>>(N,MIN_DEPTH,NV_d,H_d,ELF_d,ISWETC_d)
   ISWETC=ISWETC_d

   !sub3
   ALLOCATE(NBVE_d(M,MX_NBR_ELEM+1))
   ALLOCATE(NTVE_d(0:MT))
   NBVE_d=NBVE
   NTVE_d=NTVE
   blocknum=(M/blocksize)+1
   call WET_JUDGE3<<<blocknum,blocksize>>>(M,ISWETC_d,NBVE_d,NTVE_d,ISWETN_d)
   ISWETN=ISWETN_d


   !sub4
   ALLOCATE(ELF1_d(0:NT))
   ELF1_d=ELF1
   blocknum=(N/blocksize)+1   ! recompute: WET_JUDGE4 covers N elements, not M nodes
   call WET_JUDGE4<<<blocknum,blocksize>>>(N,ONE_THIRD,ELF1_d,ELF_d,NV_d)
   ELF1=ELF1_d

   END SUBROUTINE WET_JUDGE

Device code:

module cuda_WET_JUDGE
    USE MOD_PREC
    contains
      attributes(global) subroutine WET_JUDGE1(M,MIN_DEPTH,H,ELF,ISWETN_d)
        integer ,value :: M
        REAL(SP) ,value :: MIN_DEPTH

        REAL(SP), device :: H(0:)
        REAL(SP), device :: ELF(0:)
        INTEGER , device :: ISWETN_d(0:)

        REAL(SP) :: DTMP
        integer :: id
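        ! 1-based global thread index (the same pattern is used in every kernel below)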
        id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1

        IF(id <=M)THEN
            DTMP = H(id) + ELF(id)

            IF((DTMP - MIN_DEPTH) < 1.0E-5_SP) ISWETN_d(id) = 0
        end if

      end subroutine WET_JUDGE1


      attributes(global) subroutine WET_JUDGE2(N,MIN_DEPTH,NV,H,ELF,ISWETC_d)
      integer ,value :: N
      REAL(SP) ,value :: MIN_DEPTH

      INTEGER, device :: ISWETC_d(0:)
      INTEGER, device :: NV(0:,:)
      REAL(SP), device :: ELF(0:)
      REAL(SP), device :: H(0:)


      REAL(SP) :: DTMP
      integer :: id
      id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1


      IF(id <=N)THEN
        DTMP =  MAX(ELF(NV(id,1)),ELF(NV(id,2)),ELF(NV(id,3)))  + &
                MIN(  H(NV(id,1)),  H(NV(id,2)),  H(NV(id,3)))

        IF((DTMP - MIN_DEPTH) < 1.0E-5_SP) ISWETC_d(id) = 0
      end if

    end subroutine WET_JUDGE2


    attributes(global) subroutine WET_JUDGE3(M,ISWETC_d,NBVE,NTVE,ISWETN_d)
    integer ,value :: M

    INTEGER,device :: ISWETC_d(0:)
    INTEGER,device :: NBVE(:,:)
    INTEGER,device :: NTVE(0:)
    INTEGER,device :: ISWETN_d(0:)

    integer :: id
    id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1


    IF(id <=M)THEN
        IF(SUM(ISWETC_d(NBVE(id,1:NTVE(id)))) == 0)  ISWETN_d(id) = 0
    end if

  end subroutine WET_JUDGE3


    attributes(global) subroutine WET_JUDGE4(N,ONE_THIRD,ELF1,ELF,NV)
        integer ,value :: N
        REAL(DP), value :: ONE_THIRD

        REAL(SP), device :: ELF1(0:)
        REAL(SP), device :: ELF(0:)
        INTEGER, device :: NV(0:,:)

        integer :: id
        id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1

        IF(id <=N)THEN
            ELF1(id) = ONE_THIRD*(ELF(NV(id,1))+ELF(NV(id,2))+ELF(NV(id,3)))
        end if

    end subroutine WET_JUDGE4
end module cuda_WET_JUDGE

nvprof result:

I wonder if it could be due to the data copying method I’m using, or perhaps because I’m calling four kernels in one subroutine? How can I fix it?

Thanks,
wjx

That is a trivially small piece of code, and it’s commonly the case that the GPU doesn’t deliver attractive performance when you are doing almost no work on it.

The cost of moving data to and from the GPU must be amortized by doing enough work on the GPU for it to outrun the CPU. You probably don’t have that situation here, with what you have shown.

Your kernel durations are in the 2-5 microsecond range. That is not long enough to be interesting on the GPU. To “fix” this, you would need to take a larger-scope view of the work you are doing, and see whether you can move more work to the GPU. In so doing, you would want to get away from the pattern of:

  1. move data to the GPU
  2. call a single, trivially small/short kernel
  3. move results back to host

Instead you would want to (see the sketch after this list):

  1. move data to the GPU
  2. call a kernel
  3. call a kernel
  4. call a kernel
  5. call a kernel
  6. call a kernel
  7. copy results back to host
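
As a concrete illustration, here is a minimal CUDA Fortran sketch of WET_JUDGE restructured that way. It reuses the kernels and array names from your post, assumes the device arrays are already allocated, and drops the intermediate device-to-host copies between launches:

   ! copy inputs to the device once
   H_d      = H
   ELF_d    = ELF
   ELF1_d   = ELF1
   NV_d     = NV
   NBVE_d   = NBVE
   NTVE_d   = NTVE
   ISWETN_d = 1          ! initialize directly on the device
   ISWETC_d = 1

   ! launch all four kernels back-to-back; no host round trips in between
   call WET_JUDGE1<<<(M/blocksize)+1,blocksize>>>(M,MIN_DEPTH,H_d,ELF_d,ISWETN_d)
   call WET_JUDGE2<<<(N/blocksize)+1,blocksize>>>(N,MIN_DEPTH,NV_d,H_d,ELF_d,ISWETC_d)
   call WET_JUDGE3<<<(M/blocksize)+1,blocksize>>>(M,ISWETC_d,NBVE_d,NTVE_d,ISWETN_d)
   call WET_JUDGE4<<<(N/blocksize)+1,blocksize>>>(N,ONE_THIRD,ELF1_d,ELF_d,NV_d)

   ! copy results back once
   ISWETN = ISWETN_d
   ISWETC = ISWETC_d
   ELF1   = ELF1_d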

The profiler data suggests that the sequence of events here is something like

  1. cudaMalloc()
  2. cudaMemcpy()
  3. launch kernel
  4. cudaMemcpy()
  5. cudaFree()

  1. cudaMalloc()
  2. cudaMemcpy()
  3. launch kernel
    etc.

This is close to “optimally bad”. cudaMalloc() and cudaFree() are expensive API calls, and should be issued infrequently. Ideally one would allocate once at application startup, and free once at application shutdown, reusing existing allocations in between.
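
In CUDA Fortran, one way to get there is to give the device arrays module scope so they persist across calls, allocating them once during setup. A minimal sketch, with hypothetical names (a module like this does not exist in FVCOM):

module wet_judge_dev
   use MOD_PREC
   implicit none
   ! device arrays owned by the module: allocated once, reused on every call
   real(SP), allocatable, device :: H_d(:), ELF_d(:), ELF1_d(:)
   integer,  allocatable, device :: ISWETN_d(:), ISWETC_d(:)
contains
   subroutine wet_judge_dev_init(MT, NT)
      ! call once at startup, after the grid sizes are known
      integer, intent(in) :: MT, NT
      allocate(H_d(0:MT), ELF_d(0:MT), ISWETN_d(0:MT))
      allocate(ELF1_d(0:NT), ISWETC_d(0:NT))
   end subroutine wet_judge_dev_init

   subroutine wet_judge_dev_free()
      ! call once at shutdown
      deallocate(H_d, ELF_d, ELF1_d, ISWETN_d, ISWETC_d)
   end subroutine wet_judge_dev_free
end module wet_judge_dev

WET_JUDGE would then use wet_judge_dev instead of declaring (and re-allocating) local device arrays on every call.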

Likewise, copying data between host and device (in either direction) is expensive: the bandwidth of the PCIe interconnect (e.g. 25 GB/sec) is only a fraction of both the host memory bandwidth (e.g. 200 GB/sec) and the GPU’s onboard memory bandwidth (e.g. 1000 GB/sec). Data movement should therefore be minimized to increase the ratio of actual computation to data movement.
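
For a sense of scale: one million REAL(SP) values is 4 MB, which takes roughly 160 microseconds each way over a 25 GB/sec link. A single transfer of that size therefore costs more than thirty times as much as one of your 2-5 microsecond kernels.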


Hi, thank you very much for your advice.
I did what you suggested, but the results were no better.
I think the key point is not whether I call the kernels back-to-back or spread them out.
Have a look at the example below, which is the most computationally intensive part of FVCOM. It uses only one kernel, yet cudaMemcpy takes up to 72% of the time.
(Because the full program is large, I am only showing the parts related to the GPU calculations.)

Host code:

   SUBROUTINE ADVAVE_EDGE_GCN(XFLUX,YFLUX)

   USE ALL_VARS
   USE MOD_UTILS
   USE MOD_SPHERICAL
   USE MOD_NORTHPOLE
   use cudafor
   use cuda_advave_EDGE_GCN
   ......

   !kernel1
   INTEGER,  allocatable,device :: IEC_d(:,:)
   INTEGER ,  allocatable,device :: ISWETC_d(:)
   INTEGER ,  allocatable,device :: ISWETCE_d(:)
   INTEGER,  allocatable,device :: IENODE_d(:,:)
   INTEGER,  allocatable,device :: NBE_d(:,:)
   INTEGER,  allocatable,device :: ISBC_d(:)
   REAL(SP), allocatable,device :: XIJC_d(:)
   REAL(SP), allocatable,device :: YIJC_d(:)
   REAL(SP), allocatable,device :: XC_d(:)               !!X-COORD AT FACECENTER
   REAL(SP), allocatable, device :: YC_d(:)
   REAL(SP), allocatable, device :: A1U_d(:,:)
   REAL(SP), allocatable, device :: A2U_d(:,:)
   REAL(SP), allocatable,device :: DLTXC_d(:)
   REAL(SP), allocatable,device :: DLTYC_d(:)
   REAL(SP), allocatable, device :: ART_d(:)
   REAL(SP), allocatable, device :: CC_HVC_d(:)
   REAL(SP), allocatable, device :: D_d(:)
   REAL(SP), allocatable, device :: EL_d(:)
   REAL(SP), allocatable, device :: UA_d(:)
   REAL(SP), allocatable, device :: VA_d(:)
   REAL(SP), allocatable, device :: IUCP_d(:)
   REAL(SP), allocatable, device :: PSTX_d(:)
   REAL(SP), allocatable, device :: PSTY_d(:)
   REAL(SP), allocatable,device :: GRAV_E_d(:)
   REAL(SP), allocatable, device :: D1_d(:)
   REAL(SP), allocatable, device :: XFLUX_d(:)
   REAL(SP), allocatable, device :: YFLUX_d(:)
   REAL(SP), allocatable,device :: EPOR_d(:)


   ......

!kernel1
     ALLOCATE(IEC_d(NE,2))
     ALLOCATE(ISWETC_d(0:NT))
     ALLOCATE(ISWETCE_d(0:NT))
     ALLOCATE(IENODE_d(NE,2))
     ALLOCATE(NBE_d(0:NT,3))
     ALLOCATE(ISBC_d(NE))
     ALLOCATE(XIJC_d(NE))
     ALLOCATE(YIJC_d(NE))
     ALLOCATE(XC_d(0:NT))
     ALLOCATE(YC_d(0:NT))
     ALLOCATE(A1U_d(0:NT,4))
     ALLOCATE(A2U_d(0:NT,4))
     ALLOCATE(DLTXC_d(NE))
     ALLOCATE(DLTYC_d(NE))
     ALLOCATE(ART_d(0:NT))
     ALLOCATE(CC_HVC_d(0:NT))
     ALLOCATE(D_d(0:MT))
     ALLOCATE(EL_d(0:MT))
     ALLOCATE(UA_d(0:NT))
     ALLOCATE(VA_d(0:NT))
     ALLOCATE(IUCP_d(0:NT))
     ALLOCATE(PSTX_d(0:NT))
     ALLOCATE(PSTY_d(0:NT))
     ALLOCATE(GRAV_E_d(0:NT))
     ALLOCATE(D1_d(0:NT))
     ALLOCATE(XFLUX_d(0:NT))
     ALLOCATE(YFLUX_d(0:NT))
     ALLOCATE(EPOR_d(0:NT))

     IEC_d=IEC
     ISWETC_d=ISWETC
     ISWETCE_d=ISWETCE
     IENODE_d=IENODE
     NBE_d=NBE
     ISBC_d=ISBC
     XIJC_d=XIJC
     YIJC_d=YIJC
     XC_d=XC
     YC_d=YC
     A1U_d=A1U
     A2U_d=A2U
     DLTXC_d=DLTXC
     DLTYC_d=DLTYC
     ART_d=ART
     CC_HVC_d=CC_HVC
     D_d=D
     EL_d=EL
     UA_d=UA
     VA_d=VA
     IUCP_d=IUCP
     PSTX_d=PSTX
     PSTY_d=PSTY
     GRAV_E_d=GRAV_E
     D1_d=D1
     XFLUX_d=XFLUX
     YFLUX_d=YFLUX
     EPOR_d=EPOR

     blocksize=256
     blocknum=(NE/blocksize)+1

             !   DO xx=nt-15,nt
             !           write(*,*)'cpu',ua(xx)
             !   end do


     ! block size fixed at blocksize (the original launched blocksize+1 = 257
     ! threads per block, which is not a multiple of the 32-thread warp size)
     call advave_EDGE_GCN1<<<blocknum,blocksize>>>(NE,HPRNU,FACT,FM1,IEC_d, &
          ISWETC_d,ISWETCE_d,IENODE_d,NBE_d,ISBC_d,XIJC_d,YIJC_d,XC_d,YC_d, &
          A1U_d,A2U_d,DLTXC_d,DLTYC_d,ART_d,CC_HVC_d,D_d,EL_d,UA_d,VA_d, &
          IUCP_d,PSTX_d,PSTY_d,GRAV_E_d,D1_d,XFLUX_d,YFLUX_d,EPOR_d,nt,mt)

     XFLUX=XFLUX_d
     YFLUX=YFLUX_d
     PSTX=PSTX_d
     PSTY=PSTY_d

     ......

Device Code:

module cuda_advave_EDGE_GCN
    USE MOD_PREC
    USE CUDAFOR
    contains

      attributes(global) subroutine advave_EDGE_GCN1(NE,HPRNU,FACT,FM1, &
           IEC,ISWETC,ISWETCE,IENODE,NBE,ISBC,XIJC,YIJC,XC,YC,A1U,A2U, &
           DLTXC,DLTYC,ART,CC_HVC,D,EL,UA,VA,IUCP,PSTX,PSTY,GRAV_E,D1, &
           XFLUX,YFLUX,EPOR,nt,mt)


        implicit none
        integer,value :: NE
        integer,value :: nt
        integer,value :: mt
        REAL(SP),value:: HPRNU
        REAL(SP) , value :: FACT
        REAL(SP) , VALUE :: FM1

        INTEGER, device :: IEC(:,:)
        INTEGER , device :: ISWETC(0:)
        INTEGER , device :: ISWETCE(0:)
        INTEGER, device :: IENODE(:,:)
        INTEGER, device :: NBE(0:,:)
        INTEGER, device :: ISBC(:)
        REAL(SP),device :: XIJC(:)
        REAL(SP),device :: YIJC(:)
        REAL(SP), device :: XC(0:)               !!X-COORD AT FACE CENTER
        REAL(SP), device :: YC(0:)
        REAL(SP), device :: A1U(0:,:)
        REAL(SP), device :: A2U(0:,:)
        REAL(SP),device :: DLTXC(:)
        REAL(SP),device :: DLTYC(:)
        REAL(SP), device :: ART(0:)
        REAL(SP), device :: CC_HVC(0:)
        REAL(SP), device :: D(0:)
        REAL(SP), device :: EL(0:)
        REAL(SP), device :: UA(0:)
        REAL(SP), device :: VA(0:)
        REAL(SP), device :: IUCP(0:)
        REAL(SP), device :: PSTX(0:)
        REAL(SP), device :: PSTY(0:)
        REAL(SP),device :: GRAV_E(0:)
        REAL(SP), device :: D1(0:)
        REAL(SP), device :: XFLUX(0:)
        REAL(SP), device :: YFLUX(0:)
        REAL(SP),device :: EPOR(0:)

        INTEGER  :: i,J,K,IA,IB,J1,J2,K1,K2,K3,I1,I2,II,xx,istat
        REAL(SP) :: UIJ1,VIJ1,UIJ2,VIJ2,FXX,FYY
        REAL(SP) :: DIJ,ELIJ,XIJ,YIJ,UIJ,VIJ
        REAL(SP) :: A1UIA1,A1UIA2,A1UIA3,A1UIA4,A2UIA1,A2UIA2,A2UIA3,A2UIA4
        REAL(SP) :: A1UIB1,A1UIB2,A1UIB3,A1UIB4,A2UIB1,A2UIB2,A2UIB3,A2UIB4
        REAL(SP) :: COFA1,COFA2,COFA3,COFA4,COFA5,COFA6,COFA7,COFA8
        INTEGER  :: J11,J12,J21,J22,E1,E2,ISBCE1,ISBC_TMP,IB_TMP
        REAL(SP) :: XADV,YADV,TXXIJ,TYYIJ,TXYIJ,UN_TMP
        REAL(SP) :: VISCOF,VISCOF1,VISCOF2,TEMP
        REAL(SP) :: ISWETTMP


        integer :: id
        id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1


        IF(id <=NE)THEN

          IA=IEC(id,1)
          IB=IEC(id,2)

          J1=IENODE(id,1)
          J2=IENODE(id,2)

          DIJ=0.5_SP*(D(J1)+D(J2))
          ELIJ=0.5_SP*(EL(J1)+EL(J2))




          IF(ISWETCE(IA)*ISWETC(IA) == 1 .OR. ISWETCE(IB)*ISWETC(IB) ==1)THEN
     !    FLUX FROM LEFT
          K1=NBE(IA,1)
          K2=NBE(IA,2)
          K3=NBE(IA,3)

          A1UIA1 = A1U(IA,1)
          A1UIA2 = A1U(IA,2)
          A1UIA3 = A1U(IA,3)
          A1UIA4 = A1U(IA,4)
          A2UIA1 = A2U(IA,1)
          A2UIA2 = A2U(IA,2)
          A2UIA3 = A2U(IA,3)
          A2UIA4 = A2U(IA,4)
     !---------------------------------------------------------------
          COFA1=A1UIA1*UA(IA)+A1UIA2*UA(K1)+A1UIA3*UA(K2)+A1UIA4*UA(K3)
          COFA2=A2UIA1*UA(IA)+A2UIA2*UA(K1)+A2UIA3*UA(K2)+A2UIA4*UA(K3)
          COFA5=A1UIA1*VA(IA)+A1UIA2*VA(K1)+A1UIA3*VA(K2)+A1UIA4*VA(K3)
          COFA6=A2UIA1*VA(IA)+A2UIA2*VA(K1)+A2UIA3*VA(K2)+A2UIA4*VA(K3)

          XIJ=XIJC(id)-XC(IA)
          YIJ=YIJC(id)-YC(IA)
          UIJ1=UA(IA)+COFA1*XIJ+COFA2*YIJ
          VIJ1=VA(IA)+COFA5*XIJ+COFA6*YIJ

     !    FLUX FROM RIGHT
          K1=NBE(IB,1)
          K2=NBE(IB,2)
          K3=NBE(IB,3)
          IB_TMP = IB

          A1UIB1 = A1U(IB_TMP,1)
          A1UIB2 = A1U(IB_TMP,2)
          A1UIB3 = A1U(IB_TMP,3)
          A1UIB4 = A1U(IB_TMP,4)
          A2UIB1 = A2U(IB_TMP,1)
          A2UIB2 = A2U(IB_TMP,2)
          A2UIB3 = A2U(IB_TMP,3)
          A2UIB4 = A2U(IB_TMP,4)

          COFA3=A1UIB1*UA(IB_TMP)+A1UIB2*UA(K1)+A1UIB3*UA(K2)+A1UIB4*UA(K3)
          COFA4=A2UIB1*UA(IB_TMP)+A2UIB2*UA(K1)+A2UIB3*UA(K2)+A2UIB4*UA(K3)
          COFA7=A1UIB1*VA(IB_TMP)+A1UIB2*VA(K1)+A1UIB3*VA(K2)+A1UIB4*VA(K3)
          COFA8=A2UIB1*VA(IB_TMP)+A2UIB2*VA(K1)+A2UIB3*VA(K2)+A2UIB4*VA(K3)

          XIJ=XIJC(id)-XC(IB_TMP)
          YIJ=YIJC(id)-YC(IB_TMP)
          UIJ2=UA(IB_TMP)+COFA3*XIJ+COFA4*YIJ
          VIJ2=VA(IB_TMP)+COFA7*XIJ+COFA8*YIJ

     !    NORMAL VELOCITY
          UIJ=0.5_SP*(UIJ1+UIJ2)
          VIJ=0.5_SP*(VIJ1+VIJ2)
          UN_TMP=-UIJ*DLTYC(id) + VIJ*DLTXC(id)

     !    VISCOSITY COEFFICIENT
          VISCOF1=ART(IA)*SQRT(COFA1**2+COFA6**2+0.5_SP*(COFA2+COFA5)**2)
          VISCOF2=ART(IB_TMP)*SQRT(COFA3**2+COFA8**2+0.5_SP*(COFA4+COFA7)**2)
          ! David moved HPRNU and added HVC
          VISCOF=(FACT*0.5_SP*(VISCOF1*CC_HVC(IA)+VISCOF2*CC_HVC(IB_TMP))+ FM1*0.5_SP*(CC_HVC(IA)+CC_HVC(IB_TMP)))/HPRNU

     !    SHEAR STRESSES
          TXXIJ=(COFA1+COFA3)*VISCOF
          TYYIJ=(COFA6+COFA8)*VISCOF
          TXYIJ=0.5_SP*(COFA2+COFA4+COFA5+COFA7)*VISCOF
          FXX=DIJ*(TXXIJ*DLTYC(id)-TXYIJ*DLTXC(id))
          FYY=DIJ*(TXYIJ*DLTYC(id)-TYYIJ*DLTXC(id))

     !    ADD CONVECTIVE AND VISCOUS FLUXES
          XADV=DIJ*UN_TMP*&
               ((1.0_SP-SIGN(1.0_SP,UN_TMP))*UIJ2+(1.0_SP+SIGN(1.0_SP,UN_TMP))*UIJ1)*0.5_SP
          YADV=DIJ*UN_TMP* &
               ((1.0_SP-SIGN(1.0_SP,UN_TMP))*VIJ2+(1.0_SP+SIGN(1.0_SP,UN_TMP))*VIJ1)*0.5_SP

     !    ACCUMULATE FLUX
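     !    (atomics are needed because several edges update the same IA/IB cell)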
          ISBC_TMP = ISBC(id)

           istat=atomicadd(XFLUX(IA),(XADV+FXX*EPOR(IA))*(1.0_SP-ISBC_TMP)*IUCP(IA))
           istat=atomicadd(YFLUX(IA),(YADV+FYY*EPOR(IA))*(1.0_SP-ISBC_TMP)*IUCP(IA))
           istat=atomicadd(XFLUX(IB),-(XADV+FXX*EPOR(IB))*(1.0_SP-ISBC_TMP)*IUCP(IB))
           istat=atomicadd(YFLUX(IB),-(YADV+FYY*EPOR(IB))*(1.0_SP-ISBC_TMP)*IUCP(IB))

          END IF

     !    ACCUMULATE BAROTROPIC FLUX
     !for spherical coordinator and domain across 360^o latitude
          istat= atomicadd(PSTX(IA),-GRAV_E(IA)*D1(IA)*ELIJ*DLTYC(id))
          istat=atomicadd(PSTY(IA),GRAV_E(IA)*D1(IA)*ELIJ*DLTXC(id))
          istat=atomicadd(PSTX(IB),GRAV_E(IB)*D1(IB)*ELIJ*DLTYC(id))
          istat=atomicadd(PSTY(IB),-GRAV_E(IB)*D1(IB)*ELIJ*DLTXC(id))
        end if

      end subroutine advave_EDGE_GCN1
end module cuda_advave_EDGE_GCN

nvprof result:

I have a lot of variables that need to be passed to the GPU. This is unavoidable because the calculations require them.
How can I improve the performance of my code?

Thanks,
wjx

Disclaimer: I am not familiar with Fortran.

You could try creating one large contiguous allocation that contains multiple arrays. For example, you seem to have many arrays with NT elements, say L of them. Create a host array with L * NT elements and copy the input arrays into it. Then transfer that buffer to the device with a single cudaMemcpy.
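
In CUDA Fortran, that might look like the sketch below. The buffer names pack_h/pack_d are hypothetical, and UA, VA, D1 and GRAV_E stand in for any four of the equally sized 0:NT arrays from the post (each actually spans NT+1 elements because of the 0 lower bound); assignment between contiguous host and device arrays maps to a single cudaMemcpy:

   real(SP), allocatable          :: pack_h(:)
   real(SP), allocatable, device  :: pack_d(:)
   integer :: n1

   n1 = NT + 1                          ! each packed array spans 0:NT
   allocate(pack_h(4*n1), pack_d(4*n1))

   ! gather four equally sized host arrays into one contiguous buffer...
   pack_h(       1:  n1) = UA(0:NT)
   pack_h(  n1 + 1:2*n1) = VA(0:NT)
   pack_h(2*n1 + 1:3*n1) = D1(0:NT)
   pack_h(3*n1 + 1:4*n1) = GRAV_E(0:NT)

   ! ...so this one assignment is a single large host-to-device transfer
   ! instead of four smaller ones
   pack_d = pack_h

The kernel would then take pack_d and index each logical array with the same offsets. Fewer, larger transfers help, but the bigger win is usually the one described earlier in the thread: keep the arrays resident on the device across time steps, so that most of these copies disappear entirely.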
