Hi,
I am trying to accelerate FVCOM by cudaFortran.
However, I have noticed that the program running with CUDA acceleration is much slower than running with a single-core CPU. Through debugging with nvprof, I found that a significant amount of time is spent on data transfers. Could you please help me take a look at my code to see if there are any issues? How can I optimize this? I believe the time spent on data transfers is highly unreasonable.
Host code:
SUBROUTINE WET_JUDGE
USE MOD_PREC
USE ALL_VARS
USE MOD_PAR
use cuda_WET_JUDGE
IMPLICIT NONE
REAL(SP) :: DTMP
INTEGER :: ITA_TEMP
INTEGER :: I,IL,IA,IB,K1,K2,K3,K4,K5,K6
integer :: KT
!sub1
REAL(SP), ALLOCATABLE,device :: H_d(:)
REAL(SP), ALLOCATABLE,device :: ELF_d(:)
INTEGER , ALLOCATABLE,device :: ISWETN_d(:)
!sub2
INTEGER , device,ALLOCATABLE :: ISWETC_d(:)
INTEGER, device,ALLOCATABLE :: NV_d(:,:)
!sub3
INTEGER, ALLOCATABLE,device :: NBVE_d(:,:)
INTEGER, ALLOCATABLE,device :: NTVE_d(:)
!sub4
REAL(SP), ALLOCATABLE,device :: ELF1_d(:)
!sub1
ALLOCATE(H_d(0:MT))
ALLOCATE(ELF_d(0:MT))
ALLOCATE(ISWETN_d(0:MT))
ISWETN = 1
H_d=H
ELF_d=ELF
ISWETN_d=ISWETN
blocknum=(M/blocksize)+1
call WET_JUDGE1<<<blocknum,blocksize>>>(M,MIN_DEPTH,H_d,ELF_d,ISWETN_d)
ISWETN=ISWETN_d
!sub2
ALLOCATE(ISWETC_d(0:NT))
ALLOCATE(NV_d(0:NT,4))
ISWETC = 1
ISWETC_d=ISWETC
NV_d=NV
blocknum=(N/blocksize)+1
call WET_JUDGE2<<<blocknum,blocksize>>>(N,MIN_DEPTH,NV_d,H_d,ELF_d,ISWETC_d)
ISWETC=ISWETC_d
!sub3
ALLOCATE(NBVE_d(M,MX_NBR_ELEM+1))
ALLOCATE(NTVE_d(0:MT))
NBVE_d=NBVE
NTVE_d=NTVE
blocknum=(M/blocksize)+1
call WET_JUDGE3<<<blocknum,blocksize>>>(M,ISWETC_d,NBVE_d,NTVE_d,ISWETN_d)
ISWETN=ISWETN_d
!sub4
ALLOCATE(ELF1_d(0:NT))
ELF1_d=ELF1
call WET_JUDGE4<<<blocknum,blocksize>>>(N,ONE_THIRD,ELF1_d,ELF_d,NV_d)
ELF1=ELF1_d
END SUBROUTINE WET_JUDGE
Device code:
module cuda_WET_JUDGE
USE MOD_PREC
contains
attributes(global) subroutine WET_JUDGE1(M,MIN_DEPTH,H,ELF,ISWETN_d)
integer ,value :: M
REAL(SP) ,value :: MIN_DEPTH
REAL(SP), device :: H(0:)
REAL(SP), device :: ELF(0:)
INTEGER , device :: ISWETN_d(0:)
REAL(SP) :: DTMP
integer :: id
id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1
IF(id <=M)THEN
DTMP = H(id) + ELF(id)
IF((DTMP - MIN_DEPTH) < 1.0E-5_SP) ISWETN_d(id) = 0
end if
end subroutine WET_JUDGE1
attributes(global) subroutine WET_JUDGE2(N,MIN_DEPTH,NV,H,ELF,ISWETC_d)
integer ,value :: N
REAL(SP) ,value :: MIN_DEPTH
INTEGER, device :: ISWETC_d(0:)
INTEGER, device :: NV(0:,:)
REAL(SP), device :: ELF(0:)
REAL(SP), device :: H(0:)
REAL(SP) :: DTMP
integer :: id
id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1
IF(id <=N)THEN
DTMP = MAX(ELF(NV(id,1)),ELF(NV(id,2)),ELF(NV(id,3))) + &
MIN( H(NV(id,1)), H(NV(id,2)), H(NV(id,3)))
IF((DTMP - MIN_DEPTH) < 1.0E-5_SP) ISWETC_d(id) = 0
end if
end subroutine WET_JUDGE2
attributes(global) subroutine WET_JUDGE3(M,ISWETC_d,NBVE,NTVE,ISWETN_d)
integer ,value :: M
INTEGER,device :: ISWETC_d(0:)
INTEGER,device :: NBVE(:,:)
INTEGER,device :: NTVE(0:)
INTEGER,device :: ISWETN_d(0:)
integer :: id
id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1
IF(id <=M)THEN
IF(SUM(ISWETC_d(NBVE(id,1:NTVE(id)))) == 0) ISWETN_d(id) = 0
end if
end subroutine WET_JUDGE3
attributes(global) subroutine WET_JUDGE4(N,ONE_THIRD,ELF1,ELF,NV)
integer ,value :: N
REAL(DP), value :: ONE_THIRD
REAL(SP), device :: ELF1(0:)
REAL(SP), device :: ELF(0:)
INTEGER, device :: NV(0:,:)
integer :: id
id=(blockIdx%x-1)*blockDim%x+(threadIdx%x-1)+1
IF(id <=N)THEN
ELF1(id) = ONE_THIRD*(ELF(NV(id,1))+ELF(NV(id,2))+ELF(NV(id,3)))
end if
end subroutine WET_JUDGE4
end module cuda_WET_JUDGE
nvprof result:
I wonder if it could be due to the data copying method I’m using? Or perhaps it’s because I’m calling four kernels in one subroutine?How can i fix it?
Thanks,
wjx