Hello,
When I run the following CUDA code I get different results for the RAD_VEC than when it is run on the CPU. Can you please explain why this is happening?
PROGRAM testCUDA
USE GPU_KERNELS
REAL, ALLOCATABLE :: RAD_VEC(:,:)
REAL, DEVICE, ALLOCATABLE :: RAD_VEC_DEV(:,:)
INTEGER :: IBK, IWL
INTEGER, PARAMETER :: NWL = 224, NBCKGND = 64
REAL :: CL
REAL, DEVICE :: CL_DEV
ALLOCATE( RAD_VEC(NWL,NBCKGND), STAT=IOS)
DO IWL = 1,NWL
DO IBK = 1,NBCKGND
RAD_VEC(IWL,IBK)=2.0
END DO
END DO
ALLOCATE( RAD_VEC_DEV(NWL,NBCKGND) )
RAD_VEC_DEV = RAD_VEC(1:NWL, 1:NBCKGND)
CL = 0.0
CL_DEV = 0.0
!*** Begin Non-CUDA
! DO IBK = 1,NBCKGND
! CL=0.0
! DO IWL = 1,NWL
! CL=CL+RAD_VEC(IWL,IBK)**2
! END DO
! IF (CL<EPSMIN4) CL=1.0
! CL=SQRT(CL)
! DO IWL = 1,NWL
! RAD_VEC(IWL,IBK)=RAD_VEC(IWL,IBK)/CL
! END DO
! END DO
!*** End Non-CUDA
!*** Begin CUDA Calls
call TEST_KERNEL<<<(NBCKGND-1)/16+1,16>>>(RAD_VEC_DEV, NWL, NBCKGND, CL_DEV)
RAD_VEC(1:NWL,1:NBCKGND) = RAD_VEC_DEV
CL = CL_DEV
!*** End CUDA Calls
print *, "CL = ", CL
DO IBK = 1,NBCKGND
DO IWL = 1,NWL
IF ( IBK .EQ. 1 ) THEN
print *, RAD_VEC(IWL,IBK)
END IF
END DO
END DO
END PROGRAM testCUDA
module GPU_KERNELS
use cudafor
contains
attributes(global) subroutine TEST_KERNEL(RAD_VEC, NWL, NBCKGND, CL)
real, device :: RAD_VEC(NWL, NBCKGND), CL
integer, value :: NWL, NBCKGND
integer :: tx, ibk, iwl, i
real, parameter :: EPSMIN4 = 1.1754944E-38
tx = threadidx%x
i = ( blockidx%x-1 ) * blockdim%x + tx
if ( i .le. NBCKGND ) then
do iwl = 1,NWL
CL = CL + RAD_VEC(iwl, i)
end do
if ( CL < EPSMIN4 ) CL = 1.0
CL=SQRT(CL)
do iwl = 1, NWL
RAD_VEC(iwl,i) = RAD_VEC(iwl,i) + RAD_VEC(iwl, i)/CL
end do
end if
call syncthreads()
end subroutine
end module GPU_KERNELS