I am getting incorrect results for following code.
========================================================================
MODULE vecAdd
CONTAINS
ATTRIBUTES(GLOBAL) SUBROUTINE deviceVecAdd(A, B, C, N)
REAL, DEVICE :: A(N), B(N), C(N)
INTEGER, VALUE :: N
INTEGER :: i
i = threadIdx%x
C(i) = A(i) + B(i)
END SUBROUTINE
END MODULE
PROGRAM main
USE cudafor
USE vecAdd
INTEGER, PARAMETER :: N = 1024
INTEGER :: i, nerrors, istat, idevice
! Vector data
REAL, DIMENSION(N) :: A, B, C, devResult
REAL, ALLOCATABLE, DEVICE, DIMENSION(:) :: dA, dB, dC
TYPE(cudaEvent) :: start, stop
TYPE(dim3) :: blocks
TYPE(dim3) :: threads
idevice = 0
istat = cudaSetDevice(idevice)
CALL random_number(A)
CALL random_number(B)
ALLOCATE(dA(N))
ALLOCATE(dB(N))
ALLOCATE(dC(N))
dA = A
dB = B
dC = 0.0
blocks = dim3(N/16, 1, 1)
threads = dim3(16, 1, 1)
! Host implementation
DO i = 1, N
C(i) = A(i) + B(i)
END DO
CALL deviceVecAdd<<<blocks, threads>>>(dA, dB, dC, N)
istat = cudaThreadSynchronize()
devResult = dC
nerrors = 0
DO i = 1, N
IF(abs(C(i) - devResult(i)) .gt. 1.0e-4) THEN
nerrors = nerrors + 1
END IF
END DO
IF(nerrors .eq. 0) THEN
PRINT *,“Test passed!”
ELSE
PRINT *," Test failed! ", "No of elements failed = ", nerrors
ENDIF
END PROGRAM
========================================================================
i don’t understand why it is giving incorrect results.
Let me know if i am missing any thing.
i am using emulation mode presently (pgfortran -o vecAdd -Mcuda=emu vecAdd.cuf)
MY system information
PGI - 9.0-4 Workstation Linux 64 bit
OS : OpenSUSE 10.3 64 bit