Hi, I am a beginner with CUDA Fortran and I am testing the following program. The code is compiled with **pgf95 -ta=nvidia sumAB.cuf**; it runs but gives me the wrong results. Any suggestions? Thanks,

!----------------module for sumAB--------------------------

!> Element-wise vector addition C = A + B on the GPU (CUDA Fortran).
!! Provides the device kernel k_sumAB and the host wrapper h_sumAB that
!! copies the operands to the device, launches the kernel and copies the
!! result back.
module m_sumAB

use cudafor

implicit none

contains

!-------------kernel subroutine-----------------

!> Device kernel: each thread computes one element, C(i) = A(i) + B(i).
!!   n       number of elements in A, B and C (passed by value)
!!   A, B    input vectors
!!   C       output vector
attributes(global) subroutine k_sumAB(n,A,B,C)

  integer, value :: n
  real, dimension(n), intent(in)  :: A, B
  real, dimension(n), intent(out) :: C

  integer :: i

  ! Global 1-based element index handled by this thread.
  i = (blockidx%x - 1)*blockdim%x + threadidx%x

  ! The grid may contain more threads than elements; the surplus threads
  ! must not touch memory.
  if (i <= n) C(i) = A(i) + B(i)

end subroutine k_sumAB

!-------------host subroutine--------------------

!> Host wrapper: computes C = A + B on the device.
!!   n       number of elements in A, B and C
!!   bdim    number of threads per block; must be a valid (positive) CUDA
!!           block size
!!   A, B    input vectors (host memory)
!!   C       result vector (host memory)
!! A failed kernel launch is reported on standard output.
subroutine h_sumAB(n,bdim,A,B,C)

  integer, intent(in) :: n, bdim
  real, dimension(n), intent(in)  :: A, B
  real, dimension(n), intent(out) :: C

  real, device, dimension(n) :: Adev, Bdev, Cdev
  integer :: nblocks, istat

  ! Nothing to compute; also avoids launching a grid with zero blocks.
  if (n <= 0) return

  ! Host -> device copies.
  Adev = A
  Bdev = B

  ! Round the block count up so that every element is covered even when
  ! n is not a multiple of bdim (plain n/bdim truncates and would leave
  ! the tail of C uncomputed).
  nblocks = (n + bdim - 1)/bdim

  call k_sumAB<<<nblocks, bdim>>>(n,Adev,Bdev,Cdev)

  ! A bad launch configuration does not abort the program by itself, so
  ! report it instead of silently returning garbage.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    print *, 'h_sumAB: kernel launch failed: ', cudaGetErrorString(istat)
  end if

  ! Device -> host copy of the result.
  C = Cdev

end subroutine h_sumAB

end module m_sumAB

!---------------------------end module----------------------

program sumAB

!----------------------------------------------------
!
! Purpose: sum two vectors A and B of n elements, once on the CPU and
!          once on the GPU (through h_sumAB), print the time taken by
!          each and compare the two results.
!
!----------------------------------------------------

use m_sumAB, only: h_sumAB

implicit none

! Array bounds in a main program must be constant expressions, so the
! problem size and the block size are named constants.
integer, parameter :: n = 1000
integer, parameter :: bdim = 100

integer :: ios
real :: times, timef, diff
real, dimension(n) :: A, B, C, D

!-----------------end variable declarations----------

! Initialization of the arrays
A = 1.2
B = 2.2
C = 0.
D = 0.

! CPU calculation (reference result in D)
call cpu_time(times)
D = A + B
call cpu_time(timef)

print *, 'CPU time required is: ', timef-times, ' seconds'

! GPU calculation (result in C)
call cpu_time(times)
call h_sumAB(n,bdim,A,B,C)
call cpu_time(timef)

print *, 'GPU time required is: ', timef-times, ' seconds'

! Difference between the results: accumulate absolute differences so that
! errors of opposite sign cannot cancel each other out.
diff = sum(abs(C - D))

print *, 'Difference between results is: ', diff, C(1), D(1)

! Standard-conforming replacement for the deleted PAUSE statement:
! wait for the user to press Enter (end-of-file on input is ignored).
print *, 'Press Enter to continue'
read (*, '(a)', iostat=ios)

end program sumAB