Hi all,
I have troubles using the function atomiccas as suggested in the book by Ruetsch and Fatica. Here is my code:
module test
implicit none
integer(kind=4),device::lock
contains
attributes(global) subroutine adds(a_d,asum_d,lock)
implicit none
real(kind=8),device::a_d(:),asum_d
integer(kind=4),device::lock
integer n
n=threadidx%x+(blockidx%x-1)*blockDim%x
do while(atomiccas(lock,0,1)==1) ! set lock
end do
asum_d=asum_d+a_d(n)
call threadfence()
lock=0 ! release lock
end subroutine
end module
!---------------------------
program test2
use cudafor
use test
implicit none
real(kind=8),allocatable::a(:)
real(kind=8),allocatable,device::a_d(:)
real(kind=8),device::asum_d
real(kind=8)::asum
integer n,j
n=1024
lock=0
allocate(a(n),a_d(n))
do j=1,n
a(j)=1.0d0*j
end do
a_d=a
asum_d=0.0d0
call adds<<<(n-1)/32+1,32>>>(a_d,asum_d,lock)
istat=cudaDeviceSynchronize()
asum=asum_d
print*,asum
end
But when I compile and run this simple code, it never finishes, just hanging there.
Can you tell me what’s wrong?
Thanks,
Lam