Hello
There is the code:
module gpu_f
use cudafor
contains
attributes(global) subroutine force_gpu(SX_pr)
real :: sx_pr(4)
integer :: i,j,i1
i=(blockidx%x-1)*blockdim%x + threadidx%x
i1=i+1
do 1 j=i1,4
if (i.le.3 ) then
sx_pr(j)=sx_pr(j)-1
sx_pr_i=sx_pr_i+1
endif
call syncthreads()
1 continue
sx_pr(i)=sx_pr(i)+sx_pr_i
endsubroutine
end module
program main
use gpu_f
real, device :: SX_pr_d(4)
real :: SX_pr(4)
sx_pr=0
sx_pr_d=sx_pr
call force_gpu<<<1,4>>>(sx_pr_d)
sx_pr=sx_pr_d
do i= 1, 4
print*,'i',i,'=',sx_pr(i)
end do
end program
The program gives the answer
D:\PVFProject2\Win32\Debug>PVFProject2.exe
i 1 = 3.000000
i 2 = 1.000000
i 3 = -1.000000
i 4 = -3.000000
If I change the number of blocks on
call force_gpu<<<2,2>>>(sx_pr_d)
or
call force_gpu<<<4,1>>>(sx_pr_d)
I get the result:
i1 = 3.000000
i2 = 1.000000
i3 = 0.000000
i4 = -2.000000
Help me understand where the error?