Hello

There is the code:

```
module gpu_f
use cudafor
contains
attributes(global) subroutine force_gpu(SX_pr)
real :: sx_pr(4)
integer :: i,j,i1
i=(blockidx%x-1)*blockdim%x + threadidx%x
i1=i+1
do 1 j=i1,4
if (i.le.3 ) then
sx_pr(j)=sx_pr(j)-1
sx_pr_i=sx_pr_i+1
endif
call syncthreads()
1 continue
sx_pr(i)=sx_pr(i)+sx_pr_i
endsubroutine
end module
program main
use gpu_f
real, device :: SX_pr_d(4)
real :: SX_pr(4)
sx_pr=0
sx_pr_d=sx_pr
call force_gpu<<<1,4>>>(sx_pr_d)
sx_pr=sx_pr_d
do i= 1, 4
print*,'i',i,'=',sx_pr(i)
end do
end program
```

The program gives the answer

D:\PVFProject2\Win32\Debug>PVFProject2.exe

i 1 = 3.000000

i 2 = 1.000000

i 3 = -1.000000

i 4 = -3.000000

If I change the number of blocks on

call force_gpu<<<2,2>>>(sx_pr_d)

or

call force_gpu<<<4,1>>>(sx_pr_d)

I get the result:

i1 = 3.000000

i2 = 1.000000

i3 = 0.000000

i4 = -2.000000

Help me understand where the error?