Hello,

I have recently encountered a problem in my code when trying to access vectors in a certain way. I tried to break it down as compactly as possible in this example code:

```
module cuda_subs
implicit none
integer, parameter :: pr=8
contains
!-------------------------------------------------------------------------------
attributes(global) subroutine test(N,M,perm1_d,perm2_d,x_in_d, x_out_d, x_inout_d, check1_d,check2_d)
integer,value, intent(in) :: N,M
integer, intent(in) :: perm1_d(N*M), perm2_d(N*M)
real(kind=pr), intent(out) :: check1_d(5,N*M), check2_d(5,N*M)
real(kind=pr), intent(in) :: x_in_d(N*M)
real(kind=pr), intent(out) :: x_out_d(N*M)
real(kind=pr), intent(inout) :: x_inout_d(N*M)
real(kind=pr) :: x_local1_d(N*M), x_local2_d(N*M)
integer :: t, i
t=threadIdx%x
i=blockDim%x*(blockIdx%x-1) + threadIdx%x
x_out_d = 44.0_pr
x_inout_d = 55.0_pr
x_local1_d = 66.0_pr
x_local2_d = 77.0_pr
if(t <= N) then
x_local1_d(i) = x_in_d(i)
x_local2_d(i) = x_in_d(i)
x_out_d(i) = x_local1_d(i)
x_inout_d(i) = x_local2_d(i)
check1_d(1,i) = x_in_d(perm1_d(i))
check1_d(2,i) = x_out_d(perm1_d(i))
check1_d(3,i) = x_inout_d(perm1_d(i))
check1_d(4,i) = x_local1_d(perm1_d(i))
check1_d(5,i) = x_local2_d(perm1_d(i))
check2_d(1,i) = x_in_d(perm2_d(i))
check2_d(2,i) = x_out_d(perm2_d(i))
check2_d(3,i) = x_inout_d(perm2_d(i))
check2_d(4,i) = x_local1_d(perm2_d(i))
check2_d(5,i) = x_local2_d(perm2_d(i))
endif
end subroutine test
!-------------------------------------------------------------------------------
end module cuda_subs
!##########################################################################
!##########################################################################
program main
use cuda_subs
use cudafor
implicit none
integer, parameter :: N=3,M=3
real(kind=pr), device :: x_in_d(N*M), x_out_d(N*M)
real(kind=pr), device :: x_inout_d(N*M), x_local1_d(N*M)
real(kind=pr), device :: check1_d(5,N*M), check2_d(5,N*M)
integer, device :: perm1_d(N*M), perm2_d(N*M)
real(kind=pr) :: x_in(N*M), x_out(N*M), x_inout(N*M)
real(kind=pr) :: check1(5,N*M), check2(5,N*M)
integer :: i,j, NM
type(dim3) :: block_grid, thread_block
NM=N*M
do i=1,NM
x_in_d(i) = i
enddo
perm1_d(1:9) = (/ 1,2,3,4,5,6,7,8,9 /)
perm2_d(1:9) = (/ 9,8,7,6,5,4,3,2,1 /)
block_grid = dim3(M,1,1)
thread_block = dim3(N,1,1)
call test<<<block_grid,thread_block>>>(N,M,perm1_d,perm2_d,x_in_d, x_out_d, x_inout_d, check1_d,check2_d)
check1=check1_d
check2=check2_d
write(*,'(//,A8,/)') 'check1 ='
do j=1,5
write(*,'(F5.2,x,F5.2,x,F5.2,x, &
F5.2,x,F5.2,x,F5.2,x, &
F5.2,x,F5.2,x,F5.2,x)'), &
check1(j,1), &
check1(j,2), &
check1(j,3), &
check1(j,4), &
check1(j,5), &
check1(j,6), &
check1(j,7), &
check1(j,8), &
check1(j,9)
enddo
write(*,'(//,A8,/)') 'check2 ='
do j=1,5
write(*,'(F5.2,x,F5.2,x,F5.2,x, &
F5.2,x,F5.2,x,F5.2,x, &
F5.2,x,F5.2,x,F5.2,x)'), &
check2(j,1), &
check2(j,2), &
check2(j,3), &
check2(j,4), &
check2(j,5), &
check2(j,6), &
check2(j,7), &
check2(j,8), &
check2(j,9)
enddo
end program main
```

In this example code I only use three blocks of three threads each to keep it simple. The basic idea is that I have several (five) vectors x_*_d on the device, three of them am I passing via the subroutine call, and two are local (though one of these two is already declared in the main function). I initialize x_in_d to be [1,2,3,4,5,6,7,8,9], and I set up two “permutation vectors” to be [1,2,3,4,5,6,7,8,9] and [9,8,7,6,5,4,3,2,1], respectively.

In the subroutine I basically copy x_in_d to all the other x-vectors on the device and then copy all five x-vectors to my check-matrices (one using the first permutation vector, another using the second one). At last I copy those matrices to the RAM memory and print them to the screen.

The output is as follows:

```
check1 =
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 ! x_in_d
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 ! x_out_d
55.00 55.00 55.00 4.00 5.00 6.00 7.00 8.00 9.00 ! x_inout_d
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 ! x_local1_d
1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 ! x_local2_d
check2 =
9.00 8.00 7.00 6.00 5.00 4.00 3.00 2.00 1.00 ! x_in_d
44.00 44.00 44.00 6.00 5.00 4.00 44.00 44.00 44.00 ! x_out_d
55.00 55.00 55.00 6.00 5.00 4.00 55.00 55.00 55.00 ! x_inout_d
66.00 66.00 66.00 66.00 5.00 66.00 66.00 66.00 66.00 ! x_local1_d
77.00 77.00 77.00 77.00 5.00 77.00 77.00 77.00 77.00 ! x_local2_d
```

The first check-matrix reveals that all five x-vectors have the desired content and therefore x_in_d seems to be copied to the other four x-vectors correctly. However, when I reverse the order in which the elements are accessed only x_in_d has the expected behaviour, x_out_d and x_inout_d only seem to work as expected from element four on, and the local x-vectors only work correctly on the fifth element (where both permutation vector have the same value). In the positions in which the last four x-vectors do not behave as expected their values are the ones they were initialised to before x_in_d was copied to them.

Since all of the x-vectors should reside in the global memory of the GPU (shouldn’t they?) I do not understand why the order in which what thread accesses what element should make a difference. Though it appears that there is some scoping issue here.

Does anybody know why this is happening and how I could avoid this problem? I am working with the Compressed Sparse Row matrix storage, and this will have to deal with accesses of the form vec1(vec2(i)).

Nils