I ran the following code on SDK version: 22.7 with 16 threads per block (I have also tried with 32, 64 and 256 threads per block but I got same result)
Please help me with this issue
-Thank you
Code:
module mat_op
integer, parameter:: n=2048, blocksize=16
contains
attributes( global) subroutine row_add(sums,a,n)
implicit none
integer:: i,j,idx,n
real *8::sum
real *8, device :: sums(:),a(:,:)
idx= blockDim%x*(blockIdx%x -1)+ threadIdx%x
sum=0.0d0
i=0
do while (i<n)
sum=sum+a(idx,i)
i=i+1
enddo
print *, sum
call syncthreads()
sums(idx)=sum
end subroutine row_add
end module mat_op
program main
use cudafor
use mat_op
implicit none
integer, device :: n_d
real *8 :: sums(n),a(n,n)
real *8 , device :: sums_d(n),a_d(n,n)
!type(dim3) :: blocks,threads
integer :: p,q,istat
sums=0.0d0
a=1.0d0
sums_d=sums
a_d=a
!blocks=dim3(n/blocksize, n/blocksize, 1)
!threads=dim3(blocksize,blocksize,1)
call row_add<<<n/blocksize,blocksize>>>(sums_d,a_d,n_d)
istat = cudaDeviceSynchronize()
sums=sums_d
a=a_d
write(*,*) sums
end program main