Loop isn't executing inside the kernel

I ran the following code on SDK version: 22.7 with 16 threads per block (I have also tried with 32, 64 and 256 threads per block but I got same result)

Please help me with this issue
-Thank you

Code:

module mat_op
integer, parameter:: n=2048, blocksize=16
contains
attributes( global) subroutine row_add(sums,a,n)
implicit none
integer:: i,j,idx,n

 real *8::sum
 real *8, device :: sums(:),a(:,:)
 idx=   blockDim%x*(blockIdx%x -1)+ threadIdx%x
 sum=0.0d0
 i=0
 do while (i<n)
 	sum=sum+a(idx,i)
 	i=i+1
 enddo
 print *, sum
 call syncthreads()
 sums(idx)=sum

end subroutine row_add
end module mat_op

program main
use cudafor
use mat_op
implicit none
integer, device :: n_d
real *8 :: sums(n),a(n,n)
real *8 , device :: sums_d(n),a_d(n,n)
!type(dim3) :: blocks,threads
integer :: p,q,istat

 sums=0.0d0

 a=1.0d0
 sums_d=sums

 a_d=a
 !blocks=dim3(n/blocksize, n/blocksize, 1)
 !threads=dim3(blocksize,blocksize,1)
 call row_add<<<n/blocksize,blocksize>>>(sums_d,a_d,n_d)
 istat = cudaDeviceSynchronize()
 sums=sums_d

 a=a_d	
 write(*,*) sums

end program main