Issue with cuda fortran code

I am having issue with shared arrays in cuda fortran code.
The following code is giving me incorrect results even though logically (to my knowledge with no bugs) it is correct. If some one can tell me where I am going wrong, I would be very happy.
Thanks in advance.

module xi2
use paramtrs

contains
attributes(global) subroutine xi2_kernel( xd, yd, zd, WLKR, NCL_NO, wf2d)
implicit none
integer, value, intent(in) :: WLKR, NCL_NO
real *8, device, intent(in), dimension(WLKR,NCL_NO) :: xd, yd, zd
real *8, device, intent(out), dimension(WLKR) :: wf2d

integer :: m, n, p, i, j, k, tx, ty, tid
real *8 :: xij, yij, zij,xij1,yij1,zij1,wf2_loc
real *8, shared, dimension(NCL_NO) :: xs,ys,zs
real 8, shared, dimension(NCL_NONCL_NO) :: wf2s

i = blockidx%x
tx = threadidx%x; ty = threadidx%y
tid = (ty - 1) * blockdim%x + tx
! copy x,y,z of nuclei for ith walker onto shared memory
if (tid <= NCL_NO) then
xs(tid) = xd(i,tid); ys(tid) = yd(i,tid); zs(tid) = zd(i,tid)
endif
call syncthreads()

xij1 = xs(tx) - xs(ty); yij1 = ys(tx) - ys(ty); zij1 = zs(tx) - zs(ty)
wf2_loc = 0
do m = -1, 1
do n = -1, 1
do p = -1, 1
xij = xij1 + pL
yij = yij1 + n
L
zij = zij1 + m*L

xij = sqrt(xijxij + yijyij + zij*zij)
if( xij /= 0 ) then
wf2_loc = wf2_loc - (B/xij)**5
end if
end do
end do
end do
wf2_loc = wf2_loc/2

! Copy local copy of wf2 to shared memory for reduction
wf2s(tid) = wf2_loc
call syncthreads()

! Sum the elements of shared wf2
j = blockdim%x * blockdim%y/2
do
if (tid <= j) wf2s(tid) = wf2s(tid) + wf2s(tid + j)
call syncthreads
j = ishft(j,-1)
if (j <= 0) exit
end do
call syncthreads()

! copy the reduced wf2 on shared memory to device memory
if (tid == 1) wf2d(i) = wf2s(1)
call syncthreads()
end subroutine xi2_kernel

subroutine cal_xi2(x,y,z,WLKR,NCL_NO,wf2)

use cudafor
implicit none

integer, intent(in) :: NCL_NO,WLKR
real *8, intent(in), dimension(WLKR,NCL_NO) :: x,y,z
real *8, intent(out), dimension(WLKR) :: wf2

! Define dummy variables
integer :: istat, shBytes ! Bytes for shared mem
type(dim3) :: dimGrid, dimBlock
real 8, dimension(WLKR,NCL_NONCL_NO) :: wf2_tmp

! Define device variables
real *8, device, allocatable, dimension(:,:) :: xd,yd,zd
real *8, device, allocatable, dimension(:) :: wf2d

! Copy data to device
allocate(xd(WLKR,NCL_NO), yd(WLKR,NCL_NO), zd(WLKR,NCL_NO))
allocate(wf2d(WLKR))
xd = x; yd = y; zd = z

! Launch wf2 kernel
dimBlock = dim3(32,32,1); dimGrid = dim3(WLKR,1,1)
shBytes = 38NCL_NO + 8NCL_NONCL_NO
call xi2_kernel<<<dimGrid, dimBlock, shBytes>>>(xd,yd,zd,WLKR,NCL_NO,wf2d)

! Copy wf2 from device to host
wf2 = wf2d
do istat = 1, WLKR
! print *, wf2_tmp(istat,:)
enddo
!wf2 = sum(wf2_tmp,2)
! Wait for the above execution to be complete
istat = cudaThreadSynchronize()

wf2 = exp(wf2)
deallocate(xd,yd,zd)
deallocate(wf2d)
return
end subroutine cal_xi2
end module xi2

Hi Bharat,

Assuming this is the same problem as your outer post:

The issue is that you’re using too many registers per block.

  • Mat