The following code is a simple model of n-bode force calculation. It uses an advantage of shared memory, it works perfect if the number of particles (N) is number is divisible by the block size (nTr) without a rest, otherwise it gives the wrong answer. Does anybody have idea how to fix this problem?
module force_mod
use cudafor
contains
attributes(global) subroutine force_kernel(x,y,fx,fy,N,nTr)
real,device :: x(N), y(N),fx(N),fy(N)
real::x0,y0,fxS,fyS
integer, value :: N, nTr
integer :: i, j, kb, k, tx, bx, ib, ic
real, shared :: Xsub(64), Ysub(64)
fxS=0.E0;fyS=0.E0
tx=threadidx%x
i=(blockidx%x-1)*nTr+tx
if (i.le.N) then
x0=x(i); y0=y(i);
do ib=1,N,nTr
Xsub(tx)=x(ib+tx)
Ysub(tx)=y(ib+tx)
call syncthreads()
do ic=1,nTr
rx=x0-Xsub(ic)
ry=y0-Ysub(ic)
fxS=fxS+1.E0 !rx*0.0001E0
fyS=fyS+1.E0 !ry*0.0001E0
enddo
call syncthreads()
enddo
fx(i)=fxS
fy(i)=fyS
endif
end subroutine force_kernel
subroutine force(x,y,fx,fy,N,nTr)
implicit real(4)(a-h,o-z)
real, dimension(:) :: x,y,fx,fy
integer :: N,nTr
real, device, allocatable, dimension(:) :: xD,yD,fxD,fyD
type(dim3) :: dimGrid, dimBlock
integer :: r
real ctimeall, ctimekernel
integer c1, c2, c3, c4
call system_clock( count=c1 )
allocate(xD(N),yD(N),fxD(N),fyD(N))
xD=x(1:N)
yD =y(1:N)
fxD=fx(1:N)
fyD=fy(1:N)
dimGrid = dim3( N/nTr+1, 1, 1 )
dimBlock = dim3( nTr, 1, 1 )
call system_clock( count=c2 )
call force_kernel<<<dimGrid>>>(xD,yD,fxD,fyD,N,nTr)
r = cudathreadsynchronize()
call system_clock( count=c3 )
fx(1:N)=fxD
fy(1:N)=fyD
call system_clock( count=c4 )
ctimekernel = c3 - c2
ctimeall = c4 - c1
print *, 'GPU time'
print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
deallocate(xD,yD,fxD,fyD)
end subroutine force
end module force_mod
program forces
use force_mod
implicit real(4)(a-h,o-z)
real,dimension(:),allocatable :: x,y,fx,fy,fxH,fyH
integer N,i,j,ixy
integer idevice, istat
real:: f
integer c1, c2, CPUtime
! Begin execution
nTr=64
N = 1024*8
print *,N
idevice = 0
print *,' arrays size ', N
allocate(x(N),y(N),fx(N),fy(N),fxH(N),fyH(N))
ixy=0
print *,'start coord'
do i = 1,N
15 x(i) = random(f)*500.E0
y(i) = random(f)*500.E0
ixy=0
do j=1,i-1
rr=sqrt( (x(i)-x(j) )**2+( y(i)-y(j) )**2)
if (rr.lt.1.E0) ixy=1
enddo
if (ixy.eq.1) goto 15
x(i)=0.E0
y(i)=0.E0
fx(i)=0.0E0
fy(i)=0.0E0
fxH(i)=0.0E0
fyH(i)=0.0E0
enddo
print *,'finish coord'
! Initialize GPU device
istat = cudaSetDevice(idevice)
print *,'calling mmul'
call force(x,y,fx,fy,N,nTr)
call system_clock( count=c1 )
do i = 1,N
do j=1,N
rx=x(i)-x(j)
ry=y(i)-y(j)
fxH(i)=fxH(i)+1.E0 !rx*0.0001
fyH(i)=fyH(i)+1.E0 !ry*0.0001
enddo
enddo
call system_clock( count=c2 )
! Check for errors
CPUtime=c2-c1
print *,''
print *, 'CPU time:', CPUtime, ' microseconds'
ierr = 0
do i = 1,N
diff = abs((fxH(i) - fx(i))+(fyH(i) - fy(i)))
if ( diff.ge.2.0e-2 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'fx(',i,') = ',fx(i), ' should be ', fxH(i), ' error=', diff
endif
endif
enddo
if( ierr == 0 )then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!!'
endif
end program
fxH(i)=fxH(i)+1.E0 is made in order to show how the results are different