 # N-body simulation using shared memory

The following code is a simple model of n-bode force calculation. It uses an advantage of shared memory, it works perfect if the number of particles (N) is number is divisible by the block size (nTr) without a rest, otherwise it gives the wrong answer. Does anybody have idea how to fix this problem?

``````module force_mod
use cudafor
contains
attributes(global) subroutine force_kernel(x,y,fx,fy,N,nTr)
real,device :: x(N), y(N),fx(N),fy(N)
real::x0,y0,fxS,fyS
integer, value :: N, nTr
integer :: i, j, kb, k, tx, bx, ib, ic
real, shared :: Xsub(64), Ysub(64)
fxS=0.E0;fyS=0.E0
i=(blockidx%x-1)*nTr+tx
if (i.le.N) then
x0=x(i); y0=y(i);
do ib=1,N,nTr
Xsub(tx)=x(ib+tx)
Ysub(tx)=y(ib+tx)
do ic=1,nTr
rx=x0-Xsub(ic)
ry=y0-Ysub(ic)
fxS=fxS+1.E0 !rx*0.0001E0
fyS=fyS+1.E0 !ry*0.0001E0
enddo

enddo
fx(i)=fxS
fy(i)=fyS
endif
end subroutine force_kernel

subroutine force(x,y,fx,fy,N,nTr)
implicit real(4)(a-h,o-z)
real, dimension(:) :: x,y,fx,fy
integer :: N,nTr
real, device, allocatable, dimension(:) :: xD,yD,fxD,fyD
type(dim3) :: dimGrid, dimBlock
integer :: r
real ctimeall, ctimekernel
integer c1, c2, c3, c4
call system_clock( count=c1 )
allocate(xD(N),yD(N),fxD(N),fyD(N))
xD=x(1:N)
yD =y(1:N)
fxD=fx(1:N)
fyD=fy(1:N)
dimGrid = dim3( N/nTr+1, 1, 1 )
dimBlock = dim3( nTr, 1, 1 )
call system_clock( count=c2 )
call force_kernel<<<dimGrid>>>(xD,yD,fxD,fyD,N,nTr)
call system_clock( count=c3 )
fx(1:N)=fxD
fy(1:N)=fyD
call system_clock( count=c4 )
ctimekernel = c3 - c2
ctimeall = c4 - c1
print *, 'GPU time'
print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
deallocate(xD,yD,fxD,fyD)
end subroutine force
end module force_mod

program forces
use force_mod
implicit real(4)(a-h,o-z)
real,dimension(:),allocatable :: x,y,fx,fy,fxH,fyH
integer N,i,j,ixy
integer idevice, istat
real:: f
integer c1, c2, CPUtime
! Begin execution
nTr=64
N = 1024*8
print *,N
idevice = 0
print *,' arrays size ', N
allocate(x(N),y(N),fx(N),fy(N),fxH(N),fyH(N))

ixy=0
print *,'start coord'
do i = 1,N
15       x(i) = random(f)*500.E0
y(i) = random(f)*500.E0
ixy=0
do j=1,i-1
rr=sqrt( (x(i)-x(j) )**2+( y(i)-y(j) )**2)
if (rr.lt.1.E0) ixy=1
enddo
if (ixy.eq.1) goto 15
x(i)=0.E0
y(i)=0.E0
fx(i)=0.0E0
fy(i)=0.0E0
fxH(i)=0.0E0
fyH(i)=0.0E0
enddo
print *,'finish coord'

! Initialize GPU device
istat = cudaSetDevice(idevice)

print *,'calling mmul'
call force(x,y,fx,fy,N,nTr)

call system_clock( count=c1 )
do i = 1,N
do j=1,N
rx=x(i)-x(j)
ry=y(i)-y(j)
fxH(i)=fxH(i)+1.E0 !rx*0.0001
fyH(i)=fyH(i)+1.E0 !ry*0.0001
enddo
enddo
call system_clock( count=c2 )
! Check for errors
CPUtime=c2-c1
print *,''
print *, 'CPU time:', CPUtime, ' microseconds'

ierr = 0
do i = 1,N
diff = abs((fxH(i) - fx(i))+(fyH(i) - fy(i)))
if ( diff.ge.2.0e-2 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'fx(',i,') = ',fx(i), ' should be ', fxH(i), ' error=', diff
endif
endif
enddo

if( ierr == 0 )then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!!'
endif

end program
``````

fxH(i)=fxH(i)+1.E0 is made in order to show how the results are different

These two programs calculate forces acting on every atom of the system. Forces are calculated using the neighbors lists.

The first code is the most obvious solution of the force computation procedure, while in the second case forces are determined using advantages of the shared memory. Unfortunately the secomd program works three times slower then the first one. Does anybody know where the mistake is?

NnMx – the maximum number of neighbors, in this case 128 (in fact it never reached, and is taken just for sure)
jarlst(NNnMx) – the Verlet lists (neighbor list). For example: indexes of Neighbors of the atom 6 are stored in jarlst array from (6128+1) to (7*128)
njlst(N) – number of neighbors for each specific atom
x(N),y(N),z(N),fx(N),fy(N),fz(N) – coordinates and forces acting on each particle

The first code:

``````     attributes(global) subroutine ForceKernel(n,NnMx,rcCh,Re,bt,De,x,y,z,jarlst,njlst,&
&fx,fy,fz,Xh,Yh,Zh)
implicit real(4)(a-h,o-z)
real(4), device :: x(N),y(N),z(N),fx(N),fy(N),fz(N)
integer, device :: jarlst(N*NnMx),njlst(N)
integer, value :: N,NnMx
real(4)::x0,y0,z0,fxS,fyS,fzS,dfX,dfY,dfZ,rr,dflj,rx,ry,rz
real(4), value::Re,bt,De
real(4), value :: Xh,Yh,Zh,rcCh
integer :: i,j,jm, kb, k, tx, bx, ib, ic
i=(blockidx%x-1)*blockdim%x+tx
if (i.le.N) then
fx(i)=0.E0; fy(i)=0.E0; fz(i)=0.E0
fxS=0.E0;fyS=0.E0;fzS=0.E0
do ic=1,njlst(i)
rx=x(i)-X(jarlst((i-1)*NnMx+ic))
ry=y(i)-Y(jarlst((i-1)*NnMx+ic))
rz=z(i)-Z(jarlst((i-1)*NnMx+ic))
rr=sqrt(rx*rx+ry*ry+rz*rz)
dflj=0.E0
if (rr.le.rcCh) then
dexp1ij=2.E0*exp(-2.E0*bt*(rr-Re))
dexp2ij=2.E0*exp(-bt*(rr-Re))
dflj=bt*De*(dexp1ij-dexp2ij)/rr
fxS=fxS+dflj*rx
fyS=fyS+dflj*ry
fzS=fzS+dflj*rz
endif
enddo
fx(i)=fxS
fy(i)=fyS
fz(i)=fzS
endif
end subroutine ForceKernel
``````

The second code:

``````attributes(global) subroutine ForceKernel(n,NnMx,rcCh,Re,bt,De,x,y,z,jarlst,njlst,&
&fx,fy,fz,Xh,Yh,Zh)
implicit real(4)(a-h,o-z)
real(4), device :: x(N),y(N),z(N),fx(N),fy(N),fz(N)
integer, device :: jarlst(N*NnMx),njlst(N)
integer, value :: N,NnMx
real(4)::x0,y0,z0,fxS,fyS,fzS,dfX,dfY,dfZ,rr,dflj,rx,ry,rz
real(4), value::Re,bt,De
real(4), value :: Xh,Yh,Zh,rcCh
integer :: i,j,jm, kb, k, tx, bx, ib, ic,i0,ty,Ny
real(4), shared :: Xsub(64,8),Ysub(64,8),Zsub(64,8)
Ny=blockdim%y
i=(blockidx%x-1)*blockdim%x+tx
if (i.le.N) then
fxS=0.E0;fyS=0.E0;fzS=0.E0
x0=x(i); y0=y(i);z0=z(i);i0=i
do ib=0,NnMx-Ny,Ny
Xsub(tx,ty)=X(jarlst((i-1)*NnMx+ib+ty))
Ysub(tx,ty)=Y(jarlst((i-1)*NnMx+ib+ty))
Zsub(tx,ty)=Z(jarlst((i-1)*NnMx+ib+ty))
do ic=1,Ny
rx=x0-Xsub(tx,ic)
ry=y0-Ysub(tx,ic)
rz=z0-Zsub(tx,ic)
rr=sqrt(rx*rx+ry*ry+rz*rz)
dflj=0.E0
if (rr.le.rcCh) then
dexp1ij=2.E0*exp(-2.E0*bt*(rr-Re))
dexp2ij=2.E0*exp(-bt*(rr-Re))
dflj=bt*De*(dexp1ij-dexp2ij)/rr
fxS=fxS+dflj*rx
fyS=fyS+dflj*ry
fzS=fzS+dflj*rz
endif
enddo
enddo
fx(i)=fxS
fy(i)=fyS
fz(i)=fzS
endif
end subroutine ForceKernel
``````

Hi Mike,

For the first posted code, the wrong answers are due to your use of the inner loop with a fixed iteration of 64. This forces the calculations to always go a little beyond N when N is not a multiple of 64.

As for the second post, I don’t think using shared memory is needed here since you only use the shared data once. Shared memory only helps when you need to access the data multiple times, otherwise you’re just adding overhead.

• Mat