N-body simulation using shared memory

The following code is a simple model of n-bode force calculation. It uses an advantage of shared memory, it works perfect if the number of particles (N) is number is divisible by the block size (nTr) without a rest, otherwise it gives the wrong answer. Does anybody have idea how to fix this problem?

module force_mod
    use cudafor
    contains
    attributes(global) subroutine force_kernel(x,y,fx,fy,N,nTr)
    real,device :: x(N), y(N),fx(N),fy(N)
    real::x0,y0,fxS,fyS
    integer, value :: N, nTr
    integer :: i, j, kb, k, tx, bx, ib, ic
    real, shared :: Xsub(64), Ysub(64)
    fxS=0.E0;fyS=0.E0
	tx=threadidx%x
    i=(blockidx%x-1)*nTr+tx
    if (i.le.N) then
    x0=x(i); y0=y(i);
    	do ib=1,N,nTr
			Xsub(tx)=x(ib+tx)
			Ysub(tx)=y(ib+tx)
			call syncthreads()
			do ic=1,nTr
				rx=x0-Xsub(ic)
				ry=y0-Ysub(ic)
				fxS=fxS+1.E0 !rx*0.0001E0
				fyS=fyS+1.E0 !ry*0.0001E0
			enddo
			call syncthreads()
			
		enddo
		fx(i)=fxS
		fy(i)=fyS 
	endif
    end subroutine force_kernel


	subroutine force(x,y,fx,fy,N,nTr)
	implicit real(4)(a-h,o-z)
    real, dimension(:) :: x,y,fx,fy
	integer :: N,nTr
    real, device, allocatable, dimension(:) :: xD,yD,fxD,fyD
    type(dim3) :: dimGrid, dimBlock
    integer :: r
    real ctimeall, ctimekernel
    integer c1, c2, c3, c4
    call system_clock( count=c1 )
    allocate(xD(N),yD(N),fxD(N),fyD(N))
	xD=x(1:N)
    yD =y(1:N)
	fxD=fx(1:N)
    fyD=fy(1:N)
    dimGrid = dim3( N/nTr+1, 1, 1 )
    dimBlock = dim3( nTr, 1, 1 )
    call system_clock( count=c2 )
    call force_kernel<<<dimGrid>>>(xD,yD,fxD,fyD,N,nTr)
    r = cudathreadsynchronize()
    call system_clock( count=c3 )
    fx(1:N)=fxD
    fy(1:N)=fyD
    call system_clock( count=c4 )
    ctimekernel = c3 - c2
    ctimeall = c4 - c1
    print *, 'GPU time'
    print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
    print *, 'Total time including data xfer: ', ctimeall, ' microseconds' 
	deallocate(xD,yD,fxD,fyD)
    end subroutine force
end module force_mod



program forces
   use force_mod
   implicit real(4)(a-h,o-z)
   real,dimension(:),allocatable :: x,y,fx,fy,fxH,fyH
   integer N,i,j,ixy
   integer idevice, istat
   real:: f
   integer c1, c2, CPUtime
! Begin execution
   nTr=64
   N = 1024*8
   print *,N
   idevice = 0
   print *,' arrays size ', N
   allocate(x(N),y(N),fx(N),fy(N),fxH(N),fyH(N))
	
	ixy=0
	print *,'start coord'
      do i = 1,N
15       x(i) = random(f)*500.E0
         y(i) = random(f)*500.E0
         ixy=0
         do j=1,i-1
		rr=sqrt( (x(i)-x(j) )**2+( y(i)-y(j) )**2)
         if (rr.lt.1.E0) ixy=1
         enddo
         if (ixy.eq.1) goto 15
         x(i)=0.E0
         y(i)=0.E0
         fx(i)=0.0E0
         fy(i)=0.0E0
         fxH(i)=0.0E0
         fyH(i)=0.0E0
      enddo
	print *,'finish coord'

! Initialize GPU device
  istat = cudaSetDevice(idevice)  

   print *,'calling mmul'
   call force(x,y,fx,fy,N,nTr)

   call system_clock( count=c1 )
   do i = 1,N
	  do j=1,N
      rx=x(i)-x(j)
      ry=y(i)-y(j)
	  fxH(i)=fxH(i)+1.E0 !rx*0.0001
	  fyH(i)=fyH(i)+1.E0 !ry*0.0001      
      enddo
   enddo
	call system_clock( count=c2 )
! Check for errors
	CPUtime=c2-c1
	print *,''
    print *, 'CPU time:', CPUtime, ' microseconds'

   ierr = 0
   do i = 1,N
         diff = abs((fxH(i) - fx(i))+(fyH(i) - fy(i)))
         if ( diff.ge.2.0e-2 ) then
            ierr = ierr + 1
            if ( ierr <= 10 ) then
               print *, 'fx(',i,') = ',fx(i), ' should be ', fxH(i), ' error=', diff
            endif
         endif
   enddo

   if( ierr == 0 )then
      print *, ' No errors found'
   else
      print *, ierr, ' ERRORS FOUND!!!'
   endif

end program

fxH(i)=fxH(i)+1.E0 is made in order to show how the results are different

These two programs calculate forces acting on every atom of the system. Forces are calculated using the neighbors lists.

The first code is the most obvious solution of the force computation procedure, while in the second case forces are determined using advantages of the shared memory. Unfortunately the secomd program works three times slower then the first one. Does anybody know where the mistake is?

NnMx – the maximum number of neighbors, in this case 128 (in fact it never reached, and is taken just for sure)
jarlst(NNnMx) – the Verlet lists (neighbor list). For example: indexes of Neighbors of the atom 6 are stored in jarlst array from (6128+1) to (7*128)
njlst(N) – number of neighbors for each specific atom
x(N),y(N),z(N),fx(N),fy(N),fz(N) – coordinates and forces acting on each particle

The first code:

     attributes(global) subroutine ForceKernel(n,NnMx,rcCh,Re,bt,De,x,y,z,jarlst,njlst,&
&fx,fy,fz,Xh,Yh,Zh)
     implicit real(4)(a-h,o-z)
     real(4), device :: x(N),y(N),z(N),fx(N),fy(N),fz(N)
     integer, device :: jarlst(N*NnMx),njlst(N)
     integer, value :: N,NnMx
     real(4)::x0,y0,z0,fxS,fyS,fzS,dfX,dfY,dfZ,rr,dflj,rx,ry,rz
     real(4), value::Re,bt,De
     real(4), value :: Xh,Yh,Zh,rcCh
     integer :: i,j,jm, kb, k, tx, bx, ib, ic
     tx=threadidx%x
    i=(blockidx%x-1)*blockdim%x+tx     
     if (i.le.N) then     
        fx(i)=0.E0; fy(i)=0.E0; fz(i)=0.E0
        fxS=0.E0;fyS=0.E0;fzS=0.E0
          do ic=1,njlst(i)
               rx=x(i)-X(jarlst((i-1)*NnMx+ic))
               ry=y(i)-Y(jarlst((i-1)*NnMx+ic))
               rz=z(i)-Z(jarlst((i-1)*NnMx+ic))
                    rr=sqrt(rx*rx+ry*ry+rz*rz)
                    dflj=0.E0
                    if (rr.le.rcCh) then
                         dexp1ij=2.E0*exp(-2.E0*bt*(rr-Re))
                         dexp2ij=2.E0*exp(-bt*(rr-Re))
                         dflj=bt*De*(dexp1ij-dexp2ij)/rr
                    fxS=fxS+dflj*rx
                    fyS=fyS+dflj*ry
                    fzS=fzS+dflj*rz
                    endif
          enddo
               fx(i)=fxS
               fy(i)=fyS
               fz(i)=fzS
     endif     
end subroutine ForceKernel

The second code:

attributes(global) subroutine ForceKernel(n,NnMx,rcCh,Re,bt,De,x,y,z,jarlst,njlst,&
&fx,fy,fz,Xh,Yh,Zh)
     implicit real(4)(a-h,o-z)
     real(4), device :: x(N),y(N),z(N),fx(N),fy(N),fz(N)
     integer, device :: jarlst(N*NnMx),njlst(N)
     integer, value :: N,NnMx
     real(4)::x0,y0,z0,fxS,fyS,fzS,dfX,dfY,dfZ,rr,dflj,rx,ry,rz
     real(4), value::Re,bt,De
     real(4), value :: Xh,Yh,Zh,rcCh
     integer :: i,j,jm, kb, k, tx, bx, ib, ic,i0,ty,Ny
     real(4), shared :: Xsub(64,8),Ysub(64,8),Zsub(64,8)
     tx=threadidx%x
     ty=threadidx%y
     Ny=blockdim%y
    i=(blockidx%x-1)*blockdim%x+tx
     if (i.le.N) then     
          call syncthreads()
               fxS=0.E0;fyS=0.E0;fzS=0.E0
               x0=x(i); y0=y(i);z0=z(i);i0=i
               do ib=0,NnMx-Ny,Ny
                    Xsub(tx,ty)=X(jarlst((i-1)*NnMx+ib+ty))
                    Ysub(tx,ty)=Y(jarlst((i-1)*NnMx+ib+ty))
                    Zsub(tx,ty)=Z(jarlst((i-1)*NnMx+ib+ty))
                    call syncthreads()
                    do ic=1,Ny
                      rx=x0-Xsub(tx,ic) 
                      ry=y0-Ysub(tx,ic)
                      rz=z0-Zsub(tx,ic)
                      rr=sqrt(rx*rx+ry*ry+rz*rz)
                      dflj=0.E0
                      if (rr.le.rcCh) then
                         dexp1ij=2.E0*exp(-2.E0*bt*(rr-Re))
                         dexp2ij=2.E0*exp(-bt*(rr-Re))
                         dflj=bt*De*(dexp1ij-dexp2ij)/rr
                         fxS=fxS+dflj*rx
                         fyS=fyS+dflj*ry
                         fzS=fzS+dflj*rz
                      endif
                    enddo
                    call syncthreads()
               enddo
               fx(i)=fxS
               fy(i)=fyS
               fz(i)=fzS
     endif     
end subroutine ForceKernel

Hi Mike,

For the first posted code, the wrong answers are due to your use of the inner loop with a fixed iteration of 64. This forces the calculations to always go a little beyond N when N is not a multiple of 64.

As for the second post, I don’t think using shared memory is needed here since you only use the shared data once. Shared memory only helps when you need to access the data multiple times, otherwise you’re just adding overhead.

  • Mat