Hi,
I have a working pure MPI code and am trying to parallelize it further with OpenACC. It is a Smoothed Particle Hydrodynamics (SPH) code. After changing the data structure of the code (to get a good GPU speed-up), I found that the problem comes from the neighbor-search subroutine. Here is what I have discovered so far:
- Pure MPI works fine.
- With OpenMP (only in the neighbor-search subroutine), the code works, but it never uses more than two threads (200 % CPU), even though I set, for example, “export OMP_NUM_THREADS=8” and would expect 800 % CPU activity (a minimal thread-count check is sketched after this list).
- With OpenACC, the code runs correctly only for the very first timestep and then crashes because the per-cell particle count or the pair count comes out larger than it should be.
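In case it is useful, here is the minimal check I mean above (my own sketch, not part of the code below); it only calls omp_get_max_threads and omp_get_num_threads to see how many threads the OpenMP runtime actually provides inside a parallel region:

subroutine check_omp_threads
  use omp_lib
  implicit none
  integer :: nthreads
  ! Thread count the runtime would use for the next parallel region
  print *, 'omp_get_max_threads() = ', omp_get_max_threads()
  !$omp parallel
  !$omp single
  ! Actual team size inside the parallel region
  nthreads = omp_get_num_threads()
  print *, 'threads inside parallel region = ', nthreads
  !$omp end single
  !$omp end parallel
end subroutine check_omp_threads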
I have tried multiple ways to work around this problem, but in vain so far :(
Any insight would be deeply appreciated. I am attaching the code below:
subroutine neightbor_list
  use mpi
  use openacc
  use omp_lib
  use YX_variable
  implicit none
  integer :: np, npn, n, nn, i, j, k, ci, cj, ck, ppi, ppj, ni, nj, &
             cxmin, cxmax, cymin, cymax, czmin, czmax, err, ii
  real(8) :: xxmin, xxmax, yymin, yymax, zzmin, zzmax
  integer, allocatable :: cell(:,:,:,:), cellNum(:,:,:)
  real(8) :: dr, dx(NSD)

  npn = gnp(ngr+1)

  !!$acc update host(xt)   ! Only for OpenMP
  ! Bounding box of all particles and the corresponding cell-index range
  !$acc kernels
  xxmin = minval(xt(1,1:npn))
  xxmax = maxval(xt(1,1:npn))
  yymin = minval(xt(2,1:npn))
  yymax = maxval(xt(2,1:npn))
  zzmin = minval(xt(3,1:npn))
  zzmax = maxval(xt(3,1:npn))
  cxmin = ceiling(xxmin/kh)
  cxmax = ceiling(xxmax/kh)
  cymin = ceiling(yymin/kh)
  cymax = ceiling(yymax/kh)
  czmin = ceiling(zzmin/kh)
  czmax = ceiling(zzmax/kh)
  !$acc end kernels

  allocate(cell(NPCmax, cxmin-1:cxmax+1, cymin-1:cymax+1, czmin-1:czmax+1), &
           cellNum(cxmin-1:cxmax+1, cymin-1:cymax+1, czmin-1:czmax+1), stat=err)
  !$acc enter data create(cell, cellNum)
  !$acc kernels
  cell    = 0
  cellNum = 0
  npp     = 0
  npnl    = 0
  !$acc end kernels

  ! Find the particles in every cell
  !$acc parallel loop private(i,j,k,n,np)
  !!$omp parallel do private(i,j,k,n,np)   ! schedule(dynamic)
  do np = 1, npn
    ! Cell indices of particle np
    i = ceiling(xt(1,np)/kh)
    j = ceiling(xt(2,np)/kh)
    k = ceiling(xt(3,np)/kh)
    !!$omp atomic
    !$acc atomic
    cellNum(i,j,k) = cellNum(i,j,k) + 1
    !!$omp end atomic
    n = cellNum(i,j,k)   ! slot of particle np within cell (i,j,k)
    !$acc wait
    !!$omp taskwait
    cell(n,i,j,k) = np
  enddo
  !!$omp end parallel do
  !$acc end parallel
  !$acc wait

  npn = gnp(ngr)
  ! Find neighboring particles
  !$acc parallel loop private(i,j,k, ppi,ck,cj,ci,nj,ppj,dx,dr,n,nn)
  !!$omp parallel do private(i,j,k, ppi,ck,cj,ci,nj,ppj,dx,dr,n,nn)
  do ppi = 1, npn
    i = ceiling(xt(1,ppi)/kh)
    j = ceiling(xt(2,ppi)/kh)
    k = ceiling(xt(3,ppi)/kh)
    ! Loop over the 27 cells surrounding particle ppi
    do ck = k-1, k+1
      do cj = j-1, j+1
        do ci = i-1, i+1
          !$acc wait
          !!$omp taskwait
          n = cellNum(ci,cj,ck)
          do nj = 1, n
            ppj = cell(nj, ci,cj,ck)
            if (ppi /= ppj) then
              dx = xt(:,ppi) - xt(:,ppj)
              dr = dsqrt(dot_product(dx,dx))
              if (dr < kh) then
                !!$omp atomic
                !$acc atomic
                npnl(ppi) = npnl(ppi) + 1
                !!$omp end atomic
                nn = npnl(ppi)
                pair(nn,ppi) = ppj
              end if
            endif
          enddo
        enddo
      enddo
    enddo
  enddo
  !!$omp end parallel do
  !$acc end parallel

  !!$acc update device(pair, npnl)   ! Only for OpenMP
  !$acc exit data delete(cell, cellNum)
  deallocate(cell, cellNum, stat=err)
end subroutine
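Regarding the OpenACC crash: my understanding is that the atomic increment of cellNum (and likewise of npnl) and the plain read of the updated value on the next line are two separate operations, so I have also been looking at the atomic capture construct, which combines them into a single atomic update. This is only a sketch under that assumption (I have not yet confirmed it behaves correctly with my compiler), shown for the cell-filling loop:

  ! Sketch only: replace the separate atomic increment + plain read with an
  ! atomic capture, so the new counter value is obtained atomically.
  ! (Assumes the compiler supports the OpenACC atomic capture construct.)
  !$acc atomic capture
  cellNum(i,j,k) = cellNum(i,j,k) + 1
  n = cellNum(i,j,k)
  !$acc end atomic
  cell(n,i,j,k) = np

The OpenMP version would use !$omp atomic capture / !$omp end atomic in the same place. Does this look like the right direction, or is the problem more likely elsewhere (for example the !$acc wait inside the loops, or the data movement of pair and npnl)?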
Thank you,
Yongsuk