I have now created an example.
I created the four files:
- main.f
- kernel.f90
- params.f
- makefile
The code crashes when the call acc_set_device_num(…) call is removed from
the !$OMP PARALLEL section inside the the kernel.f90 file.
FATAL ERROR: data in PRESENT clause was not found on device 1: name=gcol_n host:0x73d5900
lfpim_inner_loop line:17
The code should be run with n>1 otherwise it does not crash:
mpiexec -n 8 ./run
We are using the PGI 19.1 compiler and OpenMPI 2.1.5 on a DGX2.
Is something wrong with this code?
Thank you for your help!
main.f
implicit real*8 (a-h,o-z)
include "params.f"
include "mpif.h"
include "accel_lib.h"
dimension
1 f(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb,nsp),
1 f0(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 fc(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 ef0(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 efc(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 f1(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 ef(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb),
1 df3(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb,0:nsp),
1 ebbs(1-nb1:nx+nb1,1-nb1:ny+nb1,nv),
1 bbs(1-nb1:nx+nb1,1-nb1:ny+nb1,nv,nsp),
1 bb(1-nb2:nx+nb2,1-nb2:ny+nb2)
real*8 dvs_GPU(0:nsp)
real*8 dws_GPU(0:nsp)
real*8 ams_GPU(0:nsp)
real*8 qs_GPU(0:nsp)
real*8 tms_GPU(0:nsp)
real*8 vts_GPU(0:nsp)
integer numdevices
! Init MPI
call MPI_Init(ierr)
call MPI_Comm_size(mpi_comm_world,npe0,ierr)
call MPI_Comm_rank(mpi_comm_world,iam0,ierr)
if(iam0.eq.0)write(*,*)"Programstart"
numdevices = acc_get_num_devices(acc_device_nvidia)
if(iam0.eq.0)then
write(*,*)"Setting up OpenACC"
write(*,*)"GPUs per node: ",numdevices
endif
call acc_set_device_num(mod(iam0,numdevices),acc_device_nvidia)
istart=0
!$OMP DO
!$OMP& COLLAPSE(3)
!$OMP& SCHEDULE(dynamic,1)
do i=1,nx
do j=1,ny
do k=1,nz
do l=1,nv
f(l,k,j,i,1)= f(l,k,j,i,1)+3.0*f0(l,k,j,i)
enddo
enddo
enddo
enddo
!$OMP END DO NOWAIT
key0=1
key1=2
key2=2
npx_GPU=npx
npy_GPU=npy
npw_GPU=npw
ncom1=1
nsp1_GPU =nsp+1
adt=4200*1.5
dvs_GPU=88
dvs_GPU=99
ams_GPU = 1.0
qs_GPU = 2.0
tms_GPU = 3.0
vts_GPU = 4.0
nx_GPU =nx
ny_GPU =ny
nz_GPU =nz
nv_GPU =nv
nb_GPU =nb
nb1_GPU =nb1
nb2_GPU =nb2
nxy_GPU=nxy
nnx_GPU=nnx
nny_GPU=nny
gdne_GPU=gdne
ifcol_GPU=ifcol
ifcfm_GPU=ifcfm
b0_GPU =b0
c_GPU =c
pi_GPU=pi
!$acc data copy(
!$acc& key0,key1,key2,
!$acc& npx_GPU,npy_GPU,ncom1,nsp1_GPU,adt,
!$acc& dvs_GPU,dws_GPU,
!$acc& ams_GPU,qs_GPU,tms_GPU,vts_GPU,
!$acc& ef, df3, ebbs,
!$acc& f, bbs,
!$acc& nx_GPU,ny_GPU,nz_GPU,nv_GPU,npw_GPU,
!$acc& nb_GPU,nb1_GPU,nb2_GPU,nxy_GPU,
!$acc& ifcol_GPU,ifcfm_GPU,
!$acc& bb,b0_GPU,c_GPU,gdne_GPU,pi_GPU
!$acc& )
do it = istart,1
call lfpim_GPU(ct,
1 key0, key1, key2,
1 nnx_GPU, nny_GPU, ncom1, nsp1_GPU, adt,
1 dvs_GPU,dws_GPU,
1 ams_GPU,qs_GPU,tms_GPU,vts_GPU,
1 ef,df3(in,in,in,in,0),ebbs,
1 f,df3(in,in,in,in,1),bbs,
1 nx_GPU,ny_GPU,nz_GPU,nv_GPU,npw_GPU,
1 nb_GPU,nb1_GPU,nb2_GPU,nxy_GPU,
1 ifcol_GPU,ifcfm_GPU,
1 bb,b0_GPU,c_GPU,gdne_GPU,pi_GPU)
enddo
!$acc end data
call MPI_Finalize(ierr)
end
kernel.f90
subroutine lfpim_inner_loop( &
nx,ny,nz,nv,npw,nb,nb1,nb2,nxy,npa, &
gcol_n &
)
implicit none
include "mpif.h"
integer nx,ny,nz,nv
integer npw
integer nb,nb1,nb2
integer nxy
integer npa
real*8 gcol_n(1-nb1:nv+nb1,1-nb1:npw+nb1,npa,nxy)
!$acc kernels &
!$acc present( &
!$acc gcol_n&
!$acc )
gcol_n=0.d0
!$acc end kernels
end subroutine lfpim_inner_loop
subroutine lfpim_GPU( &
ct,key0,key1,key2,&
nnx,nny,&
ncom1,npa,adt, &
dv,dw,ami,qi,&
ti,vti, &
f1,df1,bbs1, &
f2,df2,bbs2, &
nx,ny,nz,nv,npw,&
nb,nb1,nb2,nxy,ifcol,ifcfm, &
bb,b0,c,gdne,pi)
use omp_lib
implicit none
include "mpif.h"
include "accel_lib.h"
real*8 ct(20)
integer key0,key1,key2
integer nnx,nny,ncom1
integer npa
real*8 adt
real*8 dv(npa)
real*8 dw(npa)
real*8 ami(npa)
real*8 qi(npa)
real*8 ti(npa)
real*8 vti(npa)
real*8 f1(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb)
real*8 df1(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb)
real*8 bbs1(1-nb1:nx+nb1,1-nb1:ny+nb1,nv)
real*8 f2(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb,2:npa)
real*8 df2(1-nb:nv+nb,1-nb:nz+nb,1-nb:ny+nb,1-nb:nx+nb,2:npa)
real*8 bbs2(1-nb1:nx+nb1,1-nb1:ny+nb1,nv,2:npa)
integer nx,ny,nz,nv
integer npw
integer nb,nb1,nb2
integer nxy
integer ifcol,ifcfm
real*8 bb(1-nb2:nx+nb2,1-nb2:ny+nb2)
real*8 b0,c,gdne,pi
real*8,parameter :: sic = 2.99792458d8
real*8,parameter :: sie = 1.60217733d-19
real*8,parameter :: sime = 9.1093897d-31
real*8,parameter :: gc = sic*1.d2
real*8,parameter :: ge = sie*gc*1.d-1
real*8,parameter :: gme = sime*1.d3
! -----
integer i
integer j
integer ij
integer is
integer l
integer m
integer n
real*8 tmp
! -----
integer k,kk
real*8 gwpe
real*8 bbs(1-nb1:nx+nb1,1-nb1:ny+nb1,nv,npa)
real*8 gdx
real*8 evn
real*8 rbuf1(npa,1-nb:nv+nb,1,nxy,npw)
real*8 rbuf2(npa,1-nb:nv+nb,1,nxy,npw)
real*8 sbuf1(npa,1-nb:nv+nb,1,nxy,npw)
real*8 sbuf2(npa,1-nb:nv+nb,1,nxy,npw)
integer ierr
integer ndata
real*8 g(1-nb1:nv+nb1,1-nb1:npw+nb1,1,nxy,npa)
real*8 dummy
real*8 ct1,ct2
real*8 tmp1,tmp2
real*8 dummy1,dummy2
integer iam0
! -----
real*8 alpha(nxy)
real*8 bb_n(nxy)
real*8 beta(nxy)
real*8 cfmav(nv,npw,nxy)
real*8 cfmav_int_n(nxy)
real*8 cfmax2(nv,npw,nxy)
real*8 cfmax2_int_n(nxy)
real*8 cmr(3,npa,npa,nxy)
real*8 cms_n(3,npa,npa,nxy)
real*8 col_n(10,nv,npw,npa,npa,nxy)
real*8 colf(nv,npw,npa)
real*8 colt_n(nv,npw,npa,nxy)
real*8 dn_all_n(npa,nxy)
real*8 dti_n(nxy)
real*8 dvp_n(nxy)
real*8 dzb_n(npw,npa,nxy)
real*8 dzbs_n(nv,npw,npa,nxy)
real*8 erfb_xb(nv,npw,nxy)
real*8 fdif(nxy)
real*8 fmxla(nv,npw,nxy)
real*8 ftot_n(nxy)
real*8 gap2(nxy)
real*8 gap_n(1-nb1:nv+nb1,1-nb1:npw+nb1,nxy)
real*8 gar_n(1-nb1:nv+nb1,1-nb1:npw+nb1,nxy)
real*8 gcol_n(1-nb1:nv+nb1,1-nb1:npw+nb1,npa,nxy)
real*8 gg_n(1-nb1:nv+nb1,1-nb1:npw+nb1,npa,nxy)
real*8 gp2(nxy)
real*8 gp_n(1-nb1:nv+nb1,1-nb1:npw+nb1,nxy)
real*8 gr_n(1-nb1:nv+nb1,1-nb1:npw+nb1,nxy)
real*8 grgar(nxy)
real*8 gx_n(1-nb1:nv+nb1,1-nb1:npw+nb1,nxy)
real*8 pmat_n(3,3,npa,npa,nxy)
real*8 ti_all_n(npa,nxy)
real*8 v2_n(nv,npw,npa,nxy)
real*8 vl_n(nv,npa,nxy)
real*8 vp_all_n(npa,nxy)
integer nxyMax
! ---
ct1=0.0d0
ct2=0.0d0
call MPI_Comm_rank(mpi_comm_world,iam0,ierr)
write(*,*) "Inside function"
! -----
gwpe = dsqrt(pi*4*gdne*ge**2/gme)
gdx = gc/(gwpe*c)
evn = gdx*gdx*gwpe*gwpe*1e-4/sie*sime
k=1
kk=1
dummy=0.d0
tmp=0.d0
nxyMax=0
do n = 1,nxy
ij=key0*nxy+n
if(ij.le.nx*ny) then
else
exit
endif
nxyMax=nxyMax+1
enddo
!$acc data &
!$acc copy( &
!$acc k,kk, &
!$acc gdx,evn &
!$acc ) &
!$acc create( &
!$acc bbs, &
!$acc rbuf1,rbuf2,sbuf1,sbuf2, &
!$acc g, &
!$acc alpha, &
!$acc bb_n, &
!$acc beta, &
!$acc cfmav,cfmav_int_n,cfmax2,cfmax2_int_n, &
!$acc cmr,cms_n, &
!$acc col_n,colf,colt_n, &
!$acc dn_all_n, &
!$acc dti_n,dvp_n,dzb_n,dzbs_n, &
!$acc erfb_xb, &
!$acc fdif,fmxla,ftot_n, &
!$acc gap2,gap_n,gar_n,gcol_n, &
!$acc gg_n,gp2,gp_n,gr_n, &
!$acc grgar,gx_n, &
!$acc pmat_n, &
!$acc ti_all_n, &
!$acc v2_n,vl_n, &
!$acc vp_all_n &
!$acc )
! c........................................................gather f(l) -> g(l,m)
!$acc kernels &
!$acc present( &
!$acc bbs,bbs2,bbs1, &
!$acc sbuf1,rbuf1,sbuf2,rbuf2, &
!$acc bb_n,bb &
!$acc )
bbs(:,:,:,1)=bbs1(:,:,:)
bbs(:,:,:,2:npa)=bbs2(:,:,:,2:npa)
sbuf1=0.0
rbuf1=0.0
sbuf2=0.0
rbuf2=0.0
bb_n=0.0d0
do n = 1,nxyMax
ij=key0*nxy+n
j=mod(ij-1,ny)+1
i=(ij-j)/ny+1
bb_n(n)=bb(i,j)
enddo
!$acc end kernels
!$acc update device(kk)
!$acc kernels &
!$acc present( &
!$acc sbuf1,f1,f2, &
!$acc kk &
!$acc )
!$acc loop independent collapse(2)
do n=1,nxy
do m=0,npw-1
ij=m*nxy+n
j=mod(ij-1,ny)+1
i=(ij-j)/ny+1
if(ij.le.nx*ny) then
do l=1-nb,nv+nb
sbuf1(1,l,k,n,m+1)=f1(l,kk,j,i)
do is=2,npa
sbuf1(is,l,k,n,m+1)=f2(l,kk,j,i,is)
enddo
enddo
endif
enddo
enddo
!$acc end kernels
ndata=(nv+2*nb)*nxy*npa
#ifdef GPU_Direct
!$acc host_data use_device( &
!$acc sbuf1,rbuf1 &
!$acc )
#else
!$acc update host(sbuf1)
#endif
call mpi_alltoall( &
sbuf1,ndata,mpi_double_precision, &
rbuf1,ndata,mpi_double_precision,ncom1,ierr)
#ifdef GPU_Direct
!$acc end host_data
#else
!$acc update device(rbuf1)
#endif
do kk=1,nz
!$acc kernels &
!$acc present( &
!$acc g,rbuf1 &
!$acc )
g=0.d0
do n=1,nxy
do m=1,npw
do l=1-nb,nv+nb
do is=1,npa
g(l,m,k,n,is)=rbuf1(is,l,k,n,m)
enddo
enddo
enddo
enddo
!$acc end kernels
!$acc kernels &
!$acc present( &
!$acc sbuf2,rbuf2 &
!$acc )
sbuf2=rbuf2
rbuf2=0.0
!$acc end kernels
! -----
!$OMP PARALLEL
call acc_set_device_num(mod(iam0,16),acc_device_nvidia)
!$OMP MASTER
if(kk+1.le.nz) then
!$acc update device(kk)
!$acc kernels &
!$acc present( &
!$acc sbuf1,f1,f2, &
!$acc kk &
!$acc )
!$acc loop independent collapse(2)
do n=1,nxy
do m=0,npw-1
ij=m*nxy+n
j=mod(ij-1,ny)+1
i=(ij-j)/ny+1
if(ij.le.nx*ny) then
do l=1-nb,nv+nb
sbuf1(1,l,k,n,m+1)=f1(l,kk+1,j,i)
do is=2,npa
sbuf1(is,l,k,n,m+1)=f2(l,kk+1,j,i,is)
enddo
enddo
endif
enddo
enddo
!$acc end kernels
endif
if(kk+1.le.nz) then
#ifdef GPU_Direct
!$acc host_data use_device( &
!$acc sbuf1,rbuf1 &
!$acc )
#else
!$acc update host(sbuf1)
#endif
call mpi_alltoall( &
sbuf1,ndata,mpi_double_precision, &
rbuf1,ndata,mpi_double_precision,ncom1,ierr)
#ifdef GPU_Direct
!$acc end host_data
#else
!$acc update device(rbuf1)
#endif
endif
if(kk-1.ge.1) then
#ifdef GPU_Direct
!$acc host_data use_device( &
!$acc sbuf2,sbuf1 &
!$acc )
#else
!$acc update host(sbuf2)
#endif
call mpi_alltoall( &
sbuf2,ndata,mpi_double_precision, &
sbuf1,ndata,mpi_double_precision,ncom1,ierr)
#ifdef GPU_Direct
!$acc end host_data
#else
!$acc update device(sbuf1)
#endif
endif
!$OMP END MASTER
!$OMP SINGLE
! !$acc update device (g)
call lfpim_inner_loop( &
nx,ny,nz,nv,npw,nb,nb1,nb2,nxy,npa, &
gcol_n &
)
!$OMP END SINGLE
!$OMP END PARALLEL
enddo
kk=nz
!$acc end data
return
end subroutine lfpim_GPU
params.f
parameter(
1 npx = 1, npy = 2, npw = 4, node = npx*npy, node0=node*npw,
1 nx = 32, ny = 32, nz = 32, nv = 32,
1 ifdbg = 1, ifld = 1, ifwo = 0, ifcol = 1,
1 iftem = 2, iflfpm = 1, ifcfm = 1, ifm0 = 1,
1 b0 = 0.99,
1 c = 0.25,
1 gdne = 5.6d13,
1 pi = 3.141592653589793d0,
1 nsp= 1,
1 nxy=(nx*ny-1)/npw+1,
1 nb=2, nb1=nb+nb, nb2=nb1+nb
makefile
CC = mpif90
RM = /bin/rm
PROG = run
OBJS = main.o kernel.f90
OPTS = -O3 -ta=tesla:cc70 -acc -Minfo=accel -Minfo -mp -mcmodel=medium -DGPU_Direct -Mpreprocess
%.o : %.f
${CC} ${OPTS} -c $<
%.o : %.f90
${CC} ${OPTS} -c $<
all : ${PROG}
${PROG} : ${OBJS}
${CC} -Mcuda ${OPTS} -o $@ ${OBJS}
clean :
${RM} -f ${PROG} *.o