Hi Mat,
I created a small example which shows the error. Hope you can reproduce it.
main.f90
program halfprectest
  ! Driver that calls the half-precision (real*2) routines method1 and
  ! method2 from inside an OpenACC kernels region.  Every MPI rank selects
  ! device mod(rank, numdevices) when at least one NVIDIA device is reported.
  use mpi
  use cudafor
  use accel_lib
  implicit none

  ! Block edge length and number of colors used by the blocked sweeps.
  integer, parameter :: nblock = 2
  integer, parameter :: ncolor = 2
  ! Padding added on each side of every array dimension, and multiples of
  ! it that are only passed through to method1/method2.
  integer, parameter :: nb  = 2
  integer, parameter :: nb1 = 2*nb
  integer, parameter :: nb2 = 4*nb
  integer, parameter :: nb3 = 8*nb
  ! Unpadded extent of each of the four array dimensions.
  integer, parameter :: nv = 32
  integer, parameter :: nz = 32
  integer, parameter :: ny = 32
  integer, parameter :: nx = 32

  integer :: ierr, npe0, iam0, numdevices
  integer :: ss_iter, ss, ii, jj, i, j
  integer :: icolor, iicolor
  integer :: iicolor_no(ncolor*2)
  real*8  :: omega
  real*2  :: omega1_, omega2_
  real*2  :: f(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2  :: df(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2  :: vxyzv_all_low(1-nb:nv+nb+1, 15, 1-nb:ny+nb, 1-nb:nx+nb)

  ! The two routines live in a separate file; declare them external and
  ! tell OpenACC that sequential device versions exist.
  external :: method1, method2
  !$acc routine(method1) seq
  !$acc routine(method2) seq

  call MPI_Init(ierr)
  write(*,*) "Programstart"
  call MPI_Comm_size(mpi_comm_world, npe0, ierr)
  call MPI_Comm_rank(mpi_comm_world, iam0, ierr)

  numdevices = acc_get_num_devices(acc_device_nvidia)
  if (iam0 .eq. 0) then
    write(*,*)"# of GPUs per node:",numdevices
  end if
  ! Guard against mod(..., 0) when no device is available.
  if (numdevices .ne. 0) then
    call acc_set_device_num(mod(iam0, numdevices), acc_device_nvidia)
  end if

  ss_iter = 4
  omega   = 0.95d0
  ! NOTE(review): omega1_/omega2_ are assigned but never read in this program.
  omega1_ = omega
  omega2_ = (1.0d0 - omega)

  ! The color table was previously read without ever being assigned, which
  ! left icolor (and therefore the ii start index) undefined.
  ! NOTE(review): the order 1..ncolor, 1..ncolor is a placeholder that keeps
  ! every entry inside 1..ncolor -- confirm the order the application needs.
  do iicolor = 1, ncolor*2
    iicolor_no(iicolor) = mod(iicolor - 1, ncolor) + 1
  end do

  ! Give the device arrays defined contents before they are transferred.
  f             = 0.0
  df            = 0.0
  vxyzv_all_low = 0.0

  ! df is updated by method1/method2, so it is copied back; the remaining
  ! arrays are only read on the device.
  !$acc data copy(df) copyin(f, vxyzv_all_low, iicolor_no)
  !$acc kernels
  do ss = 1, ss_iter

    ! First half of the color table: ascending i/j inside each block.
    !$acc loop seq
    do iicolor = 1, ncolor
      icolor = iicolor_no(iicolor)
      do jj = 1, ny, nblock
        ! The start offset alternates with the block row and the color.
        !$acc loop seq
        do ii = 1 + mod(jj/nblock + (icolor - 1), 2) * nblock, &
                nx, nblock*ncolor
          !$acc loop seq
          do i = ii, min(ii + nblock - 1, nx)
            do j = jj, min(jj + nblock - 1, ny)
              call method1( &
                   df, f, &
                   i, j, &
                   vxyzv_all_low, &
                   nx, ny, nz, nv, &
                   nb, nb1, nb2, nb3)
            end do
          end do
        end do
      end do
    end do !! icolor

    ! Second half of the color table: descending i/j inside each block.
    !$acc loop seq
    do iicolor = ncolor + 1, ncolor*2
      icolor = iicolor_no(iicolor)
      !$acc loop seq
      do jj = 1, ny, nblock
        do ii = 1 + mod(jj/nblock + (icolor - 1), 2) * nblock, &
                nx, nblock*ncolor
          !$acc loop seq
          do i = min(ii + nblock - 1, nx), ii, -1
            do j = min(jj + nblock - 1, ny), jj, -1
              call method2( &
                   df, f, &
                   i, j, &
                   vxyzv_all_low, &
                   nx, ny, nz, nv, &
                   nb, nb1, nb2, nb3)
            end do
          end do
        end do
      end do
    end do !! icolor

  end do !! ss
  !$acc end kernels
  !$acc end data

  call MPI_Barrier(mpi_comm_world, ierr)
  call MPI_Finalize(ierr)
end program halfprectest
methods.f90
! Forward (k = 1 .. nz) update of the column df(1:nv, :, j, i).
!
! For every k and l the routine starts from f(l,k,j,i) and adds or subtracts
! eight products vxyzv_all(l,m,j,i) * df(l,k,<neighbor>), m = 1..8, where the
! neighbors are the columns at i-1, i+1, i-2, i+2, j-1, j+1, j-2 and j+2.
! The result for level k is held in dfc_ and written to df(l,k,j,i) one
! k-iteration later (the last level is written after the loop).
!
! Arguments:
!   df         in/out  array that is read at neighboring columns and
!                      overwritten at column (j,i)
!   f          in      array supplying the starting value at (l,k,j,i)
!   i, j       in      indices of the column to update (4th and 3rd dimension)
!   vxyzv_all  in      coefficients; only second-dimension entries 1..8 are read
!   nx,ny,nz,nv in     unpadded extents used in the array bounds
!   nb         in      padding on each side of every array dimension
!   nb1,nb2,nb3 in     not referenced in this routine
subroutine method1( &
     df, f, &
     i, j, &
     vxyzv_all, &
     nx, ny, nz, nv, &
     nb, nb1, nb2, nb3)
  !$acc routine(method1) seq
  implicit none

  ! Integers are declared first because the array bounds below depend on them.
  integer, intent(in) :: i, j
  integer, intent(in) :: nx, ny, nz, nv
  integer, intent(in) :: nb, nb1, nb2, nb3

  real*2, intent(in)    :: f(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2, intent(inout) :: df(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2, intent(in)    :: vxyzv_all(1-nb:nv+nb+1, 15, 1-nb:ny+nb, 1-nb:nx+nb)

  real*8, parameter :: omega = 0.05d0
  real*2  :: dfc_(nv)
  real*2  :: omega1_, omega2_
  integer :: k, l

  ! NOTE(review): omega1_/omega2_ are assigned but never read in this routine.
  omega1_ = omega
  omega2_ = (1.0d0 - omega)

  ! Seed the carry buffer with the current level-0 values so that the first
  ! delayed store below rewrites df(:,0,j,i) with itself.
  do l = 1, nv
    dfc_(l) = df(l, 0, j, i)
  end do

  do k = 1, nz
    do l = 1, nv
      ! Store the value carried over from the previous level ...
      df(l, k-1, j, i) = dfc_(l)
      ! ... then build the value for this level.  The terms are accumulated
      ! one statement at a time so every partial sum is rounded to real*2.
      dfc_(l) = f(l, k, j, i)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 1, j, i) * df(l, k, j, i-1)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 2, j, i) * df(l, k, j, i+1)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 3, j, i) * df(l, k, j, i-2)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 4, j, i) * df(l, k, j, i+2)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 5, j, i) * df(l, k, j-1, i)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 6, j, i) * df(l, k, j+1, i)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 7, j, i) * df(l, k, j-2, i)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 8, j, i) * df(l, k, j+2, i)
    end do
  end do

  ! Flush the value computed for the last level.
  do l = 1, nv
    df(l, nz, j, i) = dfc_(l)
  end do
end subroutine method1
! Backward (k = nz .. 1) update of the column df(1:nv, :, j, i).
!
! For every k and l the routine starts from f(l,k,j,i) and adds or subtracts
! eight products vxyzv_all(l,m,j,i) * df(l,k,<neighbor>), m = 1..8, where the
! neighbors are the columns at i-1, i+1, i-2, i+2, j-1, j+1, j-2 and j+2.
! The result for level k is held in dfc_ and written to df(l,k,j,i) one
! k-iteration later (level 1 is written after the loop).
!
! Arguments:
!   df         in/out  array that is read at neighboring columns and
!                      overwritten at column (j,i)
!   f          in      array supplying the starting value at (l,k,j,i)
!   i, j       in      indices of the column to update (4th and 3rd dimension)
!   vxyzv_all  in      coefficients; only second-dimension entries 1..8 are read
!   nx,ny,nz,nv in     unpadded extents used in the array bounds
!   nb         in      padding on each side of every array dimension
!   nb1,nb2,nb3 in     not referenced in this routine
subroutine method2( &
     df, f, &
     i, j, &
     vxyzv_all, &
     nx, ny, nz, nv, &
     nb, nb1, nb2, nb3)
  !$acc routine(method2) seq
  implicit none

  ! Integers are declared first because the array bounds below depend on them.
  integer, intent(in) :: i, j
  integer, intent(in) :: nx, ny, nz, nv
  integer, intent(in) :: nb, nb1, nb2, nb3

  real*2, intent(in)    :: f(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2, intent(inout) :: df(1-nb:nv+nb, 1-nb:nz+nb, 1-nb:ny+nb, 1-nb:nx+nb)
  real*2, intent(in)    :: vxyzv_all(1-nb:nv+nb+1, 15, 1-nb:ny+nb, 1-nb:nx+nb)

  real*8, parameter :: omega = 0.05d0
  real*2  :: dfc_(nv)
  real*2  :: omega1_, omega2_
  integer :: k, l

  ! NOTE(review): omega1_/omega2_ are assigned but never read in this routine.
  omega1_ = omega
  omega2_ = (1.0d0 - omega)

  ! Seed the carry buffer with the current level nz+1 values so that the
  ! first delayed store below rewrites df(:,nz+1,j,i) with itself.
  do l = 1, nv
    dfc_(l) = df(l, nz+1, j, i)
  end do

  do k = nz, 1, -1
    do l = 1, nv
      ! Store the value carried over from the level above ...
      df(l, k+1, j, i) = dfc_(l)
      ! ... then build the value for this level.  The terms are accumulated
      ! one statement at a time so every partial sum is rounded to real*2.
      dfc_(l) = f(l, k, j, i)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 1, j, i) * df(l, k, j, i-1)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 2, j, i) * df(l, k, j, i+1)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 3, j, i) * df(l, k, j, i-2)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 4, j, i) * df(l, k, j, i+2)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 5, j, i) * df(l, k, j-1, i)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 6, j, i) * df(l, k, j+1, i)
      dfc_(l) = dfc_(l) - vxyzv_all(l, 7, j, i) * df(l, k, j-2, i)
      dfc_(l) = dfc_(l) + vxyzv_all(l, 8, j, i) * df(l, k, j+2, i)
    end do
  end do

  ! Flush the value computed for level 1.
  do l = 1, nv
    df(l, 1, j, i) = dfc_(l)
  end do
end subroutine method2
makefile
# Build the reproducer (a.out) from both source files in one compiler call.
# Neither target names a file it creates, so both are declared phony.
.PHONY: all clean

all:
	mpif90 -Mpreprocess -acc -Mcuda -Minfo -ta=multicore main.f90 methods.f90

# -f keeps "make clean" from failing when nothing has been built yet.
clean:
	rm -f *.o a.out