My reproducer is similar to Michael Wolfe’s famous Jacobi example but has a total of 4 kernel loops to implement a Red-Black-SOR solver with separate red and black arrays.
Compilation fails with
PGF90-W-0155 … Missing branch target block
which seems to be a known problem.
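Besides that, the loop-carried dependence diagnosis appears to be incorrect. Each iteration of the following loop reads and writes only its own element xred(ix,iy), and xblk is only read, so there is no dependence between iterations of this type of loop: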
! Excerpt: inner ix loop of one sweep over row iy.
! Each iteration reads xred(ix,iy), forms a correction c from bred and four
! xblk neighbours weighted by ared2..ared5, scales it by omega_f and writes
! xred(ix,iy) back. xblk is only read in this loop.
! xx and c are assigned before they are used in every iteration.
! errff accumulates the maximum of abs(c/xx) over the iterations.
!$acc do parallel
do ix = sps,spe
xx = xred(ix,iy)
c = bred( ix,iy) - xx &
- ared2(ix,iy) * xblk(ix ,iy-1) &
- ared3(ix,iy) * xblk(ix-1,iy ) &
- ared4(ix,iy) * xblk(ix ,iy ) &
- ared5(ix,iy) * xblk(ix ,iy+1)
c = omega_f * c
errff = max(errff, abs(c/xx))
xred(ix,iy) = xx + c
end do
Compile flags are:
pgf90 -ta=nvidia:cuda4.0 -tp=nehalem-64 -r8 -Minfo -fast -c rbsor2d.f90
Partial compiler information is
36, Accelerator scalar kernel generated
37, Complex loop carried dependence of 'xred' prevents parallelization
Loop carried dependence due to exposed use of 'xred(:,i1+1)' prevents parallelization
Inner sequential loop scheduled on host
Accelerator scalar kernel generated
Generated 3 alternate versions of the loop
Generated vector sse code for the loop
Generated 8 prefetch instructions for the loop
Here is the complete reproducer. Any hints are welcome.
subroutine red_black_sor(ared2,ared3,ared4,ared5,ablk2,ablk3,ablk4,ablk5, &
                         bred,bblk,rs_red,re_red,lt_red,rs_blk,re_blk,lt_blk, &
                         xred,xblk,omega_f,nrow,itsor,epsor)
  ! ---------------------------------------------------------------------
  ! Red-black SOR iteration on two separately stored arrays.
  !
  ! Each iteration performs a "red" sweep that updates xred from xblk,
  ! followed by a "black" sweep that updates xblk from xred.  For every
  ! updated point the correction is
  !     c = omega_f * (b - x - a2*n2 - a3*n3 - a4*n4 - a5*n5)
  ! where n2..n5 are values of the other colour's array, and x is replaced
  ! by x + c.  The iteration stops as soon as the largest abs(c/x) seen in
  ! one full iteration (both sweeps) is <= epsor, or after max_iter
  ! iterations.
  !
  ! Arguments:
  !   ared2..ared5   coefficients of the red update            (in)
  !   ablk2..ablk5   coefficients of the black update          (in)
  !   bred, bblk     right-hand sides for red / black          (in)
  !   rs_*, re_*     first / last ix updated in row iy         (in)
  !   lt_*           per-row selector: 0 -> neighbours (ix-1, ix),
  !                  otherwise neighbours (ix, ix+1)           (in)
  !   xred, xblk     solution arrays, updated in place         (inout)
  !   omega_f        factor applied to the correction          (in)
  !   nrow           number of rows (iy = 1..nrow) swept       (in)
  !   itsor          incremented by the number of iterations
  !                  actually performed                        (inout)
  !   epsor          convergence threshold; only read here     (inout)
  !
  ! NOTE(review): the stencil reads rows iy-1 and iy+1 and columns ix-1 or
  ! ix+1 of assumed-shape arrays.  Presumably the caller gives boundary
  ! rows an empty range (rs > re) and keeps rs/re away from the array
  ! edges; confirm against the caller.
  ! NOTE(review): abs(c/xx) divides by the current solution value, so a
  ! zero entry inside an active range yields Inf/NaN in errff.
  ! ---------------------------------------------------------------------
  implicit none
  ! ---------------------------------------------------------------------
  real, dimension(:,:), intent(in) :: ared2,ared3,ared4,ared5,ablk2,ablk3,ablk4,ablk5
  real, dimension(:,:), intent(in) :: bred, bblk
  real, dimension(:,:), intent(inout) :: xred, xblk
  integer, dimension(:), intent(in) :: rs_red, re_red, lt_red, rs_blk, re_blk, lt_blk
  real, intent(in) :: omega_f
  real, intent(inout) :: epsor
  integer, intent(in) :: nrow
  integer, intent(inout) :: itsor
  ! ---------------------------------------------------------------------
  integer, parameter :: max_iter = 10000   ! upper bound on SOR iterations
  integer :: iter,ix,iy,sps,spe
  real :: c,errff,xx
  ! ---------------------------------------------------------------------

  ! Only arrays are kept resident on the device for the whole solve.
  ! Scalars are deliberately absent from the data clauses:
  !  - errff is reset and tested on the host every iteration; a device-only
  !    "local" copy would never reach the host convergence test.
  !  - xx and c are per-iteration temporaries; they are privatised on the
  !    inner loops instead of being shared device variables.
  !  - itsor is updated on the host only, so a copy-out at the end of the
  !    data region must not overwrite it.
  ! NOTE(review): directive changes are not verified on a PGI target here.
  !$acc data region &
  !$acc copyin(ared2,ared3,ared4,ared5,ablk2,ablk3,ablk4,ablk5, &
  !$acc        bred,bblk,rs_red,re_red,lt_red,rs_blk,re_blk,lt_blk) &
  !$acc copy(xred,xblk)
  do iter = 1, max_iter
    errff = 0.0

    ! RED sweep.  Split X arrays require 2 different loop versions,
    ! selected per row by lt_red.
    !$acc region
    !$acc do seq
    do iy = 1, nrow
      sps = rs_red(iy)
      spe = re_red(iy)
      if (lt_red(iy) == 0) then
        !$acc do parallel private(xx,c)
        do ix = sps, spe
          xx = xred(ix,iy)
          c = bred(ix,iy) - xx &
              - ared2(ix,iy) * xblk(ix  ,iy-1) &
              - ared3(ix,iy) * xblk(ix-1,iy  ) &
              - ared4(ix,iy) * xblk(ix  ,iy  ) &
              - ared5(ix,iy) * xblk(ix  ,iy+1)
          c = omega_f * c
          errff = max(errff, abs(c/xx))
          xred(ix,iy) = xx + c
        end do
      else
        !$acc do parallel private(xx,c)
        do ix = sps, spe
          xx = xred(ix,iy)
          c = bred(ix,iy) - xx &
              - ared2(ix,iy) * xblk(ix  ,iy-1) &
              - ared3(ix,iy) * xblk(ix  ,iy  ) &
              - ared4(ix,iy) * xblk(ix+1,iy  ) &
              - ared5(ix,iy) * xblk(ix  ,iy+1)
          c = omega_f * c
          errff = max(errff, abs(c/xx))
          xred(ix,iy) = xx + c
        end do
      end if
    end do
    !$acc end region

    ! BLACK sweep.  Same structure with the roles of xred and xblk swapped;
    ! it reads the xred values written by the red sweep above.
    !$acc region
    !$acc do seq
    do iy = 1, nrow
      sps = rs_blk(iy)
      spe = re_blk(iy)
      if (lt_blk(iy) == 0) then
        !$acc do parallel private(xx,c)
        do ix = sps, spe
          xx = xblk(ix,iy)
          c = bblk(ix,iy) - xx &
              - ablk2(ix,iy) * xred(ix  ,iy-1) &
              - ablk3(ix,iy) * xred(ix-1,iy  ) &
              - ablk4(ix,iy) * xred(ix  ,iy  ) &
              - ablk5(ix,iy) * xred(ix  ,iy+1)
          c = omega_f * c
          errff = max(errff, abs(c/xx))
          xblk(ix,iy) = xx + c
        end do
      else
        !$acc do parallel private(xx,c)
        do ix = sps, spe
          xx = xblk(ix,iy)
          c = bblk(ix,iy) - xx &
              - ablk2(ix,iy) * xred(ix  ,iy-1) &
              - ablk3(ix,iy) * xred(ix  ,iy  ) &
              - ablk4(ix,iy) * xred(ix+1,iy  ) &
              - ablk5(ix,iy) * xred(ix  ,iy+1)
          c = omega_f * c
          errff = max(errff, abs(c/xx))
          xblk(ix,iy) = xx + c
        end do
      end if
    end do
    !$acc end region

    if (errff <= epsor) exit ! Converged !
    ! ------------------------
    ! END OF JACOBI/SOR LOOP
    ! ------------------------
  end do
  !$acc end data region

  ! When the loop runs to completion the do-variable is max_iter + 1, so
  ! clamp it to count only the iterations that were actually performed.
  itsor = itsor + min(iter, max_iter)
end subroutine red_black_sor