Reduction breaking in 15.3?

I have 14.7 and 15.3 installed on OSX. The following works in the former, but not in the latter version. In 14.7, the result is “test ok”, while in 15.3 it prints “sum c failed: 0.000000 ; expected: 1966080”.

Update: I just checked the same on Linux with 14.7 and 15.1 - same result: 14.7 works, 15.1 sets the sum to 0.

[code]
module example  ! OpenACC test: element-wise add and multiply of two arrays, then a sum over each result
contains
! Comments only fill former blank lines or trail statements, so line numbers quoted by the compiler stay valid.
 subroutine data_region(a, b, c, d, sum_c, sum_d)  ! driver: wraps run and both sums in one device data region
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)  ! operands
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)  ! c is filled by add (a + b), d by mult (a * b)
  real, intent(out) :: sum_c, sum_d  ! sum of all elements of c and of d, from reduce_to_sum
! Start an unstructured data region; all four arrays are named in copyin clauses.
!$acc enter data copyin(a), copyin(c), copyin(b), copyin(d)

  call run(a, b, c, d)
  call reduce_to_sum (c, sum_c)
  call reduce_to_sum (d, sum_d)
! End the data region: c and d are copied back to the host, a and b are only deleted on the device.
!$acc exit data delete(a), copyout(c), delete(b), copyout(d)
end subroutine
! run: fills c and d by calling add and mult; it contains no directives of its own.
 subroutine run(a, b, c, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)


  call add (a, b, c)
  call mult (a, b, d)

end subroutine
! add: c(x, y, z) = a(x, y, z) + b(x, y, z) for every element, inside an OpenACC kernels region.
 subroutine add(a, b, c)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  integer :: z  ! index of the innermost loop, marked seq
! y and x index the two outer loops, both marked independent with vector(16).
  integer(4) :: y, x
! The kernels directive itself carries no data clauses.
!$acc kernels
!$acc loop independent vector(16)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     c(x, y, z)= a(x, y, z)+ b(x, y, z)
    end do
   end do
  end do
!$acc end kernels
end subroutine
! mult: d(x, y, z) = a(x, y, z) * b(x, y, z) for every element, inside an OpenACC kernels region.
 subroutine mult(a, b, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: d(256, 256, 10)
  integer :: z  ! index of the innermost loop, marked seq
! y and x index the two outer loops, both marked independent with vector(16).
  integer(4) :: y, x
! The kernels directive itself carries no data clauses.
!$acc kernels
!$acc loop independent vector(16)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     d(x, y, z)= a(x, y, z)* b(x, y, z)
    end do
   end do
  end do
!$acc end kernels
end subroutine
! reduce_to_sum: result = sum of every element of a, with an explicit reduction(+) clause on the outer loop.
 subroutine reduce_to_sum(a, result)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10)
  real, intent(out) :: result
  integer :: z  ! index of the innermost loop, marked seq
! y and x index the two outer loops, both marked independent with vector(16).
  integer(4) :: y, x
! The accumulator is zeroed before the kernels region (double-precision literal assigned to a default real).
  result = 0.0d0
! Only the outer (y) loop directive names the reduction; this thread reports the sum coming back as 0 with 15.x.
!$acc kernels
!$acc loop independent vector(16) reduction(+: result)
  do y=1,256
!$acc loop independent vector(16)
   do x=1,256
!$acc loop seq
    do z=1,10
     result = result + a(x, y, z)
    end do
   end do
  end do
!$acc end kernels
end subroutine

end module example

program main
! Test driver for module example.
! Fills a with 1 and b with 2, calls data_region, then verifies
!   - sum_c against 3 * 256 * 256 * 10 and sum_d against 2 * 256 * 256 * 10,
!   - every element of c against 3 and every element of d against 2.
! Prints "test ok" on success; any failed check prints a message and stops with code 2.
use example
implicit none
real, dimension(256, 256, 10) :: a, b, c, d
real :: sum_c, sum_d, expected_sum
integer :: x, y, z
integer :: fail_x, fail_y, fail_z   ! indices of the first mismatching element; set only on failure
logical :: test                     ! stays .true. until the first element mismatch is seen

a(:,:,:) = 1.0d0
b(:,:,:) = 2.0d0
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
test = .true.

call data_region(a, b, c, d, sum_c, sum_d)
write(6,*) "calculation complete"

! The sums are checked first, so a broken reduction is reported before the element check runs.
expected_sum = 3.0d0 * 256 * 256 * 10
if ( abs(sum_c - expected_sum) > 1E-5 ) then
 write(6,*) "sum c failed: ", sum_c, "; expected: ", expected_sum
 stop 2
end if

expected_sum = 2.0d0 * 256 * 256 * 10
if ( abs(sum_d - expected_sum) > 1E-5 ) then
 write(6,*) "sum d failed: ", sum_d, "; expected: ", expected_sum
 stop 2
end if

do y=1,256
 do x=1,256
  do z=1,10
   ! Record only the first mismatch in either array: once test is .false. nothing is overwritten.
   ! The logical is tested directly; comparing LOGICAL values with .EQ. is not standard Fortran.
   if (test .and. (c(x, y, z) /= 3.0d0 .or. d(x, y, z) /= 2.0d0)) then
    test = .false.
    fail_x = x
    fail_y = y
    fail_z = z
   end if
  end do
 end do
end do

if (test) then
 write(6,*) "test ok"
else
 write(6,*) "test failed"
 write(6,*) "fails at", fail_x, fail_y, fail_z, "C:", c(fail_x, fail_y, fail_z), "D:", d(fail_x, fail_y, fail_z)
 stop 2
end if

stop
end program main

[/code]

Hi Michel,

The example “works” in 14.7 only because the 14.7 compiler thinks there’s a dependency on “result”, given that it has intent(out). Hence only a scalar kernel is generated:

reduce_to_sum:
     89, Generating present_or_copyin(a(:,:,:))
         Generating Tesla code
     91, Accelerator restriction: scalar variable live-out from loop: result
     93, Accelerator restriction: scalar variable live-out from loop: result
     95, Loop is parallelizable
         Accelerator kernel generated

Once we fixed the dependency issue and started generating the parallel kernel, the problem with the reduction was exposed. I can recreate the problem in 14.7 if I use a local variable instead of the argument.

I have added a problem report (TPR#21474) and sent it to engineering for further investigation. The workaround is to remove the “reduction” clause and instead let the compiler discover the reduction. This seems to produce the correct results in this case.

Thanks!
Mat

% cat test_032715.1.f90
module example  ! OpenACC test: element-wise add and multiply of two arrays, then a sum over each result
 contains
 ! Comments only fill former blank lines or trail statements, so line numbers quoted by the compiler stay valid.
  subroutine data_region(a, b, c, d, sum_c, sum_d)  ! driver: wraps run and both sums in one device data region
   use openacc
   implicit none
   real, intent(in) :: a(256, 256, 10), b(256, 256, 10)  ! operands
   real, intent(out) :: c(256, 256, 10), d(256, 256, 10)  ! c is filled by add (a + b), d by mult (a * b)
   real, intent(out) :: sum_c, sum_d  ! sum of all elements of c and of d, from reduce_to_sum
 ! Start an unstructured data region; all four arrays are named in copyin clauses.
 !$acc enter data copyin(a), copyin(c), copyin(b), copyin(d)

   call run(a, b, c, d)
   call reduce_to_sum (c, sum_c)
   call reduce_to_sum (d, sum_d)
 ! End the data region: c and d are copied back to the host, a and b are only deleted on the device.
 !$acc exit data delete(a), copyout(c), delete(b), copyout(d)
 end subroutine
 ! run: fills c and d by calling add and mult; it contains no directives of its own.
  subroutine run(a, b, c, d)
   use openacc
   implicit none
   real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
   real, intent(out) :: c(256, 256, 10), d(256, 256, 10)


   call add (a, b, c)
   call mult (a, b, d)

 end subroutine
 ! add: c(x, y, z) = a(x, y, z) + b(x, y, z) for every element, inside an OpenACC kernels region.
  subroutine add(a, b, c)
   use openacc
   implicit none
   real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
   real, intent(out) :: c(256, 256, 10)
   integer :: z  ! index of the innermost loop, marked seq
 ! y and x index the two outer loops, both marked independent with vector(16).
   integer(4) :: y, x
 ! The kernels directive itself carries no data clauses.
 !$acc kernels
 !$acc loop independent vector(16)
   do y=1,256
 !$acc loop independent vector(16)
    do x=1,256
 !$acc loop seq
     do z=1,10
      c(x, y, z)= a(x, y, z)+ b(x, y, z)
     end do
    end do
   end do
 !$acc end kernels
 end subroutine
 ! mult: d(x, y, z) = a(x, y, z) * b(x, y, z) for every element, inside an OpenACC kernels region.
  subroutine mult(a, b, d)
   use openacc
   implicit none
   real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
   real, intent(out) :: d(256, 256, 10)
   integer :: z  ! index of the innermost loop, marked seq
 ! y and x index the two outer loops, both marked independent with vector(16).
   integer(4) :: y, x
 ! The kernels directive itself carries no data clauses.
 !$acc kernels
 !$acc loop independent vector(16)
   do y=1,256
 !$acc loop independent vector(16)
    do x=1,256
 !$acc loop seq
     do z=1,10
      d(x, y, z)= a(x, y, z)* b(x, y, z)
     end do
    end do
   end do
 !$acc end kernels
 end subroutine
 ! reduce_to_sum: result = sum of every element of a; this variant has no explicit reduction clause.
  subroutine reduce_to_sum(a, result)
   use openacc
   implicit none
   real, intent(in) :: a(256, 256, 10)
   real, intent(out) :: result
   integer :: z  ! index of the innermost loop, marked seq
 ! y and x index the two outer loops, both marked independent with vector(16).
   integer(4) :: y, x
 ! The accumulator is zeroed before the kernels region (double-precision literal assigned to a default real).
   result = 0.0d0
 ! Workaround from this thread: no reduction clause, so the compiler is left to detect the sum reduction itself.
 !$acc kernels
 !$acc loop independent vector(16)
   do y=1,256
 !$acc loop independent vector(16)
    do x=1,256
 !$acc loop seq
     do z=1,10
      result = result + a(x, y, z)
     end do
    end do
   end do
 !$acc end kernels
 end subroutine

 end module example

 program main
 ! Test driver for module example.
 ! Fills a with 1 and b with 2, calls data_region, then verifies
 !   - sum_c against 3 * 256 * 256 * 10 and sum_d against 2 * 256 * 256 * 10,
 !   - every element of c against 3 and every element of d against 2.
 ! Prints "test ok" on success; any failed check prints a message and stops with code 2.
 use example
 implicit none
 real, dimension(256, 256, 10) :: a, b, c, d
 real :: sum_c, sum_d, expected_sum
 integer :: x, y, z
 integer :: fail_x, fail_y, fail_z   ! indices of the first mismatching element; set only on failure
 logical :: test                     ! stays .true. until the first element mismatch is seen

 a(:,:,:) = 1.0d0
 b(:,:,:) = 2.0d0
 c(:,:,:) = 0.0d0
 d(:,:,:) = 0.0d0
 test = .true.

 call data_region(a, b, c, d, sum_c, sum_d)
 write(6,*) "calculation complete"

 ! The sums are checked first, so a broken reduction is reported before the element check runs.
 expected_sum = 3.0d0 * 256 * 256 * 10
 if ( abs(sum_c - expected_sum) > 1E-5 ) then
  write(6,*) "sum c failed: ", sum_c, "; expected: ", expected_sum
  stop 2
 end if

 expected_sum = 2.0d0 * 256 * 256 * 10
 if ( abs(sum_d - expected_sum) > 1E-5 ) then
  write(6,*) "sum d failed: ", sum_d, "; expected: ", expected_sum
  stop 2
 end if

 do y=1,256
  do x=1,256
   do z=1,10
    ! Record only the first mismatch in either array: once test is .false. nothing is overwritten.
    ! The logical is tested directly; comparing LOGICAL values with .EQ. is not standard Fortran.
    if (test .and. (c(x, y, z) /= 3.0d0 .or. d(x, y, z) /= 2.0d0)) then
     test = .false.
     fail_x = x
     fail_y = y
     fail_z = z
    end if
   end do
  end do
 end do

 if (test) then
  write(6,*) "test ok"
 else
  write(6,*) "test failed"
  write(6,*) "fails at", fail_x, fail_y, fail_z, "C:", c(fail_x, fail_y, fail_z), "D:", d(fail_x, fail_y, fail_z)
  stop 2
 end if

 stop
 end program main
% pgfortran -acc -Minfo=accel test_032715.1.f90; a.out
data_region:
     11, Generating enter data copyin(d(:,:,:),b(:,:,:),c(:,:,:),a(:,:,:))
     17, Generating exit data copyout(d(:,:,:),b(:,:,:),c(:,:,:),a(:,:,:))
add:
     41, Generating copyout(c(:,:,:))
         Generating copyin(a(:,:,:),b(:,:,:))
         Generating Tesla code
     43, Loop is parallelizable
     45, Loop is parallelizable
     47, Loop is parallelizable
         Accelerator kernel generated
         43, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
         45, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
mult:
     64, Generating copyout(d(:,:,:))
         Generating copyin(a(:,:,:),b(:,:,:))
         Generating Tesla code
     66, Loop is parallelizable
     68, Loop is parallelizable
     70, Loop is parallelizable
         Accelerator kernel generated
         66, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
         68, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
reduce_to_sum:
     89, Generating copyin(a(:,:,:))
         Generating Tesla code
     91, Loop is parallelizable
     93, Loop is parallelizable
     95, Loop is parallelizable
         Accelerator kernel generated
         91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
         93, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
         96, Sum reduction generated for result
 calculation complete
 test ok
Warning: ieee_inexact is signaling
FORTRAN STOP

Thanks Mat! Once again your workaround works.

TPR#21474 is resolved with PGI 20.1.