Reduction breaking in 15.3?

I have 14.7 and 15.3 installed on OSX. The following works in the former, but not in the latter version. In 14.7, the result is “test ok” while in 15.3 it will print “sum c failed: 0.000000 ; expected: 1966080”.

Update: I just checked the same on Linux with 14.7 and 15.1 - same result: 14.7 works, 15.1 sets the sum to 0.

``````
module example
contains

!> Stage the four arrays on the device, run the element-wise kernels and the
!> two reductions, then bring the results back to the host.
!>
!> Arguments:
!>   a, b         - input operands (read only).
!>   c, d         - output arrays handed to `run`; copied back to the host by
!>                  the `exit data` directive below.
!>   sum_c, sum_d - receive the sums produced by `reduce_to_sum` for c and d.
subroutine data_region(a, b, c, d, sum_c, sum_d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)
  real, intent(out) :: sum_c, sum_d

  ! The directive sentinel has to be spelled exactly "!$acc"; any other
  ! spelling turns the line into an ordinary comment and nothing is offloaded.
  !$acc enter data copyin(a), copyin(c), copyin(b), copyin(d)

  call run(a, b, c, d)
  call reduce_to_sum(c, sum_c)
  call reduce_to_sum(d, sum_d)

  ! Inputs are dropped from the device; only the outputs are transferred back.
  !$acc exit data delete(a), copyout(c), delete(b), copyout(d)
end subroutine data_region

!> Fill both output arrays: c receives the element-wise sum of a and b,
!> d receives their element-wise product.
!>
!> Arguments:
!>   a, b - input operands.
!>   c    - output, set by `add`.
!>   d    - output, set by `mult`.
subroutine run(a, b, c, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)

  ! Without this call c is never assigned although it is intent(out).
  call add (a, b, c)
  call mult (a, b, d)

end subroutine run

!> Element-wise sum: c(x, y, z) = a(x, y, z) + b(x, y, z).
!>
!> NOTE(review): the subroutine statement was missing from the posted listing;
!> the name `add` is reconstructed - confirm against the original source.
!>
!> Arguments:
!>   a, b - input operands.
!>   c    - output array receiving the sums.
subroutine add(a, b, c)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  integer :: z

  integer(4) :: y, x

  ! y and x are marked independent and vectorised; z is kept sequential.
  !$acc kernels
  !$acc loop independent vector(16)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        c(x, y, z) = a(x, y, z) + b(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine add

!> Element-wise product: d(x, y, z) = a(x, y, z) * b(x, y, z).
!>
!> Arguments:
!>   a, b - input operands.
!>   d    - output array receiving the products.
subroutine mult(a, b, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: d(256, 256, 10)
  integer :: z

  integer(4) :: y, x

  ! The sentinel must read "!$acc" for these lines to be directives.
  ! y and x are marked independent and vectorised; z is kept sequential.
  !$acc kernels
  !$acc loop independent vector(16)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        d(x, y, z) = a(x, y, z) * b(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine mult

!> Sum every element of a into result.
!>
!> This variant states the reduction explicitly with a `reduction(+: result)`
!> clause on the outermost loop.
!>
!> Arguments:
!>   a      - array to be summed.
!>   result - receives the sum of all elements of a.
subroutine reduce_to_sum(a, result)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10)
  real, intent(out) :: result
  integer :: z

  integer(4) :: y, x

  ! result is default real, so initialise it with a default-real literal.
  result = 0.0

  ! The sentinel must read "!$acc" for these lines to be directives.
  !$acc kernels
  !$acc loop independent vector(16) reduction(+: result)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        result = result + a(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine reduce_to_sum

end module example

!> Driver for the reproducer: fills a with 1 and b with 2, calls
!> `data_region`, then checks the two sums and every element of c and d.
!> Stops with code 2 on the first failed check.
program main
  use example
  implicit none
  real, dimension(256, 256, 10) :: a, b, c, d
  real :: sum_c, sum_d, expected_sum
  integer :: x, y, z
  integer :: fail_x, fail_y, fail_z
  logical :: test

  a(:,:,:) = 1.0d0
  b(:,:,:) = 2.0d0
  c(:,:,:) = 0.0d0
  d(:,:,:) = 0.0d0
  test = .TRUE.

  call data_region(a, b, c, d, sum_c, sum_d)
  write(6,*) "calculation complete"

  ! c is expected to hold a + b = 3 in each of its 256*256*10 elements.
  expected_sum = 3.0d0 * 256 * 256 * 10
  if ( abs(sum_c - expected_sum) > 1E-5 ) then
    write(6,*) "sum c failed: ", sum_c, "; expected: ", expected_sum
    stop 2
  end if

  ! d is expected to hold a * b = 2 in each element.
  expected_sum = 2.0d0 * 256 * 256 * 10
  if ( abs(sum_d - expected_sum) > 1E-5 ) then
    write(6,*) "sum d failed: ", sum_d, "; expected: ", expected_sum
    stop 2
  end if

  ! Remember only the first mismatching element. The logical flag is tested
  ! directly: relational operators such as .EQ. are not defined for logical
  ! operands in standard Fortran.
  do y = 1, 256
    do x = 1, 256
      do z = 1, 10
        if (test .and. c(x, y, z) /= 3.0d0) then
          test = .FALSE.
          fail_x = x
          fail_y = y
          fail_z = z
        end if
        if (test .and. d(x, y, z) /= 2.0d0) then
          test = .FALSE.
          fail_x = x
          fail_y = y
          fail_z = z
        end if
      end do
    end do
  end do
  if (test) then
    write(6,*) "test ok"
  else
    write(6,*) "test failed"
    write(6,*) "fails at", fail_x, fail_y, fail_z, "C:", c(fail_x, fail_y, fail_z), "D:", d(fail_x, fail_y, fail_z)
    stop 2
  end if

  stop
end program main
``````

[/code]

Hi Michel,

The example “works” in 14.7 only because we think there’s a dependency on “result” given it has intent out. Hence only a scalar kernel is being generated.

``````
reduce_to_sum:
89, Generating present_or_copyin(a(:,:,:))
Generating Tesla code
91, Accelerator restriction: scalar variable live-out from loop: result
93, Accelerator restriction: scalar variable live-out from loop: result
95, Loop is parallelizable
Accelerator kernel generated
``````

Once we fixed the dependency issue and generate the parallel kernel, it exposes the problem with the reduction. I can recreate the problem in 14.7 if I use a local variable instead of the argument.

I have added a problem report (TPR#21474) and sent it to engineering for further investigation. The workaround is to remove the “reduction” clause and instead let the compiler discover the reduction. It seems to create the correct results in this case.

Thanks!
Mat

``````
% cat test_032715.1.f90
module example
contains

!> Stage the four arrays on the device, run the element-wise kernels and the
!> two reductions, then bring the results back to the host.
!>
!> Arguments:
!>   a, b         - input operands (read only).
!>   c, d         - output arrays handed to `run`; copied back to the host by
!>                  the `exit data` directive below.
!>   sum_c, sum_d - receive the sums produced by `reduce_to_sum` for c and d.
subroutine data_region(a, b, c, d, sum_c, sum_d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)
  real, intent(out) :: sum_c, sum_d

  ! The directive sentinel has to be spelled exactly "!$acc"; any other
  ! spelling turns the line into an ordinary comment and nothing is offloaded.
  !$acc enter data copyin(a), copyin(c), copyin(b), copyin(d)

  call run(a, b, c, d)
  call reduce_to_sum(c, sum_c)
  call reduce_to_sum(d, sum_d)

  ! Inputs are dropped from the device; only the outputs are transferred back.
  !$acc exit data delete(a), copyout(c), delete(b), copyout(d)
end subroutine data_region

!> Fill both output arrays: c receives the element-wise sum of a and b,
!> d receives their element-wise product.
!>
!> Arguments:
!>   a, b - input operands.
!>   c    - output, set by `add`.
!>   d    - output, set by `mult`.
subroutine run(a, b, c, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10), d(256, 256, 10)

  ! Without this call c is never assigned although it is intent(out).
  call add (a, b, c)
  call mult (a, b, d)

end subroutine run

!> Element-wise sum: c(x, y, z) = a(x, y, z) + b(x, y, z).
!>
!> NOTE(review): the subroutine statement was missing from the posted listing;
!> the name `add` is reconstructed - confirm against the original source.
!>
!> Arguments:
!>   a, b - input operands.
!>   c    - output array receiving the sums.
subroutine add(a, b, c)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: c(256, 256, 10)
  integer :: z

  integer(4) :: y, x

  ! y and x are marked independent and vectorised; z is kept sequential.
  !$acc kernels
  !$acc loop independent vector(16)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        c(x, y, z) = a(x, y, z) + b(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine add

!> Element-wise product: d(x, y, z) = a(x, y, z) * b(x, y, z).
!>
!> Arguments:
!>   a, b - input operands.
!>   d    - output array receiving the products.
subroutine mult(a, b, d)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
  real, intent(out) :: d(256, 256, 10)
  integer :: z

  integer(4) :: y, x

  ! The sentinel must read "!$acc" for these lines to be directives.
  ! y and x are marked independent and vectorised; z is kept sequential.
  !$acc kernels
  !$acc loop independent vector(16)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        d(x, y, z) = a(x, y, z) * b(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine mult

!> Sum every element of a into result.
!>
!> This variant carries no explicit reduction clause; recognising the
!> accumulation into result as a sum reduction is left to the compiler.
!>
!> Arguments:
!>   a      - array to be summed.
!>   result - receives the sum of all elements of a.
subroutine reduce_to_sum(a, result)
  use openacc
  implicit none
  real, intent(in) :: a(256, 256, 10)
  real, intent(out) :: result
  integer :: z

  integer(4) :: y, x

  ! result is default real, so initialise it with a default-real literal.
  result = 0.0

  ! The sentinel must read "!$acc" for these lines to be directives.
  !$acc kernels
  !$acc loop independent vector(16)
  do y = 1, 256
    !$acc loop independent vector(16)
    do x = 1, 256
      !$acc loop seq
      do z = 1, 10
        result = result + a(x, y, z)
      end do
    end do
  end do
  !$acc end kernels
end subroutine reduce_to_sum

end module example

!> Driver for the reproducer: fills a with 1 and b with 2, calls
!> `data_region`, then checks the two sums and every element of c and d.
!> Stops with code 2 on the first failed check.
program main
  use example
  implicit none
  real, dimension(256, 256, 10) :: a, b, c, d
  real :: sum_c, sum_d, expected_sum
  integer :: x, y, z
  integer :: fail_x, fail_y, fail_z
  logical :: test

  a(:,:,:) = 1.0d0
  b(:,:,:) = 2.0d0
  c(:,:,:) = 0.0d0
  d(:,:,:) = 0.0d0
  test = .TRUE.

  call data_region(a, b, c, d, sum_c, sum_d)
  write(6,*) "calculation complete"

  ! c is expected to hold a + b = 3 in each of its 256*256*10 elements.
  expected_sum = 3.0d0 * 256 * 256 * 10
  if ( abs(sum_c - expected_sum) > 1E-5 ) then
    write(6,*) "sum c failed: ", sum_c, "; expected: ", expected_sum
    stop 2
  end if

  ! d is expected to hold a * b = 2 in each element.
  expected_sum = 2.0d0 * 256 * 256 * 10
  if ( abs(sum_d - expected_sum) > 1E-5 ) then
    write(6,*) "sum d failed: ", sum_d, "; expected: ", expected_sum
    stop 2
  end if

  ! Remember only the first mismatching element. The logical flag is tested
  ! directly: relational operators such as .EQ. are not defined for logical
  ! operands in standard Fortran.
  do y = 1, 256
    do x = 1, 256
      do z = 1, 10
        if (test .and. c(x, y, z) /= 3.0d0) then
          test = .FALSE.
          fail_x = x
          fail_y = y
          fail_z = z
        end if
        if (test .and. d(x, y, z) /= 2.0d0) then
          test = .FALSE.
          fail_x = x
          fail_y = y
          fail_z = z
        end if
      end do
    end do
  end do
  if (test) then
    write(6,*) "test ok"
  else
    write(6,*) "test failed"
    write(6,*) "fails at", fail_x, fail_y, fail_z, "C:", c(fail_x, fail_y, fail_z), "D:", d(fail_x, fail_y, fail_z)
    stop 2
  end if

  stop
end program main
% pgfortran -acc -Minfo=accel test_032715.1.f90; a.out
data_region:
11, Generating enter data copyin(d(:,:,:),b(:,:,:),c(:,:,:),a(:,:,:))
17, Generating exit data copyout(d(:,:,:),b(:,:,:),c(:,:,:),a(:,:,:))
add:
41, Generating copyout(c(:,:,:))
Generating copyin(a(:,:,:),b(:,:,:))
Generating Tesla code
43, Loop is parallelizable
45, Loop is parallelizable
47, Loop is parallelizable
Accelerator kernel generated
43, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
45, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
mult:
64, Generating copyout(d(:,:,:))
Generating copyin(a(:,:,:),b(:,:,:))
Generating Tesla code
66, Loop is parallelizable
68, Loop is parallelizable
70, Loop is parallelizable
Accelerator kernel generated
66, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
68, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
reduce_to_sum:
89, Generating copyin(a(:,:,:))
Generating Tesla code
91, Loop is parallelizable
93, Loop is parallelizable
95, Loop is parallelizable
Accelerator kernel generated
91, !$acc loop gang, vector(16) ! blockidx%y threadidx%y
93, !$acc loop gang, vector(16) ! blockidx%x threadidx%x
96, Sum reduction generated for result
calculation complete
test ok
Warning: ieee_inexact is signaling
FORTRAN STOP
``````

Thanks Mat! Once again your workaround works.

TPR#21474 is resolved with PGI 20.1