So I stumbled upon a problem with branches and kernels when porting a program to OpenACC. Am I getting this part of the spec correctly: A kernel may never be inside a branch (note: I’m not talking about jumping into the code of a kernel, just the whole kernel constructed being nested inside a branch)? I’ve tested the behavior with the following program and I’m getting an output that would suggest, that OpenACC indeed just ignores the branch conditions and executes both the ‘if’ and the ‘else’ kernels.
If that’s true, I have to ask here - why this limitation if you create synthetic CUDA C kernels anyways? Is there a workaround other than moving everything into separate subroutines?
Edit: One more thing: If my assumptions are correct, and the PGI Compiler just ignores branches in this case, I find this behaviour very dangerous. As it stands, this failure will only be detected in incorrect results (and lead to lengthy debugging in my case). I suggest that the compiler should fail when it detects a kernels/parallel construct inside an ‘if’ or ‘select’ branch.
Output CPU
calculation for c complete
c ok 1
d ok 1
calculation for d complete
d ok 2
c ok 2
Output GPU
calculation for c complete
c ok 1
d@ 1 1 1 : 2.000000
d failed 1
module example
contains
subroutine wrapper_inline_kernels(a, b, c, d, calc_switch)
logical, intent(in) :: calc_switch
real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
real, intent(out) :: c(256, 256, 10), d(256, 256, 10)
!$acc data copyin(a), copyout(c), copyin(b), copyout(d)
call inline_kernels (a(:, :, :), b(:, :, :), c(:, :, :), d(:, :, :), calc_switch)
!$acc end data
end subroutine
subroutine inline_kernels(a, b, c, d, calc_switch)
logical, intent(in) :: calc_switch
real, intent(in) :: a(256, 256, 10), b(256, 256, 10)
real, intent(out) :: c(256, 256, 10), d(256, 256, 10)
integer(4) :: y, x
!$acc data present(a), present(c), present(b), present(d)
if (calc_switch) then
!$acc kernels
!$acc loop independent vector(16)
do y=1,256
!$acc loop independent vector(16)
do x=1,256
!$acc loop seq
do z=1,10
c(x, y, z) = a(x, y, z) + b(x, y, z)
end do
end do
end do
!$acc end kernels
else
!$acc kernels
!$acc loop independent vector(16)
do y=1,256
!$acc loop independent vector(16)
do x=1,256
!$acc loop seq
do z=1,10
d(x, y, z) = a(x, y, z) * b(x, y, z)
end do
end do
end do
!$acc end kernels
end if
!$acc end data
end subroutine
end module example
program main
use example
real, dimension(256, 256, 10) :: a, b, c, d
integer :: x, y, z
logical test
a(:,:,:) = 1.0d0
b(:,:,:) = 2.0d0
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
test = .TRUE.
call wrapper_inline_kernels(a, b, c, d, .true.)
write(6,*) "calculation for c complete"
do y=1,256
do x=1,256
do z=1,10
if (test .EQ. .TRUE. .AND. c(x, y, z) .NE. 3.0d0) then
write(6,*) "c@", x, y, z, ": ", c(x, y, z)
test = .FALSE.
end if
end do
end do
end do
if (test .EQ. .TRUE.) then
write(6,*) "c ok 1"
else
write(6,*) "c failed 1"
stop 2
end if
do y=1,256
do x=1,256
do z=1,10
if (test .EQ. .TRUE. .AND. d(x, y, z) .NE. 0.0d0) then
write(6,*) "d@", x, y, z, ": ", d(x, y, z)
test = .FALSE.
end if
end do
end do
end do
if (test .EQ. .TRUE.) then
write(6,*) "d ok 1"
else
write(6,*) "d failed 1"
stop 2
end if
c(:,:,:) = 0.0d0
d(:,:,:) = 0.0d0
call wrapper_inline_kernels(a, b, c, d, .false.)
write(6,*) "calculation for d complete"
do y=1,256
do x=1,256
do z=1,10
if (test .EQ. .TRUE. .AND. d(x, y, z) .NE. 2.0d0) then
write(6,*) "d@", x, y, z, ": ", d(x, y, z)
test = .FALSE.
end if
end do
end do
end do
if (test .EQ. .TRUE.) then
write(6,*) "d ok 2"
else
write(6,*) "d failed 2"
stop 2
end if
do y=1,256
do x=1,256
do z=1,10
if (test .EQ. .TRUE. .AND. c(x, y, z) .NE. 0.0d0) then
write(6,*) "c@", x, y, z, ": ", c(x, y, z)
test = .FALSE.
end if
end do
end do
end do
if (test .EQ. .TRUE.) then
write(6,*) "c ok 2"
else
write(6,*) "c failed 2"
stop 2
end if
stop
end program main
[/code]