Hello,
I encountered some issues when calling an acc routine from within an acc kernels construct, and was able to create a reproducible example, which I include below. The routine performs a reduction, which the compiler correctly identifies. In short, I have observed two issues:
- If the routine call requires the creation of an array temporary, this causes runtime error 700 (illegal address during kernel execution).
- If the routine performs a vector reduction, the wrong answer is returned.
The routine behaves as desired if it is declared as sequential, called outside of a kernels construct, or written inline with the kernels code. Two of these options are undesirable for performance reasons; the third leads to unwieldy code.
Is any of this behavior expected? I am using nvfortran 24.7 with CUDA 12.2 on an A100 GPU.
! A minimal nvhpc error case
!"""
! Numerical formulae callable from OpenACC device code.
!
! Exposes two trapezoidal-rule integrators with identical arithmetic that
! differ only in their `!$acc routine` level of parallelism (seq vs vector).
!"""
module formulae
use, intrinsic :: iso_fortran_env, only: real64
implicit none
private
public :: integrate, integrate_vec
! Working precision of every real in this module. Kept private so that
! `use formulae` brings in nothing but the two integrators.
integer, parameter :: dp = real64
contains
!"""
! Integrate an equispaced grid, trapezoidal (sequential device routine)
!
! y  : sampled values; only y(1:n) is read
! dx : spacing between consecutive samples
! n  : number of leading samples of y to integrate
!
! Returns dx*(sum(y(1:n)) - (y(1)+y(n))/2). For n == 1 this is zero; for
! n < 1 there is no sample to read and zero is returned.
!"""
real(dp) function integrate(y,dx,n)
!$acc routine seq
implicit none
real(dp), intent(in) :: dx,y(:)
integer, intent(in) :: n
if (n < 1) then
! y(1) and y(n) do not exist for an empty grid; avoid reading out of bounds
integrate=0.0_dp
else
! Full sum minus half of each end point gives the trapezoidal weights
integrate=(sum(y(1:n))-(y(1)+y(n))*0.5_dp)*dx
end if
end function integrate
!"""
! Integrate an equispaced grid, trapezoidal (vector device routine)
!
! Same arguments, arithmetic and result as `integrate`; the only difference
! is the `routine vector` directive. The reduction is expressed through the
! `sum` intrinsic rather than an explicit `!$acc loop vector`.
!"""
real(dp) function integrate_vec(y,dx,n)
!$acc routine vector
implicit none
real(dp), intent(in) :: dx,y(:)
integer, intent(in) :: n
if (n < 1) then
! n is the same for every caller lane, so this branch is uniform
integrate_vec=0.0_dp
else
integrate_vec=(sum(y(1:n))-(y(1)+y(n))*0.5_dp)*dx
end if
end function integrate_vec
end module formulae
!"""
! Driver: runs each variant of the integration test in turn.
!
! The trailing comment on every call records the result reported for that
! variant; the last call is reported to abort with CUDA error 700.
!"""
program test
use openacc
implicit none
call test_cpu ! Returns 9999.99
call test_seq ! Returns 9999.99
call test_vec ! Returns 312.49
call test_inline !Returns 9999.99
call test_array_temporary ! error 700: Illegal address during kernel execution
end program test
!"""
! Reproducer: call the sequential device routine `integrate` inside a kernels
! construct with the array expression a*b as the actual argument.
!
! The driver program annotates this call with
! "error 700: Illegal address during kernel execution".
! NOTE(review): a*b has no named storage, so the call presumably needs a
! compiler-generated array temporary inside the device region -- confirm
! that this is what triggers the failure.
!"""
subroutine test_array_temporary
use, intrinsic :: iso_fortran_env, only: dp => real64, output_unit
use formulae
implicit none
real(dp),allocatable :: a(:),b(:)
real(dp) :: y,dx
integer :: n
! Integer literal: avoids the real-to-integer conversion of 1e6
n=1000000
! Kind-suffixed literal: a bare 0.01 is rounded to default real first
dx=0.01_dp
allocate(a(n),b(n))
! Device-only storage: a and b are created on the device, never copied in
!$acc enter data create(a,b)
!$acc kernels present(a,b)
a=1.0
b=1.0
y=integrate(a*b,dx,n)
!$acc end kernels
!$acc exit data delete(a,b)
deallocate(a,b)
write(output_unit,*) 'Test Array Temporary:', y
end subroutine test_array_temporary
!"""
! Reference device case: the product a*b is first stored in the named array
! c, then the sequential device routine `integrate` is called on c inside
! the kernels construct.
!
! With every sample equal to 1 the expected value is (n-1)*dx = 9999.99,
! which is the result the driver program reports for this variant.
!"""
subroutine test_seq
use, intrinsic :: iso_fortran_env, only: dp => real64, output_unit
use formulae
implicit none
real(dp),allocatable :: a(:),b(:),c(:)
real(dp) :: y,dx
integer :: n
! Integer literal: avoids the real-to-integer conversion of 1e6
n=1000000
! Kind-suffixed literal: a bare 0.01 is rounded to default real first
dx=0.01_dp
allocate(a(n),b(n),c(n))
! Device-only storage: the arrays are created on the device, never copied in
!$acc enter data create(a,b,c)
!$acc kernels present(a,b,c)
a=1.0
b=1.0
c=a*b
y=integrate(c,dx,n)
!$acc end kernels
!$acc exit data delete(a,b,c)
deallocate(a,b,c)
write(output_unit,*) 'test seq:', y
end subroutine test_seq
!"""
! Host-only baseline: same data and same call as the array-temporary case
! (the expression a*b is passed straight to `integrate`), but with no
! OpenACC directives in this subroutine.
!
! With every sample equal to 1 the expected value is (n-1)*dx = 9999.99.
!"""
subroutine test_cpu
use, intrinsic :: iso_fortran_env, only: dp => real64, output_unit
use formulae
implicit none
real(dp),allocatable :: a(:),b(:)
real(dp) :: y,dx
integer :: n
! Integer literal: avoids the real-to-integer conversion of 1e6
n=1000000
! Kind-suffixed literal: a bare 0.01 is rounded to default real first
dx=0.01_dp
allocate(a(n),b(n))
a=1.0_dp
b=1.0_dp
y=integrate(a*b,dx,n)
deallocate(a,b)
write(output_unit,*) 'test cpu:', y
end subroutine test_cpu
!"""
! Reproducer: identical to the sequential device case except that the
! vector-level routine `integrate_vec` is called inside the kernels
! construct.
!
! Expected value is (n-1)*dx = 9999.99; the driver program reports 312.49.
! NOTE(review): 312.49 equals (n/32 - 1)*dx, i.e. the value obtained when
! only every 32nd sample is summed. This suggests a single vector lane's
! partial sum is being returned -- confirm against the generated kernel.
!"""
subroutine test_vec
use, intrinsic :: iso_fortran_env, only: dp => real64, output_unit
use formulae
implicit none
real(dp),allocatable :: a(:),b(:),c(:)
real(dp) :: y,dx
integer :: n
! Integer literal: avoids the real-to-integer conversion of 1e6
n=1000000
! Kind-suffixed literal: a bare 0.01 is rounded to default real first
dx=0.01_dp
allocate(a(n),b(n),c(n))
! Device-only storage: the arrays are created on the device, never copied in
!$acc enter data create(a,b,c)
!$acc kernels present(a,b,c)
a=1.0
b=1.0
c=a*b
y=integrate_vec(c,dx,n)
!$acc end kernels
!$acc exit data delete(a,b,c)
deallocate(a,b,c)
write(output_unit,*) 'test vec:', y
end subroutine test_vec
!"""
! Inline device case: the trapezoidal formula is written directly inside the
! kernels construct instead of calling a routine from module `formulae`.
!
! With every sample equal to 1 the expected value is (n-1)*dx = 9999.99,
! which is the result the driver program reports for this variant.
!"""
subroutine test_inline
use, intrinsic :: iso_fortran_env, only: dp => real64, output_unit
use formulae
implicit none
real(dp),allocatable :: a(:),b(:),c(:)
real(dp) :: y,dx
integer :: n
! Integer literal: avoids the real-to-integer conversion of 1e6
n=1000000
! Kind-suffixed literal: a bare 0.01 is rounded to default real first
dx=0.01_dp
allocate(a(n),b(n),c(n))
! Device-only storage: the arrays are created on the device, never copied in
!$acc enter data create(a,b,c)
!$acc kernels present(a,b,c)
a=1.0
b=1.0
c=a*b
y=(sum(c)-0.5*(c(1)+c(n)))*dx
!$acc end kernels
!$acc exit data delete(a,b,c)
deallocate(a,b,c)
write(output_unit,*) 'test inline:', y
end subroutine test_inline