I am experiencing a performance problem in my application when using an automatic array in a device subroutine declared with the “!$acc routine seq” directive.
The automatic array is allocated in heap (global) memory using malloc, which results in interthread locking and significant performance degradation. Even when using the “-Mstack_arrays” compiler option, malloc is still being used.
Please find below an example program:
main.F90
program main
  ! Timing driver: runs an OpenACC parallel loop nitr times, calling oneD on
  ! every column of x, and accumulates the wall-clock time measured around
  ! each parallel region.
  use omp_lib, only: omp_get_wtime
  use sub, only: RP, oneD
  implicit none

  integer, parameter :: nitr = 100   ! number of timed repetitions
  integer :: KA, IA                  ! extents of the first / second dimension
  real(RP), allocatable :: x(:,:), y(:,:)
  real(8) :: elapsed, t_begin, t_end
  integer :: col, rep

  KA = 32
  IA = 10000
  allocate(x(KA,IA), y(KA,IA))
  x = 1.0_RP

  ! Data region: x is listed as copyin, y as copyout.
  !$acc data copyin(x) copyout(y)
  elapsed = 0.0d0
  do rep = 1, nitr
    t_begin = omp_get_wtime()
    !$acc parallel
    !$acc loop independent
    do col = 1, IA
      call oneD(KA, x(:,col), y(:,col))
    end do
    !$acc end parallel
    t_end = omp_get_wtime()
    elapsed = elapsed + (t_end - t_begin)
  end do
  !$acc end data

  ! oneD copies its input column to its output column and x is all ones,
  ! so the summed absolute deviation of y from 1 is expected to be zero.
  write(*,*) "error: ", sum(abs(y - 1.0_RP))
  write(*,*) "TIME: ", elapsed
end program main
sub.F90
module sub
  implicit none
  integer, parameter :: RP = 8   ! kind value used for all reals in this example
contains
  ! Copy X into Y by way of a local automatic array.
  !   KA : number of elements to copy (passed by value)
  !   X  : input vector of length KA
  !   Y  : output vector of length KA; on return Y(k) = X(k) for k = 1..KA
  subroutine oneD(KA, X, Y)
    !$acc routine seq nohost
    implicit none
    integer,  intent(in), value :: KA
    real(RP), intent(in)        :: X(KA)
    real(RP), intent(out)       :: Y(KA)
    real(RP) :: work(KA)   ! automatic array whose size is set by KA at call time
    integer  :: idx

    ! Stage the input in the scratch array ...
    do idx = 1, KA
      work(idx) = X(idx)
    end do

    ! ... then write the staged values to the output.
    do idx = 1, KA
      Y(idx) = work(idx)
    end do
  end subroutine oneD
end module sub
This program takes about 0.67 seconds to execute.
I have found that if I use alloca instead of malloc to allocate the automatic array in the stack, the execution time is much shorter (about 0.0027 seconds).
sub_alloca.cu
extern "C" {
// Copies ka doubles from x to y through a scratch buffer obtained from alloca().
//   ka : element count
//   x  : source, read at indices 0 .. ka-1
//   y  : destination, written at indices 0 .. ka-1
__device__ void sub_oned_(int ka, double *x, double *y)
{
    double *scratch = (double*) alloca(ka * sizeof(double));

    // Stage the input in the scratch buffer.
    for (int idx = 0; idx < ka; ++idx) {
        scratch[idx] = x[idx];
    }

    // Write the staged values to the output.
    for (int idx = 0; idx < ka; ++idx) {
        y[idx] = scratch[idx];
    }
}
}
My question is: do you plan to use alloca for automatic arrays in device routines when the “-Mstack_arrays” option is specified?
Thank you.