Dear PGI support,
While translating some of my PGI acc test codes to Openacc I have found some strong performance decrease (x5 slower). After investigation, it seems to be related to the present directive.
Here is the original PGI acc code:
! test programme PGI acc
module data_field
implicit none
real*8, allocatable :: a(:),b(:)
!$acc mirror(a,b)
END module data_field
module work_array
implicit none
real*8, allocatable :: a1(:), a2(:), a3(:), a4(:), a5(:), a6(:)
real*8, allocatable :: zparam(:)
!$acc mirror(a1,a2,a3,a4,a5,a6)
!$acc mirror(zparam)
END module work_array
module computation
implicit none
contains
subroutine gpu_routine(nvec,a,b)
USE work_array
integer, intent(in) :: nvec
real*8, intent(inout) :: a(nvec)
real*8, intent(in) :: b(nvec)
integer :: i,k, iparam, il
!$acc reflected(a,b)
!$acc region
DO iparam=1,8
zparam(iparam)=0.1D0*iparam
END DO
!$acc end region
!$acc region
Do i=1,nvec
a1(i)=0.1D0*(1.0D0+1.0D0/i)
a2(i)=0.2D0*(1.0D0+1.0D0/i)
a3(i)=0.3D0*(1.0D0+1.0D0/i)
a4(i)=0.4D0*(1.0D0+1.0D0/i)
a5(i)=0.5D0*(1.0D0+1.0D0/i)
a6(i)=0.6D0*(1.0D0+1.0D0/i)
END do
!$acc end region
!$acc region do kernel
do i=1,nvec
do iparam=1,8 ! just to imitate many operations
a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
end do
end do !i
!$acc end region
end subroutine gpu_routine
end module computation
program main
USE data_field, only: a,b
USE work_array
USE computation, only: gpu_routine
implicit none
integer :: n1, n2
integer :: nargs,i,j,k,nt, dummy(20), niter
character*10 arg
integer :: nvec,nblock
real*8 :: rt
INTEGER :: icountnew, icountold, icountrate, icountmax
INTEGER :: z_sync(2) !use for synchronization
!$acc local(z_sync)
nargs = command_argument_count()
niter=10
if( nargs == 2 ) then
call getarg( 1, arg )
read(arg,'(i)') n1
call getarg( 2, arg )
read(arg,'(i)') n2
else
stop('usage ./test n1 n2')
endif
nvec=n1*n2
allocate(a(nvec),b(nvec))
allocate(a1(nvec), a2(nvec), a3(nvec), a4(nvec), a5(nvec), a6(nvec))
allocate(zparam(8))
z_sync(:)=1
!$acc region
do i=1,nvec
a(i)=0.0D0
b(i)=0.1D0
end do
!$acc end region
!$acc update device(z_sync)
CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
do nt=1,niter
call gpu_routine(nvec,a,b)
end do
!$acc update device(z_sync)
CALL SYSTEM_CLOCK(COUNT=icountnew)
!$acc update host(a)
rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )
DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)
end program main
and the Open Acc version:
! test programme OpenACC
module data_field
implicit none
real*8, allocatable :: a(:),b(:)
END module data_field
module work_array
implicit none
real*8, allocatable :: a1(:), a2(:), a3(:), a4(:), a5(:), a6(:)
real*8, allocatable :: zparam(:)
END module work_array
module computation
implicit none
contains
subroutine gpu_routine(nvec,a,b)
USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
integer, intent(in) :: nvec
real*8, intent(inout) :: a(nvec)
real*8, intent(in) :: b(nvec)
integer :: i, iparam
!$acc data present(a,b) &
!$acc& present(a1,a2,a3,a4,a5,a6) &
!$acc& present(zparam)
!$acc kernels
DO iparam=1,8
zparam(iparam)=0.1D0*iparam
END DO
!$acc end kernels
!$acc kernels
Do i=1,nvec
a1(i)=0.1D0*(1.0D0+1.0D0/i)
a2(i)=0.2D0*(1.0D0+1.0D0/i)
a3(i)=0.3D0*(1.0D0+1.0D0/i)
a4(i)=0.4D0*(1.0D0+1.0D0/i)
a5(i)=0.5D0*(1.0D0+1.0D0/i)
a6(i)=0.6D0*(1.0D0+1.0D0/i)
END do
!$acc end kernels
!$acc kernels loop
do i=1,nvec
do iparam=1,8 ! just to imitate several operations
a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
end do
end do !i
!$acc end kernels
!$acc end data
end subroutine gpu_routine
end module computation
program main
USE data_field, only: a,b
USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
USE computation, only: gpu_routine
implicit none
integer :: n1,n2
integer :: nargs,i,j,k,nt, niter
character*10 arg
integer :: nvec,nblock
real*8 :: rt
INTEGER :: icountnew, icountold, icountrate, icountmax
INTEGER :: z_sync(2) !use for synchronization
nargs = command_argument_count()
niter=10
if( nargs == 2 ) then
call getarg( 1, arg )
read(arg,'(i)') n1
call getarg( 2, arg )
read(arg,'(i)') n2
else
stop('usage ./test n1 n2')
endif
nvec=n1*n2
allocate(a(nvec),b(nvec))
allocate(a1(nvec), a2(nvec), a3(nvec),a4(nvec),a5(nvec),a6(nvec))
allocate(zparam(8))
z_sync(:)=1
!$acc data create(z_sync) &
!$acc& create(a,b) &
!$acc& create(a1,a2,a3,a4,a5,a6,zparam)
!$acc kernels
do i=1,nvec
a(i)=0.0D0
b(i)=0.1D0
end do
!$acc end kernels
!$acc update device(z_sync)
CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
do nt=1,niter
call gpu_routine(nvec,a,b)
end do
!$acc update device(z_sync)
CALL SYSTEM_CLOCK(COUNT=icountnew)
!$acc update host(a)
rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )
DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)
!$acc end data
end program main
If I now compile and run the two codes I get:
pgf90 -ta=nvidia -o test test.f90
./test_automatic_array 100 100
n1 = 100 n2= 100 12313.39881122256 2
time/step= 0.10040 ms
pgf90 -ta=nvidia -o test_openacc test_openacc.f90
./test_openacc 100 100
n1 = 100 n2= 100 12313.39881122256 2
time/step= 0.56890 ms
I have investigated the problem with the nvidia profilier, and I have seen in the OpenAcc version that there are several memcpyHtoD between the three kernels in subroutine “gpu_routine”.
This seems incorrect as all array involved at this point should be already on the GPU (unless I am doing something wrong with the data region in the main program.)
Best regards,
Xavier
PGI version: 12.4