OpenACC: Problem with present directive and module array

Dear PGI support,

While translating some of my PGI Accelerator test codes to OpenACC, I have noticed a significant performance decrease (about 5x slower). After investigation, it appears to be related to the present directive.

Here is the original PGI Accelerator code:


! test programme PGI acc
module data_field
  implicit none
  real*8, allocatable :: a(:),b(:)
  !$acc mirror(a,b)
END module data_field

module work_array
  implicit none
  real*8, allocatable :: a1(:), a2(:),  a3(:), a4(:), a5(:),  a6(:)
  real*8, allocatable :: zparam(:)
  !$acc mirror(a1,a2,a3,a4,a5,a6)
  !$acc mirror(zparam)
END module work_array

module computation
  implicit none
  
contains

subroutine gpu_routine(nvec,a,b)
  USE work_array
  integer, intent(in) :: nvec
  real*8, intent(inout) :: a(nvec)
  real*8, intent(in) :: b(nvec)
  integer :: i,k, iparam, il
  !$acc reflected(a,b)  
  

  !$acc region
  DO iparam=1,8
     zparam(iparam)=0.1D0*iparam
  END DO
  !$acc end region


  !$acc region 
  Do i=1,nvec       
     a1(i)=0.1D0*(1.0D0+1.0D0/i)
     a2(i)=0.2D0*(1.0D0+1.0D0/i)
     a3(i)=0.3D0*(1.0D0+1.0D0/i)
     a4(i)=0.4D0*(1.0D0+1.0D0/i)
     a5(i)=0.5D0*(1.0D0+1.0D0/i)
     a6(i)=0.6D0*(1.0D0+1.0D0/i)
  END do
  !$acc end region

  !$acc region do kernel
  do i=1,nvec       
     do iparam=1,8 ! just to imitate many operations
        a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
     end do
  end do !i
  !$acc end region

end subroutine gpu_routine

end module computation
  
  


program main
  USE data_field, only: a,b
  USE work_array
  USE computation, only: gpu_routine
  implicit none
  integer :: n1, n2
  integer :: nargs,i,j,k,nt, dummy(20), niter
  character*10 arg
  integer :: nvec,nblock
  real*8 :: rt
  INTEGER ::  icountnew, icountold, icountrate, icountmax
  INTEGER :: z_sync(2) !use for synchronization
  !$acc local(z_sync)

  nargs = command_argument_count()
  niter=10

  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,'(i)') n1
     call getarg( 2, arg )
     read(arg,'(i)') n2
  else
     stop('usage ./test n1 n2') 
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec),b(nvec))
 allocate(a1(nvec), a2(nvec),  a3(nvec), a4(nvec), a5(nvec),  a6(nvec))
 allocate(zparam(8))
 z_sync(:)=1


 !$acc region
 do i=1,nvec
    a(i)=0.0D0
    b(i)=0.1D0
 end do
 !$acc end region

 
 !$acc update device(z_sync)
 CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
 
 do nt=1,niter
       call gpu_routine(nvec,a,b)
 end do

 !$acc update device(z_sync)
  CALL SYSTEM_CLOCK(COUNT=icountnew)
  !$acc update host(a)

rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
 print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
 write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )

 DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)


end program main

and the OpenACC version:

! test programme OpenACC
module data_field
  implicit none
  real*8, allocatable :: a(:),b(:) 
END module data_field

module work_array
  implicit none
  real*8, allocatable :: a1(:), a2(:),  a3(:), a4(:), a5(:),  a6(:)
  real*8, allocatable :: zparam(:) 
END module work_array

module computation
  implicit none  
contains

subroutine gpu_routine(nvec,a,b)
  USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
  integer, intent(in) :: nvec
  real*8, intent(inout) :: a(nvec)
  real*8, intent(in) :: b(nvec)
  integer :: i, iparam

  !$acc data present(a,b) &
  !$acc& present(a1,a2,a3,a4,a5,a6) &
  !$acc& present(zparam)
  
  !$acc kernels
  DO iparam=1,8
     zparam(iparam)=0.1D0*iparam
  END DO
  !$acc end kernels
 

  !$acc kernels 
  Do i=1,nvec       
     a1(i)=0.1D0*(1.0D0+1.0D0/i)
     a2(i)=0.2D0*(1.0D0+1.0D0/i)
     a3(i)=0.3D0*(1.0D0+1.0D0/i)
     a4(i)=0.4D0*(1.0D0+1.0D0/i)
     a5(i)=0.5D0*(1.0D0+1.0D0/i)
     a6(i)=0.6D0*(1.0D0+1.0D0/i)
  END do
  !$acc end kernels

  !$acc kernels loop
  do i=1,nvec       
     do iparam=1,8 ! just to imitate several operations
        a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
     end do
  end do !i
  !$acc end kernels

  !$acc end data
 
end subroutine gpu_routine

end module computation
  
program main
  USE data_field, only: a,b
  USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
  USE computation, only: gpu_routine
  implicit none
  integer :: n1,n2
  integer :: nargs,i,j,k,nt, niter
  character*10 arg
  integer :: nvec,nblock
  real*8 :: rt
  INTEGER ::  icountnew, icountold, icountrate, icountmax
  INTEGER :: z_sync(2) !use for synchronization

  nargs = command_argument_count()
  niter=10
  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,'(i)') n1
     call getarg( 2, arg )
     read(arg,'(i)') n2
  else
     stop('usage ./test n1 n2') 
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec),b(nvec))
 allocate(a1(nvec), a2(nvec), a3(nvec),a4(nvec),a5(nvec),a6(nvec))
 allocate(zparam(8))
 z_sync(:)=1

!$acc data create(z_sync) &
!$acc& create(a,b) &
!$acc& create(a1,a2,a3,a4,a5,a6,zparam)

 !$acc kernels
 do i=1,nvec
    a(i)=0.0D0
    b(i)=0.1D0
 end do
 !$acc end kernels
 
 !$acc update device(z_sync)
 CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
 
 do nt=1,niter
       call gpu_routine(nvec,a,b)
 end do

 !$acc update device(z_sync)
  CALL SYSTEM_CLOCK(COUNT=icountnew)
  !$acc update host(a)


 rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
 print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
  write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )

 DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)
  !$acc end data

end program main

If I now compile and run the two codes I get:

pgf90 -ta=nvidia -o test test.f90

./test_automatic_array 100 100
n1 = 100 n2= 100 12313.39881122256 2
time/step= 0.10040 ms

pgf90 -ta=nvidia -o test_openacc test_openacc.f90
./test_openacc 100 100
n1 = 100 n2= 100 12313.39881122256 2
time/step= 0.56890 ms


I have investigated the problem with the NVIDIA profiler, and in the OpenACC version I see several memcpyHtoD transfers between the three kernels in subroutine “gpu_routine”.
This seems incorrect, as all arrays involved at this point should already be on the GPU (unless I am doing something wrong with the data region in the main program).


Best regards,

Xavier

PGI version: 12.4

Hi Xavier,

As you know, our support for OpenACC is still in development, so your feedback is very much appreciated. In this case, what’s happening is that the section descriptors for these arrays are being copied over to the GPU each time the present clause is encountered. Michael is aware of the issue and will have his team increase the priority of removing the need to copy over the section descriptors. This work had been scheduled for later this year.

Note that the better OpenACC equivalent of “mirror” is the “device_resident” clause, which is currently scheduled to be available in 12.6.
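
Once that clause is available, the “data_field” module in your mirrored version could look roughly like this (just a sketch, untested since device_resident is not in the current release):

module data_field
  implicit none
  real*8, allocatable :: a(:),b(:)
  ! device-resident module arrays, analogous to the old mirror directive
  !$acc declare device_resident(a,b)
END module data_field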

Best Regards,
Mat

Hi Mat,

OK, thanks for the info. I’ll wait then before testing OpenACC further.

Best regards,

Xavier

I’ll wait then before testing OpenACC further.

Please do continue testing since we need the feedback as we implement the full standard.

Thanks!
Mat

Hi Mat,

After your last comment I also went back and analysed the NVIDIA profiler output for some of my other PGI Accelerator codes (not OpenACC).

In several subroutines I have also found many memcpyHtoD transfers between kernels, although the arrays used there are embedded in a data region and passed using the reflected keyword (and I don’t get any copy messages from -Minfo).

Actually, in one subroutine (which is made of a few different kernels), memcpyHtoD accounts for half of the total time (looking at the CPU time column).

What do these memcpyHtoD correspond to? Are they also related to section descriptors, as with OpenACC, or could they be related to scalar parameters required in the kernel?

Is there a way to get a list of what is being sent to the device? (-Minfo apparently says nothing about it.)

Thanks,

Xavier

Hi Xavier,

What do these memcpyHtoD correspond to? Are they also related to section descriptors, as with OpenACC, or could they be related to scalar parameters required in the kernel?

I doubt that it’s section descriptors, since those are due to how “present” is currently implemented in the OpenACC API. Also, kernel parameters are copied as a struct argument; however, it’s my understanding that the time it takes to copy the parameters is included in the kernel time and not broken out as “memcpyHtoD” in the CUDA profile.

My best guess is that there are some extra arrays being copied that are not part of a data region.

Is there a way to get a list of what is being sent to the device? (-Minfo apparently says nothing about it.)

Try setting the environment variable “NVDEBUG” to 1. This shows all calls made to the CUDA runtime, including memory copies, and it also shows the names of the variables being copied. You can tell it’s a section descriptor if the variable name is followed by “$sd”.
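
For example, using the test program from earlier in this thread (assuming a bash-like shell):

NVDEBUG=1 ./test_openacc 100 100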

  • Mat

Hi Mat,

Thanks for the tip. I ran with NVDEBUG=1. Here is the output between two kernels where I don’t expect any data transfer.

__pgi_cu_init( file=/project/s83/lapixa/COSMO_ICON_4.18_GPU_dev/src/soil_multilay.f90, function=terra_multlay, line=2164, startline=942, endline=5165 )
__pgi_cu_module3( lineno=2164 )
__pgi_cu_module3 module loaded at 0x7425770
__pgi_cu_module_function( name=0x117e4d0=terra_multlay_2165_gpu, lineno=2165, argname=0x0=, argsize=84, SWcachesize=0 )
Function handle is 0x74264a0
__pgi_cu_launch_a(func=0x74264a0, grid=19x1x1, block=256x1x1, lineno=2165)
__pgi_cu_launch_a(func=0x74264a0, params=0x7ffffb8e0150, bytes=84, sharedbytes=0)
First arguments are:
                          [ ... ]
__pgi_cu_close()
__pgi_cu_init( file=/project/s83/lapixa/COSMO_ICON_4.18_GPU_dev/src/soil_multilay.f90, function=terra_multlay, line=2183, startline=942, endline=5165 )
__pgi_cu_module3( lineno=2183 )
__pgi_cu_module3 module loaded at 0x7427690
__pgi_cu_module_function( name=0x11829c0=terra_multlay_2184_gpu, lineno=2184, argname=0x11829d7=a12, argsize=736, SWcachesize=0 )
Function handle is 0x7439740
__pgi_cu_uploadc( "a12", size=736, offset=0, lineno=2184 )
constant data a12 at address 0x20202200 devsize=736, size=736, offset=0
First arguments are:

                   [ ... ]    

  __pgi_cu_launch_a(func=0x7439740, grid=19x1x1, block=256x1x1, lineno=2184)
__pgi_cu_launch_a(func=0x7439740, params=0x7ffffb8e0150, bytes=0, sharedbytes=0)
__pgi_cu_close()

The NVIDIA profiler at this stage shows:

Timestamp      Method                    GPU time (us)   CPU time (us)
1.52021e+06    terra_multlay_2165_gpu     9.28            27
1.52062e+06    memcpyHtoD                 1.184           45
1.5207e+06     terra_multlay_2184_gpu    25.152           33.152

I suppose the unexpected memcpyHtoD here corresponds to the “__pgi_cu_uploadc( “a12”, size=736, offset=0, lineno=2184 )” call.
This a12 variable is apparently generated by the compiler. I suppose it may be related to some parameters. Would there be a way to place this in a data region?

Of course the time is not very high (45 us), but it is comparable to the kernel execution time. I have quite a few of these transfers, and since some are issued inside a loop, the total is not negligible in the end.

Thanks,

Xavier

Hi Xavier,

I think I was mistaken before. In this case, it may actually be the parameter list of the routine. CUDA limits the argument list to 256 bytes, so for routines with larger argument lists we create a struct with the arguments, copy the struct to the device, and then launch the kernel with a pointer to the struct as its argument.

Try keeping the generated GPU code (-ta=nvidia,keepgpu) and look at the top of the .gpu file for a struct named “a12”. If it’s there and is the argument to your routine, then this is what’s going on.
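
For example, something along these lines (the file name is taken from the debug output above; your actual build options may differ):

pgf90 -ta=nvidia,keepgpu -c soil_multilay.f90
grep a12 *.gpu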

  • Mat

Hi Mat,

I looked in the .gpu file, but there is no a12 variable in the argument list of the kernel.

Most variables correspond to variables in my code (as I can see from the comments), apart from 3 integers.

Anyway, it would be interesting to know whether there is any way to suppress these data transfers, for example by putting scalars in a data region.

Xavier

Hi,

I am coming back to the first issue discussed in this post, concerning the overhead associated with the present directive in OpenACC:

In this case, what’s happening is that the section descriptors for these arrays are being copied over to the GPU each time the present clause is encountered. Michael is aware of the issue and will have his team increase the priority of removing the need to copy over the section descriptors. This work had been scheduled for later this year.

Do you know whether this will be better in 12.6? Or could you give some timeline for when we can expect an improvement in this area?

With the partial support for the parallel construct, we were able to compile some of our OpenACC code with PGI. For some parts, however, where small kernels are called in a loop, this descriptor copying makes the code about 10x slower than what we are getting with CCE.

Best regards,

Xavier

Hi Xavier,

Do you know whether this will be better in 12.6?

Nothing has changed w.r.t. how arguments are passed in 12.6, and Michael did not give me a timeline for when they’ll be able to revisit this.

Michael did mention that you can try setting the environment variable “PGI_ACC_SEQDATA” to 1. Among other things, this uses asynchronous data movement from pinned memory for argument buffers. It may help here.
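
For example, to compare the two modes on the test program from earlier in the thread (assuming a bash-like shell):

./test_openacc 100 100
PGI_ACC_SEQDATA=1 ./test_openacc 100 100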

  • Mat

Hi Mat,

Thanks for the update.

Using PGI_ACC_SEQDATA did not help much in our case: in the part of our code where we noticed this problem, the kernels are quite small and the timing seems to be dominated by these metadata updates.


Xavier

Xavier: You say you see cuMemcpyHtoD calls. These should be cuMemcpyHtoDAsync calls; the updates should be getting done asynchronously, at least with the latest release (12.6). Now that I think about it, I’m not sure those updates were asynchronous before 12.5. There’s really no workaround for sending large argument lists to a kernel except to use a memcopy. We put those argument structs into constant memory on the device side.

Hi Michael,


Sorry, it was probably not clear in my previous post: looking at the profiler, when I go from 12.5 to 12.6 I see those unexpected cuMemcpyHtoD calls change to cuMemcpyHtoDAsync. However, in the part of the code where we noticed this issue, the kernel times are comparable to the cuMemcpyHtoDAsync times, so having them asynchronous does not help very much.

The problem is that the kernels are called in a loop, so these copies are done very often.

There’s really no workaround for sending large argument lists to a kernel except to use a memcopy

Note that the cuMemcpyHtoDAsync calls I am referring to are those associated with the present directive when using OpenACC with PGI. What I don’t fully understand is the fundamental difference between data create + present in OpenACC and mirror + reflected with the old PGI directives. With “mirror + reflected” I didn’t see these additional memcopies at each kernel call, so I assume in that case the array information was sent to the device at the beginning of the implicit data region associated with the mirror directive. Is something similar not possible with OpenACC?
Note also that we are not seeing additional copies when using the Cray compiler on the same code.

Xavier

Hi, here is an update on the previous post:

  • Following advice from Mat in another post, we removed all the “private” statements. We found that this was in fact causing most of the performance issue; the code is now about 10 times faster. My guess is that the private clause was generating additional cudaMalloc calls, and since our kernels are called in a loop this was very bad (or do you have any other idea/experience?).

  • The time spent in the cuMemcpyHtoDAsync calls associated with the array metadata is still of the order of the kernel time, so I am assuming that it cannot be fully overlapped and still causes some performance penalty. In fact, we found that setting PGI_ACC_SEQDATA=1 (making the transfers sequential) is actually faster in our case. I’m not sure why, but my guess is that there is a trade-off between making the transfers asynchronous and the extra time required to allocate pinned memory.

Best regards,

Xavier