Fortran OpenACC uses 32-bit indexing and thus does not work for arrays bigger than 16384 x 16384 (because 8 x 16384 x 16384 = 2**31 bytes, i.e. 2 GiB of REAL64 data). Limiting OpenACC programmers to 2 GiB arrays isn’t ideal.
I am using PGI 17.10 with CUDA 9.0, Linux 3.10.0-693.el7.x86_64, and Xeon Scalable 8180 + NVIDIA GV100.
Compile
pgfortran -Mpreprocess -Mfreeform -O3 -tp=haswell -DRADIUS=2 -DSTAR -DPGI -acc -ta=tesla:cuda9.0 -Minfo=accel p2p-innerloop-ornlacc.f90 -o p2p-innerloop-ornlacc
Run
$ make p2p-innerloop-ornlacc && srun ./p2p-innerloop-ornlacc 1 16385 16385
make: `p2p-innerloop-ornlacc' is up to date.
Parallel Research Kernels
Fortran OpenACC INNERLOOP pipeline execution on 2D grid
WARNING: grid size exceeds 16384: 16385
PGI 17.10 + CUDA 9.0 generates illegal address
Number of iterations = 1
Grid sizes = 16385 16385
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
$ make p2p-innerloop-ornlacc && srun ./p2p-innerloop-ornlacc 1 16384 16384
make: `p2p-innerloop-ornlacc' is up to date.
Parallel Research Kernels
Fortran OpenACC INNERLOOP pipeline execution on 2D grid
Number of iterations = 1
Grid sizes = 16384 16384
Solution validates
Rate (MFlop/s): 1865.409606 Avg time (s): 0.287768
Source
!> Return the current reading of the processor clock in seconds.
!>
!> The value is the SYSTEM_CLOCK tick count divided by the tick rate, both
!> queried as 64-bit integers. Only differences between two readings are
!> meaningful as elapsed time.
function prk_get_wtime() result(wtime)
  use, intrinsic :: iso_fortran_env, only : REAL64, INT64
  implicit none
  real(kind=REAL64) :: wtime
  integer(kind=INT64) :: ticks, ticks_per_second
  ! a single call fetches both the counter and its rate
  call system_clock(count_rate = ticks_per_second, count = ticks)
  wtime = real(ticks,REAL64) / real(ticks_per_second,REAL64)
end function prk_get_wtime
! Parallel Research Kernels "p2p" pipeline benchmark, OpenACC INNERLOOP variant.
!
! Command line:  <# iterations> <grid dimension>
!
! An n x n grid is initialised with its first row and first column set to
! 0, 1, ..., n-1 and every other cell set to zero. Each sweep then updates
!     grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1)
! one anti-diagonal at a time, offloading the cells of a single anti-diagonal
! with an OpenACC parallel loop. After each sweep the negated corner value
! grid(n,n) is written back into grid(1,1).
!
! iterations+1 sweeps are executed; the timer is started at the beginning of
! the second sweep, so the first sweep is not part of the reported time.
!
! NOTE(review): the report accompanying this code observes an illegal-address
! failure with PGI 17.10 + CUDA 9.0 once n exceeds 16384 (grid larger than
! 2 GiB). The PGI option -Mlarge_arrays is presumably relevant - TODO confirm.
program main
  use, intrinsic :: iso_fortran_env, only : REAL64, INT32
#ifdef _OPENMP
  ! required so that omp_get_max_threads has an explicit type under implicit none
  use omp_lib, only : omp_get_max_threads
#endif
  implicit none
  real(kind=REAL64) :: prk_get_wtime
  ! for argument parsing
  integer :: err
  integer :: arglen
  character(len=32) :: argtmp
  ! problem definition
  integer(kind=INT32) :: iterations                ! number of times to run the pipeline algorithm
  integer(kind=INT32) :: n                         ! grid dimension (the grid is n x n)
  real(kind=REAL64) :: corner_val                  ! verification value at top right corner of grid
  real(kind=REAL64), allocatable :: grid(:,:)      ! array holding grid values
  ! runtime variables
  integer(kind=INT32) :: i, j, k
  integer(kind=INT32) :: x, y
  real(kind=REAL64) :: t0, t1, pipeline_time, avgtime ! timing parameters
  real(kind=REAL64), parameter :: epsilon=1.D-8    ! error tolerance

  ! ********************************************************************
  ! read and test input parameters
  ! ********************************************************************

  write(*,'(a25)') 'Parallel Research Kernels'
  write(*,'(a55)') 'Fortran OpenACC INNERLOOP pipeline execution on 2D grid'

  if (command_argument_count().lt.2) then
    write(*,'(a17,i1)') 'argument count = ', command_argument_count()
    write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ', &
                         '<grid dimension>'
    stop 1
  endif

  ! A value of 1 is kept if the argument cannot be retrieved; an argument
  ! that is retrieved but is not an integer is reported instead of letting
  ! the read abort with a runtime error.
  iterations = 1
  call get_command_argument(1,argtmp,arglen,err)
  if (err.eq.0) then
    read(argtmp,'(i32)',iostat=err) iterations
    if (err.ne.0) then
      write(*,'(a,a)') 'ERROR: could not read <# iterations> from: ', trim(argtmp)
      stop 1
    endif
  endif

  n = 1
  call get_command_argument(2,argtmp,arglen,err)
  if (err.eq.0) then
    read(argtmp,'(i32)',iostat=err) n
    if (err.ne.0) then
      write(*,'(a,a)') 'ERROR: could not read <grid dimension> from: ', trim(argtmp)
      stop 1
    endif
  endif

  if (n .gt. 16384) then
    ! i0 instead of i5 so that values of six or more digits are not printed as '*****'
    write(*,'(a,i0)') 'WARNING: grid size exceeds 16384: ', n
    write(*,'(a)') 'PGI 17.10 + CUDA 9.0 generates illegal address'
  endif

  if (iterations .lt. 1) then
    write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations
    stop 1
  endif

  if (n .lt. 1) then
    write(*,'(a,i5)') 'ERROR: array dimensions must be >= 1 : ', n
    stop 1
  endif

#ifdef _OPENMP
  write(*,'(a,i8)') 'Number of threads    = ', omp_get_max_threads()
#endif
  write(*,'(a,i8)') 'Number of iterations = ', iterations
  write(*,'(a,i8,i8)') 'Grid sizes           = ', n, n

  allocate( grid(n,n), stat=err)
  if (err .ne. 0) then
    write(*,'(a,i3)') 'allocation of grid returned ',err
    stop 1
  endif

  ! interior starts at zero; first row and first column hold 0..n-1
  grid = 0.0d0
  do j=1,n
    grid(1,j) = real(j-1,REAL64)
  enddo
  do i=1,n
    grid(i,1) = real(i-1,REAL64)
  enddo

  !$acc data pcopy(grid)
  do k=0,iterations

    ! start the timer after the first (k=0) sweep
    if (k.eq.1) t0 = prk_get_wtime()

    ! i selects the anti-diagonal x+y = i+2. Every cell on it reads only
    ! cells with a smaller x+y, so the j loop carries no dependence.
    do i=2,2*n-2
      !$acc parallel loop independent
      do j=max(2,i-n+2),min(i,n)
        x = i-j+2
        y = j
        grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1)
      enddo
    enddo

    ! feed the negated corner value back into the origin for the next sweep
    !$acc kernels
    grid(1,1) = -grid(n,n)
    !$acc end kernels

  enddo
  t1 = prk_get_wtime()
  !$acc end data

  pipeline_time = t1 - t0

  ! ********************************************************************
  ! ** Analyze and output results.
  ! ********************************************************************

  ! verify correctness, using top right value
  ! (the product is formed in REAL64 so that it cannot overflow INT32)
  corner_val = real(iterations+1,REAL64) * (2.0d0*real(n-1,REAL64))
  if (abs(grid(n,n)-corner_val)/corner_val .gt. epsilon) then
    write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(n,n), &
            ' does not match verification value ', corner_val
    stop 1
  endif

  write(*,'(a)') 'Solution validates'
  avgtime = pipeline_time/iterations
  ! (n-1)**2 is evaluated in REAL64: the INT32 product overflows for n > 46341
  write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real(n-1,REAL64)**2/avgtime, &
          ' Avg time (s): ', avgtime

  deallocate( grid )

end program main