call to cuStreamSynchronize returned error 700: Illegal address

Fortran OpenACC uses 32-bit indexing and thus does not work for arrays bigger than 16384 x 16384: an n x n REAL64 array occupies 8*n*n bytes, and 8*16384*16384 = 2**31 bytes = 2 GiB, so any larger array has byte offsets that no longer fit in a signed 32-bit integer. Limiting OpenACC programmers to 2 GiB arrays isn't ideal.
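To make the arithmetic concrete, here is a minimal standalone sketch (mine, not part of the benchmark; the program name is just illustrative) that checks whether the byte offset of the last element of an n x n REAL64 array still fits in a signed 32-bit integer. It does at n = 16384 and no longer does at n = 16385.

program offset_check
  use iso_fortran_env, only : INT32, INT64
  implicit none
  integer(kind=INT64) :: n, last_offset
  do n = 16384_INT64, 16385_INT64
    ! byte offset of element (n,n) in an n x n REAL64 array (8 bytes/element)
    last_offset = 8_INT64 * (n*n - 1_INT64)
    print '(a,i6,a,i12,a,l2)', 'n =', n, ', last byte offset =', last_offset, &
          ', fits in INT32:', last_offset <= int(huge(1_INT32),INT64)
  end do
end program offset_check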

I am using PGI 17.10 with CUDA 9.0, Linux 3.10.0-693.el7.x86_64, and Xeon Scalable 8180 + NVIDIA GV100.

Compile

pgfortran -Mpreprocess -Mfreeform -O3 -tp=haswell -DRADIUS=2 -DSTAR -DPGI -acc -ta=tesla:cuda9.0 -Minfo=accel p2p-innerloop-ornlacc.f90 -o p2p-innerloop-ornlacc

Run

$ make p2p-innerloop-ornlacc && srun ./p2p-innerloop-ornlacc 1 16385 16385
make: `p2p-innerloop-ornlacc' is up to date.
Parallel Research Kernels
Fortran OpenACC INNERLOOP pipeline execution on 2D grid
WARNING: grid size exceeds 16384: 16385
PGI 17.10 + CUDA 9.0 generates illegal address
Number of iterations     =        1
Grid sizes               =    16385   16385
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution

$ make p2p-innerloop-ornlacc && srun ./p2p-innerloop-ornlacc 1 16384 16384
make: `p2p-innerloop-ornlacc' is up to date.
Parallel Research Kernels
Fortran OpenACC INNERLOOP pipeline execution on 2D grid
Number of iterations     =        1
Grid sizes               =    16384   16384
Solution validates
Rate (MFlop/s):   1865.409606 Avg time (s):   0.287768

Source

function prk_get_wtime() result(t)
  use iso_fortran_env
  implicit none
  real(kind=REAL64) ::  t
  integer(kind=INT64) :: c, r
  call system_clock(count = c, count_rate = r)
  t = real(c,REAL64) / real(r,REAL64)
end function prk_get_wtime
 
program main
  use iso_fortran_env
  implicit none
  real(kind=REAL64) :: prk_get_wtime
  ! for argument parsing
  integer :: err
  integer :: arglen
  character(len=32) :: argtmp
  ! problem definition
  integer(kind=INT32) :: iterations                     ! number of times to run the pipeline algorithm
  integer(kind=INT32) :: n
  real(kind=REAL64) :: corner_val                       ! verification value at top right corner of grid
  real(kind=REAL64), allocatable :: grid(:,:)           ! array holding grid values
  ! runtime variables
  integer(kind=INT32) :: i, j, k
  integer(kind=INT32) :: x, y
  real(kind=REAL64) ::  t0, t1, pipeline_time, avgtime  ! timing parameters
  real(kind=REAL64), parameter ::  epsilon=1.D-8        ! error tolerance
 
  ! ********************************************************************
  ! read and test input parameters
  ! ********************************************************************
 
  write(*,'(a25)') 'Parallel Research Kernels'
  write(*,'(a55)') 'Fortran OpenACC INNERLOOP pipeline execution on 2D grid'
 
  if (command_argument_count().lt.2) then
    write(*,'(a17,i1)') 'argument count = ', command_argument_count()
    write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ',  &
                         '<grid dimension>'
    stop 1
  endif
 
  iterations = 1
  call get_command_argument(1,argtmp,arglen,err)
  if (err.eq.0) read(argtmp,'(i32)') iterations
 
  n = 1
  call get_command_argument(2,argtmp,arglen,err)
  if (err.eq.0) read(argtmp,'(i32)') n
 
  if (n .gt. 16384) then
    write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n
    write(*,'(a)')    'PGI 17.10 + CUDA 9.0 generates illegal address'
  endif
 
  if (iterations .lt. 1) then
    write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations
    stop 1
  endif
 
  if (n .lt. 1) then
    write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n
    stop 1
  endif
 
#ifdef _OPENMP
  write(*,'(a,i8)')    'Number of threads        = ', omp_get_max_threads()
#endif
  write(*,'(a,i8)')    'Number of iterations     = ', iterations
  write(*,'(a,i8,i8)') 'Grid sizes               = ', n, n
 
  allocate( grid(n,n), stat=err)
  if (err .ne. 0) then
    write(*,'(a,i3)') 'allocation of grid returned ',err
    stop 1
  endif
 
  do j=1,n
    do i=1,n
      grid(i,j) = 0.0d0
    enddo
  enddo
  do j=1,n
    grid(1,j) = real(j-1,REAL64)
  enddo
  do i=1,n
    grid(i,1) = real(i-1,REAL64)
  enddo
 
  ! keep grid resident on the device for the whole iteration loop
  !$acc data pcopy(grid)
 
  do k=0,iterations
 
    if (k.eq.1) t0 = prk_get_wtime()
 
    ! sweep the grid along anti-diagonals: every point on anti-diagonal i
    ! (x+y = i+2) depends only on points from earlier diagonals, so the
    ! inner j loop can safely run in parallel on the device
    do i=2,2*n-2
      !$acc parallel loop independent
      do j=max(2,i-n+2),min(i,n)
        x = i-j+2
        y = j
        grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1)
      enddo
    enddo
    ! feed the top-right result back to the origin to create a dependency
    ! between successive iterations
    !$acc kernels
    grid(1,1) = -grid(n,n)
    !$acc end kernels
 
  enddo
 
  t1 = prk_get_wtime()
 
  !$acc end data
 
  pipeline_time = t1 - t0
 
  ! ********************************************************************
  ! ** Analyze and output results.
  ! ********************************************************************
 
  ! verify correctness, using top right value
  corner_val = real((iterations+1)*(2*n-2),REAL64);
  if (abs(grid(n,n)-corner_val)/corner_val .gt. epsilon) then
    write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(n,n), &
            ' does not match verification value ', corner_val
    stop 1
  endif
 
  write(*,'(a)') 'Solution validates'
  avgtime = pipeline_time/iterations
  write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real((n-1)*(n-1),REAL64)/avgtime, &
         ' Avg time (s): ', avgtime
 
  deallocate( grid )
 
end program

Hi Jeff,

For dynamic arrays larger than 2GB, please add the compiler flag “-Mlarge_arrays”. For large static arrays, use “-mcmodel=medium”.
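For example, the compile line from above with the flag appended (just a sketch; the other options are unchanged, and since grid is allocatable it is -Mlarge_arrays that applies here):

pgfortran -Mpreprocess -Mfreeform -O3 -tp=haswell -DRADIUS=2 -DSTAR -DPGI \
  -acc -ta=tesla:cuda9.0 -Mlarge_arrays -Minfo=accel \
  p2p-innerloop-ornlacc.f90 -o p2p-innerloop-ornlacc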

I tried compiling your code with -Mlarge_arrays and it works as expected with the large arrays.

Hope this helps,
Mat