Hi Paul,
Interesting test case with multiple issues. One code issue and three compiler issues. Not to give away the ending, but the work around in all cases is to use atomics. Atomics may be the better way to go anyway, at least for “cluster_sum” given the high overhead of doing array reductions, but we wont be able to compare performance until we get these issues fixed.
Case #1, “cluster_sum”
Error 1 “Illegal address during kernel execution”
This is due to the size of the array and the memory required to create the reductions private partial arrays. Normally I’d suggest adding “-Mlarge_arrays” or “-mcmodel=medium”, that this doesn’t seem to help. Hence I’ve added TPR #31874 and have asked our engineers to look at the private array indexing and make sure 64-bit offsets are being used.
Error 2 “Wrong answers”
Reducing “ncluster” to a smaller value will work around the first error, but the code now gets incorrect answers. I’m not sure the cause of this, but speculate it may have something to do with the irregular loop bound and the use of the look-up arrays. Though it could be something else as well. I’ve added TPR #31876 and asked engineering to review.
Case #2 “cluster_sum_local”
Error #1 “Initial wrong answers”
I think the initial wrong answers you’re see is because “accum” is shared on the device, hence getting collision. Hence, I instead made accum “private”. Though this caused a different issue.
Error #2 “call to cuLaunchKernel returned error 1: Invalid value”
After privatization on the “gang” loop, the compiler will put “accum” in shared memory. However, when “accum” is then added to a reduction, the amount of dynamic shared memory being consumed is much greater than the maximum allowed per block, hence the launch error. I’ve added TPR #31877 for this case.
Setting “num_gangs(15)” so the amount of shared memory is below the max works around this error, but the code gets incorrect results. I’m guess this is the same problem as TPR #31876 so did not add another report.
Here’s the test case with my changes and the atomic work around:
% cat array_accum.F90
! Gives "call to cuStreamSynchronize returned error 700: Illegal address
! during kernel execution"
subroutine cluster_sum(ncluster, nverts, nmap, &
cluster_begin, cluster_end, cluster_vert_map, vert_data, cluster_data)
implicit none
integer, intent(in) :: ncluster, nverts, nmap, cluster_begin(ncluster), &
cluster_end(ncluster), cluster_vert_map(nmap)
real(8), intent(in) :: vert_data(3, nverts)
real(8), intent(out) :: cluster_data(3, ncluster)
integer :: c, vi, l
!$acc data &
!$acc copy(cluster_data) &
!$acc copyin(ncluster, nverts, nmap, cluster_begin, cluster_end, &
!$acc cluster_vert_map, vert_data)
!$acc parallel loop default(none)
do c = 1, ncluster
#ifdef USE_ATOMIC
!$acc loop
do vi = cluster_begin(c), cluster_end(c)
do l=1,3
!$acc atomic update
cluster_data(l, c) = cluster_data(l, c) + &
vert_data(l, cluster_vert_map(vi))
end do
#else
!$acc loop reduction(+:cluster_data(:,c))
do vi = cluster_begin(c), cluster_end(c)
cluster_data(:, c) = cluster_data(:, c) + &
vert_data(:, cluster_vert_map(vi))
#endif
end do
end do
!$acc end data
end subroutine cluster_sum
! Gives incorrect result
subroutine cluster_sum_local(ncluster, nverts, nmap, &
cluster_begin, cluster_end, cluster_vert_map, vert_data, cluster_data)
implicit none
integer, intent(in) :: ncluster, nverts, nmap, cluster_begin(ncluster), &
cluster_end(ncluster), cluster_vert_map(nmap)
real(8), intent(in) :: vert_data(3, nverts)
real(8), intent(out) :: cluster_data(3, ncluster)
integer :: c, vi, l
real(8) :: accum(3)
!acc declare create(accum)
!$acc data &
!$acc copyout(cluster_data) &
!$acc copyin(ncluster, nverts, nmap, cluster_begin, cluster_end, &
!$acc cluster_vert_map, vert_data)
!$acc parallel loop private(accum(:)) default(none)
do c = 1, ncluster
accum = 0.d0
#ifdef USE_ATOMIC
!$acc loop
do vi = cluster_begin(c), cluster_end(c)
do l=1,3
!$acc atomic update
accum(l) = accum(l) + vert_data(l, cluster_vert_map(vi))
enddo
#else
!$acc loop reduction(+:accum(:))
do vi = cluster_begin(c), cluster_end(c)
accum(:) = accum(:) + vert_data(:, cluster_vert_map(vi))
#endif
end do
cluster_data(:, c) = accum(:)
end do
!$acc end data
end subroutine cluster_sum_local
! Gives correct result
subroutine cluster_sum_scalar(ncluster, nverts, nmap, &
cluster_begin, cluster_end, cluster_vert_map, vert_data, cluster_data)
implicit none
integer, intent(in) :: ncluster, nverts, nmap, cluster_begin(ncluster), &
cluster_end(ncluster), cluster_vert_map(nmap)
real(8), intent(in) :: vert_data(3, nverts)
real(8), intent(out) :: cluster_data(3, ncluster)
integer :: c, vi
real(8) :: accum1, accum2, accum3
!$acc data &
!$acc copyout(cluster_data) &
!$acc copyin(ncluster, nverts, nmap, cluster_begin, cluster_end, &
!$acc cluster_vert_map, vert_data)
!$acc parallel loop default(none)
do c = 1, ncluster
accum1 = 0.d0; accum2 = 0.d0; accum3 = 0.d0
!$acc loop reduction(+:accum1,accum2,accum3)
do vi = cluster_begin(c), cluster_end(c)
accum1 = accum1 + vert_data(1, cluster_vert_map(vi))
accum2 = accum2 + vert_data(2, cluster_vert_map(vi))
accum3 = accum3 + vert_data(3, cluster_vert_map(vi))
end do
cluster_data(1, c) = accum1
cluster_data(2, c) = accum2
cluster_data(3, c) = accum3
end do
!$acc end data
end subroutine cluster_sum_scalar
integer function rand_int(min_val, max_val)
implicit none
integer, intent(in) :: min_val, max_val
real :: rvalue
call random_number(rvalue)
rand_int = min_val + int(floor(dble(max_val - min_val + 1) * rvalue))
end function rand_int
program temp
implicit none
#ifdef SMALL
integer, parameter :: ncluster = 50000, nverts = ncluster*6, maxcsize = 26
#else
integer, parameter :: ncluster = 5000000, nverts = ncluster*6, maxcsize = 26
#endif
integer :: cluster_begin(ncluster), cluster_end(ncluster)
integer, allocatable :: cluster_vert_map(:)
integer :: i, next_begin, nmap
integer, external :: rand_int
real(8) :: vert_data(3,nverts), cluster_data(3,ncluster)
real(8) :: expected(3)
logical :: okay
! Build dummy unstructured graph connectivity and data
next_begin = 1
do i = 1, ncluster
cluster_begin(i) = next_begin
cluster_end(i) = next_begin + rand_int(3,maxcsize)
next_begin = cluster_end(i) + 1
end do
nmap = cluster_end(ncluster)
allocate(cluster_vert_map(nmap))
do i = 1, nmap
cluster_vert_map(i) = rand_int(1, nverts)
end do
vert_data(:,:) = 1.d0
cluster_data=0.0
#if defined(CASE1)
call cluster_sum(ncluster, nverts, nmap, cluster_begin, cluster_end, &
cluster_vert_map, vert_data, cluster_data)
#elif defined(CASE2)
call cluster_sum_local(ncluster, nverts, nmap, cluster_begin, cluster_end, &
cluster_vert_map, vert_data, cluster_data)
#else
call cluster_sum_scalar(ncluster, nverts, nmap, cluster_begin, cluster_end, &
cluster_vert_map, vert_data, cluster_data)
#endif
okay = .true.
do i = 1, ncluster
expected(:) = dble(cluster_end(i) - cluster_begin(i) + 1)
if (any(cluster_data(:,i) /= expected)) then
print *, 'Cluster ', i, ' bad'
print *, ' expected: ', int(expected)
print *, ' got: ', int(cluster_data(:,i))
okay = .false.
exit
end if
end do
if (okay) then
print *, 'All okay'
else
print *, 'FAIL'
end if
end program temp
% nvfortran -acc array_accum.F90 -fast -DCASE1 ; a.out
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
% nvfortran -acc array_accum.F90 -fast -DCASE1 -DSMALL ; a.out
Cluster 1 bad
expected: 25 25 25
got: 0 0 0
FAIL
% nvfortran -acc array_accum.F90 -fast -DCASE1 -DUSE_ATOMIC ; a.out
All okay
% nvfortran -acc array_accum.F90 -fast -DCASE2 ; a.out
Failing in Thread:1
call to cuLaunchKernel returned error 1: Invalid value
% nvfortran -acc array_accum.F90 -fast -DCASE2 -DUSE_ATOMIC ; a.out
All okay
-Mat