It’s actually failing for a different reason: “n”.
Adding “n” to a firstprivate clause, along with privatizing “kk” and “k”, works around the issue.
Alternatively, you can use the “loop” directive or OpenACC, which allow for better analysis during compilation, so the compiler is most likely able to auto-detect that “n” should be firstprivate.
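In other words, the key change is just the data-sharing clauses on the combined construct (this is the relevant excerpt from the full reproducer below):

!$omp target teams distribute parallel do collapse(2) reduction(+:c) private(kk,k) firstprivate(n)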
% cat test.F90
module var_rtable
   real*8, dimension(:), allocatable :: a, b
   real*8, dimension(:), allocatable :: c
   integer :: n
end module var_rtable

program matrix_multiply
   use omp_lib
   use var_rtable
   implicit none
   integer :: i, j, k, kk, myid, m, mm, compiled_for, option
   integer, parameter :: fd = 11
   integer :: t1, t2, dt, count_rate, count_max
   real :: tmp, secs, acumm

   open(fd,file='wallclocktime',form='formatted')
   option = compiled_for(fd) ! OpenMP
   call system_clock(count_max=count_max, count_rate=count_rate)
   call system_clock(t1)
   mm = 10
   m = 1
   n = 513
   ! n = 350
   acumm = 0
   allocate( a(n*n), b(n*n), c(mm) )

   ! Initialize matrices
   do j=1,n
      do i=1,n
         kk = (j-1)*n+i
         a(kk) = 1.0
         b(kk) = 2.0
         do k=1,mm
            c(k) = 0.0
         enddo
      enddo
   enddo

   call test_sub_red_omp(mm)
   print*,"Answer should be 526338 and the test gives ",c(1)
   call system_clock(t2)
   dt = t2-t1
   secs = real(dt)/real(count_rate)
   write(fd,"('For n=',i4,', wall clock time is ',f12.10,' seconds')") &
      n, secs
   deallocate(a, b, c)
   close(fd)

contains

   subroutine test_sub_red_omp(mm)
      use var_rtable
      integer, intent(in) :: mm
      integer :: i, j, k, l, kk
#ifdef _OPENACC
      !$acc parallel loop gang vector collapse(2) reduction(+:c)
#elif defined(USE_LOOP)
      !$omp target teams loop collapse(2) reduction(+:c) bind(teams,parallel)
#else
      !$omp target teams distribute parallel do collapse(2) reduction(+:c) private(kk,k) firstprivate(n)
#endif
      do j=1,n
         do i=1,n
            kk = (j-1)*n+i
            do k=1,mm
               c(k) = c(k) + a(kk) * b(kk)
            enddo
         enddo
      enddo
   end subroutine test_sub_red_omp

end program matrix_multiply

integer function compiled_for(fd)
   implicit none
   integer :: fd
   compiled_for = 3
   write(fd,"('This code is compiled with OpenMP')")
end function compiled_for
% nvfortran -acc=gpu -Minfo=accel test.F90 -gpu=managed ; a.out
test_sub_red_omp:
63, Generating copy(c(:)) [if not already present]
Generating implicit copyin(b(:)) [if not already present]
Generating NVIDIA GPU code
69, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%x
Generating reduction(+:c(:))
70, ! blockidx%x threadidx%x collapsed
72, !$acc loop seq
63, Generating implicit copyin(a(:)) [if not already present]
70, Generating implicit private(kk)
72, Loop is parallelizable
Answer should be 526338 and the test gives 526338.0000000000
% nvfortran -mp=gpu -Minfo=mp test.F90 -gpu=managed -DUSE_LOOP ; a.out
test_sub_red_omp:
65, !$omp target teams loop
65, Generating "nvkernel_matrix_multiply_test_sub_red_omp_F1L65_2" GPU kernel
Generating NVIDIA GPU code
69, Loop parallelized across teams, threads(128) collapse(2) ! blockidx%x threadidx%x
70, ! blockidx%x threadidx%x collapsed
72, Loop run sequentially
69, Generating reduction(+:c(:))
65, Generating Multicore code
69, Loop parallelized across threads
70, Generating implicit private(kk)
72, Loop is parallelizable
Answer should be 526338 and the test gives 526338.0000000000
% nvfortran -mp=gpu -Minfo=mp test.F90 -gpu=managed ; a.out
test_sub_red_omp:
67, !$omp target teams distribute parallel do
67, Generating "nvkernel_matrix_multiply_test_sub_red_omp_F1L67_2" GPU kernel
Answer should be 526338 and the test gives 526338.0000000000
-Mat