Hi Bean,
There are a couple of things going on. Your program has a few small errors (a missing “wait” directive and a missing “end data”), but these are easily fixed. The error itself appears to be caused by a race condition on the “p” variable.
Since “p” is being passed by reference to “mm”, the compiler must make it global and assume that it gets updated in the callee. Before the compute region is launched, “p” gets updated on the device. However, within an asynchronous context, the value of “p” will change depending upon whichever asynchronous queue just updated it. I added TPR#21745 to see what, if anything, the compiler can do here.
The solutions are to either not use a “routine gang” and instead have “mm” use its own compute region, or pass “p” by value to “mm”. However, to use the “value” attribute, “mm” must have an F90 interface, so you’ll either need to put “mm” in a module or write an explicit interface.
Here are two examples. In the second, I put “mm” in a module but also use “parallel” regions instead of “kernels”. “parallel” is better when calling an OpenACC “routine” since it allows you to set the vector length. Without using “vector_length”, the vector length would be “1” since the compiler has no visibility into how the “routine” is parallelized.
% cat Bean.F90
! Times batched matrix multiplications on the accelerator for several batch
! sizes. For each plane p, a and b are initialized in an asynchronous kernels
! region on queue p, then mm is called to multiply that plane on the same
! queue. After all planes are queued, the program waits, copies c back to the
! host, and prints the elapsed wall-clock time plus a few sample values.
program test_routine
  implicit none
  ! Number of planes allocated in the third dimension of a, b and c.
  integer, parameter :: max_planes = 1024
  ! Batch sizes (number of planes processed) for each timed pass.
  integer, parameter :: td(4) = [16, 64, 256, max_planes]
  integer :: i, j, m, n, p
  integer :: t1, t2, dt, count_rate, count_max
  real, allocatable, dimension(:,:,:) :: a, b, c
  real :: secs

  call system_clock(count_max=count_max, count_rate=count_rate)
  n = 20
  allocate( a(n,n,max_planes), b(n,n,max_planes), c(n,n,max_planes) )

  !$acc data create(a,b) copyout(c(1:n,1:n,1:max_planes))
  do m = 1, size(td)
    call system_clock(t1)
    do p = 1, td(m)
      ! Initialize plane p of the input matrices on async queue p.
      !$acc kernels async(p)
      !$acc loop collapse(2) gang vector(128)
      do j = 1, n
        do i = 1, n
          a(i,j,p) = real(i + j)
          b(i,j,p) = real(i - j)
        end do
      end do
      !$acc end kernels
      ! Multiply plane p; mm launches its own compute region with async(p).
      call mm(a, b, c, n, p, max_planes)
    end do
    ! All queues must finish before c is copied back and the timer is read.
    !$acc wait
    !$acc update host(c)
    call system_clock(t2)
    dt = t2 - t1
    ! The clock count wraps to zero after count_max; correct a wrapped reading
    ! without overflowing the default integer.
    if (dt < 0) dt = (count_max - t1) + t2 + 1
    secs = real(dt)/real(count_rate)
    write(*,*)"For domain_num=",td(m)," wall clock time is ",secs
    ! NOTE: c(n,n,n) lies in plane n (= 20), which is only computed in passes
    ! where td(m) >= n; in the first pass (td = 16) it has not been written.
    write(*,*) c(1,1,1),c(2,2,1),c(2,2,2),c(n,n,n)
  end do
  !$acc end data

  deallocate(a, b, c)
end program test_routine
! Computes cc(:,:,p) = matmul(aa(:,:,p), bb(:,:,p)) for a single plane p.
!
! Arguments:
!   aa, bb : input arrays of shape (n,n,td); only plane p is read
!   cc     : output array of shape (n,n,td); only plane p is written
!   n      : order of each square matrix
!   p      : index of the plane to multiply; also used as the async queue id
!   td     : extent of the third dimension of aa, bb and cc
!
! The arrays are named in a present clause, so the caller is expected to have
! placed them on the device already. The kernel is launched with async(p) and
! the routine does not wait for it; the caller must issue a wait before
! reading cc.
subroutine mm(aa,bb,cc,n,p,td)
  implicit none
  integer, intent(in) :: n, p, td
  real, intent(in) :: aa(n,n,td), bb(n,n,td)
  real, intent(inout) :: cc(n,n,td)
  integer :: i, j, k
  real :: tmp

  !$acc kernels loop collapse(2) gang vector(128) async(p) &
  !$acc present(aa,bb,cc)
  do j = 1, n
    do i = 1, n
      ! Accumulate into a scalar rather than into cc directly
      ! (enables ACC parallelism for the k-loop).
      tmp = 0.0
      do k = 1, n
        tmp = tmp + aa(i,k,p) * bb(k,j,p)
      end do
      cc(i,j,p) = tmp
    end do
  end do
end subroutine mm
% pgf90 Bean.F90 -Minfo=accel -acc
test_routine:
16, Generating create(a(:,:,:),b(:,:,:))
Generating copyout(c(1:n,1:n,1:1024))
26, Loop is parallelizable
27, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
26, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%x
27, ! blockidx%x threadidx%x collapsed
40, Generating update host(c(:,:,:))
mm:
59, Generating present(aa(:,:,:),bb(:,:,:),cc(:,:,:))
62, Loop is parallelizable
63, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
62, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%x
63, ! blockidx%x threadidx%x collapsed
65, Loop is parallelizable
% a.out
For domain_num= 16 wall clock time is 2.2085000E-02
2850.000 2790.000 2790.000 0.000000
For domain_num= 64 wall clock time is 1.0190000E-03
2850.000 2790.000 2790.000 -5130.000
For domain_num= 256 wall clock time is 2.6860000E-03
2850.000 2790.000 2790.000 -5130.000
For domain_num= 1024 wall clock time is 1.0812000E-02
2850.000 2790.000 2790.000 -5130.000
% cat Bean2.F90
! Holds the device-callable matrix-multiply routine. Placing mm in a module
! gives callers an explicit interface, which the value attribute on its scalar
! arguments requires.
module mod_mm
  implicit none
  private
  public :: mm
contains
  ! Computes cc(:,:,p) = matmul(aa(:,:,p), bb(:,:,p)) for a single plane p.
  ! Declared as an OpenACC gang routine, so it is meant to be called from
  ! inside a compute region.
  !
  ! Arguments:
  !   aa, bb : input arrays of shape (n,n,td); only plane p is read
  !   cc     : output array of shape (n,n,td); only plane p is written
  !   n      : order of each square matrix (passed by value)
  !   p      : index of the plane to multiply (passed by value, so each call
  !            keeps its own copy)
  !   td     : extent of the third dimension of the arrays (passed by value)
  subroutine mm(aa,bb,cc,n,p,td)
    !$acc routine gang
    integer, value :: n, p, td
    real, intent(in) :: aa(n,n,td), bb(n,n,td)
    real, intent(inout) :: cc(n,n,td)
    integer :: i, j, k
    real :: tmp

    !$acc loop collapse(2) gang vector
    do j = 1, n
      do i = 1, n
        ! Accumulate into a scalar rather than into cc directly.
        tmp = 0.0
        !$acc loop seq
        do k = 1, n
          tmp = tmp + aa(i,k,p) * bb(k,j,p)
        end do
        cc(i,j,p) = tmp
      end do
    end do
  end subroutine mm
end module mod_mm
! Times batched matrix multiplications on the accelerator for several batch
! sizes, calling the gang routine mm from mod_mm. For each plane p, one
! asynchronous parallel region on queue p initializes a, b and c, and a second
! one on the same queue calls mm. After all planes are queued, the program
! waits, copies c back to the host, and prints the elapsed wall-clock time plus
! a few sample values.
program test_routine
  use mod_mm, only: mm
  implicit none
  ! Number of planes allocated in the third dimension of a, b and c.
  integer, parameter :: max_planes = 1024
  ! Batch sizes (number of planes processed) for each timed pass.
  integer, parameter :: td(4) = [16, 64, 256, max_planes]
  integer :: i, j, m, n, p
  integer :: t1, t2, dt, count_rate, count_max
  real, allocatable, dimension(:,:,:) :: a, b, c
  real :: secs

  call system_clock(count_max=count_max, count_rate=count_rate)
  n = 20
  allocate( a(n,n,max_planes), b(n,n,max_planes), c(n,n,max_planes) )
  ! Zero c on the host so the copyin below gives every plane a defined value.
  c = 0.0

  !$acc data create(a,b) copyin(c(1:n,1:n,1:max_planes))
  do m = 1, size(td)
    call system_clock(t1)
    do p = 1, td(m)
      ! Initialize plane p of the matrices on async queue p.
      !$acc parallel loop collapse(2) gang vector async(p)
      do j = 1, n
        do i = 1, n
          a(i,j,p) = real(i + j)
          b(i,j,p) = real(i - j)
          c(i,j,p) = 0.0
        end do
      end do
      !$acc end parallel loop
      ! Multiply plane p on the same queue. The vector length is given here
      ! because the loop scheduling lives inside the routine, not in this
      ! region.
      !$acc parallel num_gangs(n) vector_length(128) async(p)
      call mm(a, b, c, n, p, max_planes)
      !$acc end parallel
    end do
    ! All queues must finish before c is copied back and the timer is read.
    !$acc wait
    !$acc update host(c)
    call system_clock(t2)
    dt = t2 - t1
    ! The clock count wraps to zero after count_max; correct a wrapped reading
    ! without overflowing the default integer.
    if (dt < 0) dt = (count_max - t1) + t2 + 1
    secs = real(dt)/real(count_rate)
    write(*,*)"For domain_num=",td(m)," wall clock time is ",secs
    write(*,*) c(1,1,1),c(2,2,2),c(10,10,10),c(n,n,td(m))
  end do
  !$acc end data

  deallocate(a, b, c)
end program test_routine
% pgf90 Bean2.F90 -Minfo=accel -acc
mm:
4, Generating acc routine gang
Generating Tesla code
13, !$acc loop gang, vector collapse(2) ! blockidx%x threadidx%x
14, ! blockidx%x threadidx%x collapsed
13, Loop is parallelizable
14, Loop is parallelizable
17, Loop is parallelizable
test_routine:
42, Generating create(a(:,:,:),b(:,:,:))
Generating copyin(c(1:n,1:n,1:1024))
50, Accelerator kernel generated
Generating Tesla code
51, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%x
52, ! blockidx%x threadidx%x collapsed
61, Accelerator kernel generated
Generating Tesla code
66, Generating update host(c(:,:,:))
% a.out
For domain_num= 16 wall clock time is 4.4800001E-03
2850.000 2790.000 870.0000 -5130.000
For domain_num= 64 wall clock time is 1.0010001E-03
2850.000 2790.000 870.0000 -5130.000
For domain_num= 256 wall clock time is 3.1010001E-03
2850.000 2790.000 870.0000 -5130.000
For domain_num= 1024 wall clock time is 1.3404000E-02
2850.000 2790.000 870.0000 -5130.000