Dear all,
We have written a simple test involving a derived type used inside an OpenACC parallel loop in a way that looks quite cumbersome: at first sight we assumed it could not work, since it appears to violate the OpenACC standard; nevertheless, it works well.
The first question is: can you tell us whether the following test is incorrect, so that the fact that it works is mere coincidence (perhaps due to a compiler bug)?
program test_dtype
   use openacc
   implicit none
   type :: mytype
      ! prototype derived type
      real, pointer, dimension(:,:) :: a => null() ! a matrix
      real, pointer, dimension(:,:) :: b => null() ! b matrix
      real, pointer, dimension(:,:) :: c => null() ! c = a x b matrix
   endtype
   integer :: i, j, k, m, n                      ! counters
   integer :: t1, t2, dt, count_rate, count_max ! timing counters
   real    :: secs                               ! timing seconds
   real    :: tmp                                ! temporary buffer
   type(mytype), target :: mat                   ! derived type matrices instance
   call system_clock(count_max=count_max, count_rate=count_rate)
   associate(aa=>mat%a, bb=>mat%b, cc=>mat%c)
      do m=1,4 ! test different matrix-multiply sizes
         n = 1000*2**(m-1) ! 1000, 2000, 4000, 8000
         print '(A)'
         print '(A,I6)', 'Elements number: ', n
         allocate(mat%a(n,n), mat%b(n,n), mat%c(n,n))
         call system_clock(t1)
         !$acc data create(aa,bb) copyout(cc)
         ! initialize matrices
         !$acc parallel loop gang worker vector collapse(2)
         do j=1,n
            do i=1,n
               if (i == j) then
                  mat%a(i,j) = 10.0
                  mat%b(i,j) = 0.1
               else
                  mat%a(i,j) = 0.0
                  mat%b(i,j) = 0.0
               endif
            enddo
         enddo
         !$acc end parallel loop
         ! multiply matrices
         !$acc parallel loop gang worker vector collapse(2) private(i,j,k) reduction(+:tmp) vector_length(128)
         do j=1,n
            do i=1,n
               tmp = 0.0 ! enables ACC parallelism for the k-loop
               !$acc loop private(k)
               do k=1,n
                  tmp = tmp + mat%a(i,k)*mat%b(k,j)
               enddo
               mat%c(i,j) = tmp
            enddo
         enddo
         !$acc end parallel loop
         !$acc end data
         do i=n/10,n,n/10
            print '(A,I6,F10.1)', 'c(i,i): ', i, mat%c(i,i)
         enddo
         call system_clock(t2)
         dt = t2-t1
         secs = real(dt)/real(count_rate)
         print '(A,E16.4)', 'Seconds: ', secs
         deallocate(mat%a, mat%b, mat%c)
      enddo
   endassociate
endprogram test_dtype
Compiling this test (nvfortran from the NVIDIA HPC SDK 24.7 environment) produces:
nvfortran -cpp -acc -gpu=cc89 -fast -Minfo=all test_dtype-scalar.f90 -o test_dtype-scalar
test_dtype:
     19, Loop not vectorized/parallelized: too deeply nested
     25, Generating create(aa(:,:),bb(:,:)) [if not already present]
         Generating copyout(cc(:,:)) [if not already present]
     27, Generating implicit firstprivate(n)
         Generating NVIDIA GPU code
         28, !$acc loop gang, worker(4), vector(32) collapse(2) ! blockidx%x threadidx%y threadidx%x
         29,   ! blockidx%x threadidx%y threadidx%x collapsed
     27, Generating implicit copyin(mat) [if not already present]
         Generating implicit copy(mat%b(1:n,1:n),mat%a(1:n,1:n)) [if not already present]
     28, Loop not fused: no successor loop
     29, Zero trip check eliminated
         Loop not vectorized: data dependency
     41, Generating implicit firstprivate(n)
         Generating NVIDIA GPU code
         42, !$acc loop gang, vector(128) collapse(2) ! blockidx%x threadidx%y threadidx%x
             Generating reduction(+:tmp)
         43,   ! blockidx%x threadidx%y threadidx%x collapsed
         46, !$acc loop seq
             Generating implicit reduction(+:tmp)
     41, Generating implicit copyin(mat%b(1:n,1:n),mat%a(1:n,1:n),mat) [if not already present]
         Generating implicit copy(tmp) [if not already present]
         Generating implicit copyout(mat%c(1:n,1:n)) [if not already present]
     42, Loop not fused: no successor loop
     43, Zero trip check eliminated
     46, Loop is parallelizable
         Zero trip check eliminated
         Loop not vectorized: unprofitable for target
     47, FMA (fused multiply-add) instruction(s) generated
     54, Loop not vectorized/parallelized: contains call
The runtime execution is correct:
./test_dtype-scalar
Elements number: 1000
c(i,i): 100 1.0
c(i,i): 200 1.0
c(i,i): 300 1.0
c(i,i): 400 1.0
c(i,i): 500 1.0
c(i,i): 600 1.0
c(i,i): 700 1.0
c(i,i): 800 1.0
c(i,i): 900 1.0
c(i,i): 1000 1.0
Seconds: 0.3344E+00
Elements number: 2000
c(i,i): 200 1.0
c(i,i): 400 1.0
c(i,i): 600 1.0
c(i,i): 800 1.0
c(i,i): 1000 1.0
c(i,i): 1200 1.0
c(i,i): 1400 1.0
c(i,i): 1600 1.0
c(i,i): 1800 1.0
c(i,i): 2000 1.0
Seconds: 0.5616E-01
Elements number: 4000
c(i,i): 400 1.0
c(i,i): 800 1.0
c(i,i): 1200 1.0
c(i,i): 1600 1.0
c(i,i): 2000 1.0
c(i,i): 2400 1.0
c(i,i): 2800 1.0
c(i,i): 3200 1.0
c(i,i): 3600 1.0
c(i,i): 4000 1.0
Seconds: 0.4167E+00
Elements number: 8000
c(i,i): 800 1.0
c(i,i): 1600 1.0
c(i,i): 2400 1.0
c(i,i): 3200 1.0
c(i,i): 4000 1.0
c(i,i): 4800 1.0
c(i,i): 5600 1.0
c(i,i): 6400 1.0
c(i,i): 7200 1.0
c(i,i): 8000 1.0
Seconds: 0.4154E+01
Some notes
Our main surprise is that the compiler (nvfortran v24.11.00 and newer) does not complain about the use of mat%a, mat%b, mat%c inside the parallel loops, even though these variables are not explicitly declared in the acc data directive, but only "indirectly" through the associate construct (and the aliases aa, bb, cc are not used anywhere other than in the acc data directive).
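For reference, this is the explicit data management we expected to be required for the members, written directly against mat%a, mat%b, mat%c with no associate aliases. It is only a minimal sketch, assuming the OpenACC 2.6 "manual deep copy" rules, under which copying the parent first and then creating the members should attach the member device pointers to the device copy of mat:

   !$acc enter data copyin(mat)                 ! shallow copy of the parent on the device
   !$acc enter data create(mat%a, mat%b, mat%c) ! allocate the members and attach them to the device copy of mat

   ! ... the two parallel loops, unchanged, referencing mat%a, mat%b, mat%c ...

   !$acc exit data copyout(mat%c)               ! copy the result back and detach
   !$acc exit data delete(mat%a, mat%b)
   !$acc exit data delete(mat)

Instead, the implicit copyin(mat) plus the implicit copy of the members reported by -Minfo above seems to do the equivalent work for us.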
Moreover, if we complicate the test just one step further and declare mat as an array instead of a scalar, the program produces a runtime error:
program test_dtype
   use openacc
   implicit none
   type :: mytype
      ! prototype derived type
      real, pointer, dimension(:,:) :: a => null() ! a matrix
      real, pointer, dimension(:,:) :: b => null() ! b matrix
      real, pointer, dimension(:,:) :: c => null() ! c = a x b matrix
   endtype
   integer :: i, j, k, m, n                      ! counters
   integer :: t1, t2, dt, count_rate, count_max ! timing counters
   real    :: secs                               ! timing seconds
   real    :: tmp                                ! temporary buffer
   type(mytype), target :: mat(1)                ! derived type matrices instance (now an array)
   call system_clock(count_max=count_max, count_rate=count_rate)
   associate(aa=>mat(1)%a, bb=>mat(1)%b, cc=>mat(1)%c)
      do m=1,4 ! test different matrix-multiply sizes
         n = 1000*2**(m-1) ! 1000, 2000, 4000, 8000
         print '(A)'
         print '(A,I6)', 'Elements number: ', n
         allocate(mat(1)%a(n,n), mat(1)%b(n,n), mat(1)%c(n,n))
         call system_clock(t1)
         !$acc data create(aa,bb) copyout(cc)
         ! initialize matrices
         !$acc parallel loop gang worker vector collapse(2)
         do j=1,n
            do i=1,n
               if (i == j) then
                  mat(1)%a(i,j) = 10.0
                  mat(1)%b(i,j) = 0.1
               else
                  mat(1)%a(i,j) = 0.0
                  mat(1)%b(i,j) = 0.0
               endif
            enddo
         enddo
         !$acc end parallel loop
         ! multiply matrices
         !$acc parallel loop gang worker vector collapse(2) private(i,j,k) reduction(+:tmp) vector_length(128)
         do j=1,n
            do i=1,n
               tmp = 0.0 ! enables ACC parallelism for the k-loop
               !$acc loop private(k)
               do k=1,n
                  tmp = tmp + mat(1)%a(i,k)*mat(1)%b(k,j)
               enddo
               mat(1)%c(i,j) = tmp
            enddo
         enddo
         !$acc end parallel loop
         !$acc end data
         do i=n/10,n,n/10
            print '(A,I6,F10.1)', 'c(i,i): ', i, mat(1)%c(i,i)
         enddo
         call system_clock(t2)
         dt = t2-t1
         secs = real(dt)/real(count_rate)
         print '(A,E16.4)', 'Seconds: ', secs
         deallocate(mat(1)%a, mat(1)%b, mat(1)%c)
      enddo
   endassociate
endprogram test_dtype
Compilation succeeds as before, but the runtime execution produces:
Elements number: 1000
Failing in Thread:1
Accelerator Fatal Error: call to cuStreamSynchronize returned error 700 (CUDA_ERROR_ILLEGAL_ADDRESS): Illegal address during kernel execution
File: /home/stefano/fortran/FUNDAL/compilers_proofs/oac/test_dtype.f90
Function: test_dtype:1
Line: 28
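If it is useful for the discussion, the workaround we would try for the array case is the same explicit attach sketched above, applied to mat(1). Again, this is only a sketch under the OpenACC 2.6 manual deep copy assumption; we have not verified it:

   !$acc enter data copyin(mat)                          ! shallow copy of the one-element array
   !$acc enter data create(mat(1)%a, mat(1)%b, mat(1)%c) ! allocate and attach the members of mat(1)

   ! ... the two parallel loops, unchanged, referencing mat(1)%a, mat(1)%b, mat(1)%c ...

   !$acc exit data copyout(mat(1)%c)
   !$acc exit data delete(mat(1)%a, mat(1)%b)
   !$acc exit data delete(mat)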
Can you give us some feedback?
Thank you very much for your help; it is appreciated.
Stefano