The most efficient thing to do is to use batched cuBLAS. Ron Rahaman has several examples: https://github.com/RonRahaman/cublas-demos/tree/master/src
He also has some pure OpenACC examples as well (i.e no cuBLAS).
For your code, it would look something like:
% cat testmm.F90
PROGRAM TEST
#ifdef _OPENACC
USE openacc
#endif
IMPLICIT NONE
REAL(8),dimension(9,9,1000)::A,B,C
REAL :: tmp
Integer::n,i,j,k
A=1d0
B=1d0
C=0d0
!$acc parallel loop vector_length(32) copyin(A,B) copyout(C)
Do n=1,1000
#ifndef _OPENACC
C(:,:,n)=MATMUL(A(:,:,n),B(:,:,n))
#else
!$acc loop vector collapse(2)
do j=1,9
do i=1,9
tmp = 0.0
do k=1,9
tmp = tmp + a(i,k,n) * b(k,j,n)
enddo
c(i,j,n) = tmp
enddo
enddo
#endif
enddo
Print*,C(:,:,1)
END PROGRAM TEST
% nvfortran testmm.F90 -acc -Minfo=accel -fast
test:
15, Generating copyin(a(:,:,:)) [if not already present]
Generating copyout(c(:,:,:)) [if not already present]
Generating copyin(b(:,:,:)) [if not already present]
Generating Tesla code
16, !$acc loop gang ! blockidx%x
21, !$acc loop vector(32) collapse(2) ! threadidx%x
Interchanging generated strip mine loop outwards
22, ! threadidx%x collapsed
Interchanging generated vector loop outwards
Interchanging generated strip mine loop outwards
24, !$acc loop seq
21, Loop is parallelizable
22, Loop is parallelizable
24, Loop carried scalar dependence for tmp at line 25
Scalar last value needed after loop for tmp at line 27
% a.out
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000
9.000000000000000 9.000000000000000 9.000000000000000