I program a simple code to do the matrix multiplication. However, I added a loop outside of the kernel used to do the matrix multiply due to the further study. In this test, I would like to use OpenMP to parallel the outside loop and OpenACC to parallel the kernel loops. There are some errors in that code. Some one can help me to solve this problem?
Following is the code. Thank you very much.
program main
use accel_lib
integer :: n ! size of the vector
real,dimension(:,:),allocatable :: a ,b,c,c1
real,dimension(:),allocatable :: csum
integer :: i,j,k,kk,mk
integer :: t1,t2,thn
real :: diff,st,pt,speedup
!$ integer:: omp_get_num_threads
!$ integer:: omp_get_num_procs
!$ thn=omp_get_num_procs()
!$ write(*,*) "The number of available processors/threads in the system: ",thn
thn=1 ! when OpenMP is not used
!$ write(*,*) "Enter the number of threads"
!$ read(*,*) thn
!$ call omp_set_num_threads(thn) ! set the number of threads
!$call acc_init( acc_device_nvidia )
n =512
mk=16
allocate(a(n,n),b(n,n),c(n,n),c1(n,n),csum(mk))
do i=1,n
do j=1,n
a(i,j)=(i+j)/(i)
b(i,j)=2*(i+j)/(i)
end do
end do
call system_clock( count=t1 )
!$omp parallel do shared(n,a,b),private(c,kk)
do kk=1,mk !CPU processing
c=0.0d0
do i=1,n
do j=1,n
do k=1,n
c(i,j)=c(i,j)+a(i,k)*b(k,j)*kk
end do
end do
end do
csum(kk)=sum(c)
end do
!$omp end parallel do
write(*,*) csum
csum=0.0d0
call system_clock( count=t2 )
st= (t2-t1)/1.0d6
print *, 'CPU time: ', st, ' seconds'
call system_clock( count=t1 )
call acc_init( acc_device_nvidia )
!$omp parallel do shared(n,a,b),private(c1,kk)
do kk=1,mk !GPU processing
c1=0.0d0
call obj(n,a,b,c1,kk)
csum(kk)=sum(c1)
end do
!$omp end parallel do
write(*,*) csum
call system_clock( count=t2 )
pt=(t2-t1)/1.0d6
print *, 'GPU time: ', pt, ' seconds'
speedup=st/pt
print *, 'speedup: ', speedup
end program
subroutine obj(n,a,b,c1,kk)
implicit none
integer, intent(in)::n,kk
real, intent(in)::a(n,n),b(n,n)
real, intent(out)::c1(n,n)
integer::i,j,k
!$acc parallel loop
do j=1,n
do i=1,n
do k=1,n
c1(i,j)=c1(i,j)+a(i,k)*b(k,j)*kk
end do
end do
end do
!$acc end parallel loop
return
end subroutine obj