Hi Everyone,
I am a newbie in accelerator programming. I encounter a problem when I try to compare the execution time of a simple one-dimensional vector addition accelerated by OpenMP with that by OpenACC. To my surprise, the execution with OpenMP is far faster than that with OpenACC, no matter how big the array size is. In the case where the size of array is set to 2**26, OpenMP takes 73 (ms) while OpenACC needs to spend 396 (ms) to complete the same computation.
Can anyone tell me anything wrong in my code? I attached the code I used for this experiment. Please see below.
Thanks,
Li
subroutine saxpy_openmp(n,a,x,y)
implicit none
integer :: n,i
real, intent(in) :: x(n),a
real, intent(inout) :: y(n)
!$omp parallel do
do i=1,n
y(i)=a*x(i)+y(i)
enddo
!$omp end parallel do
end subroutine saxpy_openmp
subroutine saxpy(n,a,x,y)
implicit none
integer :: n, i
real, intent(in) :: x(n), a
real, intent(inout) :: y(n)
do i=1,n
y(i)=a*x(i)+y(i)
enddo
end subroutine saxpy
subroutine saxpy_openacc(m,a,x,y1)
implicit none
integer :: m, i
real :: x(m), a
real :: y1(m)
!$acc kernels loop present(x,y1)
do i=1,m
y1(i)=a*x(i)+y1(i)
enddo
end subroutine saxpy_openacc
program p
use lapack95
use blas95
use omp_lib
use accel_lib
implicit none
integer :: m=2**26 !don't set the power of 2 to exceed 26
real :: x(m),y1(m),y2(m),y3(m)
integer :: r1,r0
integer :: i,j
do i=1,m
y1(i)=1.0
y2(i)=1.0
y3(i)=1.0
x(i)=1.0
enddo
call system_clock(r0)
call saxpy_openmp(m,2.0,x,y2)
call system_clock(r1)
print*,' time: ',r1-r0
do i=1,10
print*,y2(i)
enddo
call system_clock(r0)
call saxpy(m,2.0,x,y3)
call system_clock(r1)
print*,' time: ',r1-r0
do i=1,10
print*,y3(i)
enddo
call acc_init( acc_device_nvidia )
call system_clock(r0)
!$acc data copy(x(:),y1(:))
call saxpy_openacc(m,2.0,x,y1)
!$acc end data
call system_clock(r1)
print*,' time: ',r1-r0
do i=1,10
print*,y1(i)
enddo
end program
-g -Bstatic -Mbackslash -mp -acc -I"C:\Program Files (x86)\Intel\Composer XE 2013\mkl\include" -I"C:\Program Files (x86)\Intel\Composer XE 2013\mkl\interfaces\lapack95\lapack95\include\intel64\lp64" -I"C:\Program Files (x86)\Intel\Composer XE 2013\mkl\interfaces\blas95\lib95\include\intel64\lp64" -I"c:\program files\pgi\win64\12.10\include" -I"C:\Program Files\PGI\Microsoft Open Tools 10\include" -I"C:\Program Files\PGI\Microsoft Open Tools 10\PlatformSDK\include" -I"C:\Program Files\PGI\win64\2012\cuda\4.2\include" -fastsse -Mipa=fast,inline -tp=bulldozer-64 -ta=nvidia,nowait,host -Minform=warn -Minfo=accel