c pgf90 -fopenmp -mp=gpu -Mcuda=cc80 -Minfo=all add2s2_omp.f -O2 -o add2s2_omp
!
! Fixed-form source (statements start in column 7).
!
! Module foo provides an OpenMP-offloaded AXPY-style update.  The
! driver program below applies that update, and the equivalent
! cublasSaxpy call, to arrays kept resident on the device by an
! enclosing target data region.
!
      module foo
      implicit none
      contains

!     Update a in place: a(i) = a(i) + c1*b(i) for i = 1..n.
!
!     a  (inout) vector of length n, updated in place
!     b  (in)    vector of length n
!     c1 (value) scalar multiplier applied to b
!     n  (value) number of elements to process
!
!     The loop is offloaded with "target teams loop".  No map
!     clauses are given here; the driver maps the arrays in an
!     enclosing target data region.
      subroutine add2s2_omp(a,b,c1,n)
!     n is declared first: under implicit none it must be typed
!     before it appears in the bounds of a and b.
      integer, value :: n
      real, value :: c1
      real, intent(inout) :: a(n)
      real, intent(in) :: b(n)
      integer :: i

!$omp target teams loop
      do i = 1, n
         a(i) = a(i) + c1*b(i)
      enddo
      return
      end subroutine add2s2_omp

      end module foo

!     Driver: for k = 2..m accumulate
!        xbar = xbar + alpha(k)*xx(:,k)   (cublasSaxpy)
!        bbar = bbar + alpha(k)*bb(:,k)   (add2s2_omp)
!        b    = b    - alpha(k)*bb(:,k)   (add2s2_omp)
!     and print the first element of each result.
!
!     The program name differs from the module procedure
!     add2s2_omp that it imports; giving both the same name is
!     rejected by some compilers.
      program add2s2_omp_driver
      use foo, only: add2s2_omp
      use cublas
      implicit none

!     m = number of columns in xx/bb, n = vector length.
      integer, parameter :: m = 10
      integer, parameter :: n = 4669440

      integer :: k
      real, dimension(:), allocatable :: xbar, bbar, b, alpha
      real, dimension(:,:), allocatable :: xx, bb

      allocate(xbar(n))
      allocate(bbar(n))
      allocate(b(n))
      allocate(xx(n,m))
      allocate(bb(n,m))
!     One coefficient per column.  The loop below reads alpha(k)
!     for k = 2..m, so alpha needs m elements, not 3.
      allocate(alpha(m))

      xbar = 1.1
      bbar = 2.2
      b = 3.3
      xx = 3.5
      bb = 2.1

!     alpha(k) = 0.1*(k+1): continues the sequence 0.2, 0.3, 0.4
!     so that every coefficient used by the loop is defined.
      do k = 1, m
         alpha(k) = 0.1*real(k + 1)
      enddo

!     alpha is deliberately neither mapped nor listed in
!     use_device_ptr: alpha(k) is evaluated on the host at each
!     call (it is passed by value to add2s2_omp), so it must keep
!     referring to host memory inside this region.
!     NOTE(review): this relies on the cublas module accepting a
!     host scalar for the saxpy coefficient - confirm against the
!     NVIDIA Fortran CUDA interfaces documentation.
!$omp target data map(tofrom:xbar,bbar,b) map(to:xx,bb)
!$omp& use_device_ptr(xbar,xx)
      do k = 2, m
!        OpenMP equivalent of the cuBLAS call below:
!        call add2s2_omp(xbar,xx(:,k),alpha(k),n)
         call cublasSaxpy(n, alpha(k), xx(:,k), 1, xbar, 1)
         call add2s2_omp(bbar, bb(:,k), alpha(k), n)
         call add2s2_omp(b, bb(:,k), -alpha(k), n)
      enddo
!$omp end target data

      print *, xbar(1),bbar(1),b(1)

      deallocate(xbar, bbar, b, xx, bb, alpha)

      end program add2s2_omp_driver