! pgf90 -fopenmp -mp=gpu -Mcuda=cc80 -Minfo=all add2s2_omp.f -O2 -o add2s2_omp
!
! Driver that mixes an OpenMP target-offloaded AXPY (add2s2_omp) with
! the legacy cuBLAS Fortran wrappers (cublas_Saxpy) in one program.
! The layout is valid in both fixed and free source form.

      module foo
      implicit none
      contains

!     Compute a(i) = a(i) + c1*b(i) for i = 1..n inside an OpenMP
!     TARGET TEAMS LOOP region.
!       a  : vector updated in place
!       b  : vector that is scaled and added
!       c1 : scalar multiplier (passed by value)
!       n  : number of elements (passed by value)
      subroutine add2s2_omp(a,b,c1,n)
      integer, value :: n
      real, value :: c1
      real, intent(inout) :: a(n)
      real, intent(in) :: b(n)
      integer :: i

!$OMP TARGET TEAMS LOOP
      do i = 1,n
         a(i) = a(i) + c1*b(i)
      enddo

      return
      end subroutine add2s2_omp

      end module foo

!     The program name differs from the module procedure on purpose:
!     a main program may not share its name with an entity it
!     imports by use association.
      program add2s2_omp_driver
      use foo
      use cublas
      use, intrinsic :: iso_c_binding, only: c_intptr_t
      implicit none

!     Size in bytes of a default real, as expected by the cuBLAS
!     wrapper calls and used for device pointer arithmetic.
      integer, parameter :: sizeof_real = 4

      integer k, m, n
      real, dimension(:), allocatable :: xbar,bbar,b,alpha
      real, dimension(:,:), allocatable :: xx, bb

!     Device addresses handed out by cublas_Alloc. They must be
!     pointer-sized: a default integer is too small on 64-bit
!     platforms.
      integer(c_intptr_t) :: xbar_d, xx_d, col_d

      external cublas_init
      external cublas_shutdown

      m = 10
      n = 4669440

      allocate(xbar(n))
      allocate(bbar(n))
      allocate(b(n))
      allocate(xx(n,m))
      allocate(bb(n,m))
!     One coefficient per column: the update loop reads alpha(2:m).
      allocate(alpha(m))

      xbar = 1.1
      bbar = 2.2
      b = 3.3
      alpha(1) = 0.2
      alpha(2) = 0.3
      alpha(3) = 0.4
!     Remaining coefficients continue the same 0.1 step.
      do k = 4,m
         alpha(k) = 0.1*real(k+1)
      enddo
      xx = 3.5
      bb = 2.1

      call cublas_init

!     Device copy of xbar.
      call cublas_Alloc(n,sizeof_real,xbar_d)
      call cublas_Set_Matrix(n,1,sizeof_real,xbar,n,xbar_d,n)

!     Device copy of columns 2..m of xx, so that column k starts
!     (k-2)*n elements past xx_d.
      call cublas_Alloc(n*(m-1),sizeof_real,xx_d)
      call cublas_Set_Matrix(n,m-1,sizeof_real,xx(1,2),n,xx_d,n)

!!$OMP TARGET DATA MAP(TOFROM:xbar,bbar,b) MAP(TO:xx,bb)
      do k = 2,m
!        Device-side counterpart of
!        call add2s2_omp(xbar,xx(:,k),alpha(k),n)
!        The offset to column k is computed in bytes.
         col_d = xx_d + int(k-2,c_intptr_t)*n*sizeof_real
         call cublas_Saxpy(n, alpha(k), col_d, 1, xbar_d, 1)
         call add2s2_omp(bbar,bb(:,k),alpha(k),n)
         call add2s2_omp(b,bb(:,k),-alpha(k),n)
      enddo
!!$OMP END TARGET DATA

!     Bring the cuBLAS result back to the host before printing.
      call cublas_Get_Matrix(n,1,sizeof_real,xbar_d,n,xbar,n)

      write(*,*) "xbar(1): ", xbar(1)
      write(*,*) "bbar(1): ", bbar(1)
      write(*,*) "b(1): ", b(1)

      deallocate(xbar)
      deallocate(bbar)
      deallocate(b)
      deallocate(xx)
      deallocate(bb)
      deallocate(alpha)

      call cublas_Free(xbar_d)
      call cublas_Free(xx_d)
      call cublas_shutdown

      end program add2s2_omp_driver