Hi Unrue,
The problem is with “alpha(k)”. Because you put “alpha” in the use_device_ptr clause, “alpha” refers to its device address inside the region, so evaluating “alpha(k)” on the host dereferences a device pointer, hence the segv. Also, “alpha” is accessed out of bounds, since its size is 3 but it is indexed up to “m” (10).
% cat add2s2_omp_cublas.f
c pgf90 -fopenmp -mp=gpu -Mcuda=cc80 -Minfo=all add2s2_omp.f -O2 -o add2s2_omp
! Module foo: scaled vector update used by the driver program.
      module foo
      implicit none
      contains

! add2s2_omp: in-place update a(i) = a(i) + c1*b(i) for i = 1..n.
!   a   (inout) vector of length n, updated in place
!   b   (in)    vector of length n
!   c1  (value) scalar multiplier applied to b
!   n   (value) number of elements to process
! The loop carries an OpenMP TARGET TEAMS LOOP directive; without
! OpenMP enabled the directive is a plain comment and the loop runs
! serially on the host.
      subroutine add2s2_omp(a,b,c1,n)
! n is declared before the arrays because their bounds depend on it
! and implicit typing is switched off.
      integer, value :: n
      real, value :: c1
      real, intent(inout) :: a(n)
      real, intent(in) :: b(n)
      integer :: i
!$OMP TARGET TEAMS LOOP
      do i = 1, n
         a(i) = a(i) + c1*b(i)
      enddo
      return
      end subroutine add2s2_omp
      end module foo
! Driver: for k = 2..m, accumulates alpha(k)*xx(:,k) into xbar with
! cublasSaxpy and +/- alpha(k)*bb(:,k) into bbar and b with the
! OpenMP routine add2s2_omp, then prints the first element of each.
! The program name differs from the module procedure add2s2_omp so
! the use-associated name does not clash with the program name.
      program add2s2_omp_cublas
      use foo
      use cublas
      implicit none
      integer k, m, n, incy, incx
      real, dimension(:), allocatable :: xbar,bbar,b
      real, dimension(:,:), allocatable :: xx, bb
      real, dimension(:), allocatable:: alpha
      real :: alpha_d

      m = 10
      n = 4669440

      allocate(xbar(n))
      allocate(bbar(n))
      allocate(b(n))
      allocate(xx(n,m))
      allocate(bb(n,m))
! alpha is indexed up to m in the loop below, so it holds m entries.
      allocate(alpha(m))

      xbar = 1.1
      bbar = 2.2
      b = 3.3
      alpha = 0.1
      alpha(1) = 0.2
      alpha(2) = 0.3
      alpha(3) = 0.4
      xx = 3.5
      bb = 2.1
      incx = 1
      incy = 1

! Only xbar and xx appear in use_device_ptr: inside the region they
! refer to device addresses, which is what cublasSaxpy is handed.
! alpha is deliberately NOT listed, so alpha(k) is still read from
! host memory; listing it would make the host dereference a device
! pointer.
!$OMP TARGET DATA MAP(TOFROM:xbar,bbar,b) MAP(TO:xx,bb,alpha)
!$OMP& use_device_ptr(xbar,xx)
      do k = 2,m
! Copy the coefficient into a host scalar before the cuBLAS call.
         alpha_d = alpha(k)
         call cublasSaxpy(n, alpha_d, xx(:,k), incx, xbar, incy)
         call add2s2_omp(bbar,bb(:,k),alpha(k),n)
         call add2s2_omp(b,bb(:,k),-alpha(k),n)
      enddo
!$OMP END TARGET DATA

      print *, xbar(1),bbar(1),b(1)

      deallocate(xbar)
      deallocate(bbar)
      deallocate(b)
      deallocate(xx)
      deallocate(bb)
      deallocate(alpha)
      end program add2s2_omp_cublas
% nvfortran -mp=gpu add2s2_omp_cublas.f -cudalib=cublas ; a.out
6.000000 5.140000 0.3599997
-Mat