cuSPARSE Library with OpenACC data Directives: cusparseDnVecGetValues not resolvable

Hello,

I wrote a small program to learn how to use the cuSPARSE library together with OpenACC directives. It consists of two files: a main program, and a second file containing the subroutine that the main program calls. Compiling it produces this error:
NVFORTRAN-S-0155-Could not resolve generic procedure cusparsednvecgetvalues (csrsymmv.f90: 93)
0 inform, 0 warnings, 1 severes, 0 fatal for csrsymmv

I expect to get the result of the multiplication in the vector V(:,K) on each loop iteration. If I leave out the cusparseDnVecGetValues call, the code compiles without error.

If you have any other recommendations for the code, they would be helpful.

The main file:

program main

  ! Driver for CSRSYMMV: builds a 5x5 sparse matrix in one-based CSR form and
  ! a dense 5x3 matrix Y, then passes both to CSRSYMMV.

  implicit none

  ! Kind of DOUBLE PRECISION, which is how CSRSYMMV declares A and Y.
  integer, parameter :: dp = kind(1.0d0)

  integer, parameter :: n   = 5    ! rows of Y (= order of the square CSR matrix)
  integer, parameter :: p   = 3    ! columns of Y
  ! Number of stored nonzeros. It must agree with the length of A and JA and
  ! with the last row pointer (IA(n+1) - 1 = 11); the value 9 used previously
  ! did not.
  integer, parameter :: nna = 11

  real(dp) :: y(n, p)              ! dense matrix multiplied by the CSR matrix
  real(dp) :: a(nna)               ! CSR nonzero values, row by row
  integer  :: ia(n + 1)            ! CSR row pointers (one-based)
  integer  :: ja(nna)              ! CSR column indices (one-based)

  ! Dense right-hand matrix, filled one row at a time.
  y(1, :) = [1.0_dp, 2.0_dp, 3.0_dp]
  y(2, :) = [0.0_dp, 1.0_dp, 5.0_dp]
  y(3, :) = [1.0_dp, 4.0_dp, 9.0_dp]
  y(4, :) = [1.0_dp, 1.0_dp, 0.0_dp]
  y(5, :) = [0.0_dp, 2.0_dp, 8.0_dp]

  ! 5x5 matrix described by the CSR arrays below:
  !   | 1 4 0 0 0 |
  !   | 0 2 3 0 0 |
  !   | 5 0 0 7 8 |
  !   | 0 0 9 0 6 |
  !   | 8 0 0 3 0 |
  a  = [1.0_dp, 4.0_dp, 2.0_dp, 3.0_dp, 5.0_dp, 7.0_dp, &
        8.0_dp, 9.0_dp, 6.0_dp, 8.0_dp, 3.0_dp]
  ia = [1, 3, 5, 8, 10, 12]
  ja = [1, 2, 2, 3, 1, 4, 5, 3, 5, 1, 4]

  call csrsymmv(n, p, nna, ia, ja, a, y)

end program main

The other file:

SUBROUTINE CSRSYMMV(N,P,NNA,IA,JA,A,Y)

! Multiplies a sparse matrix held in one-based CSR form (values A, row
! pointers IA, column indices JA) by each column Y(:,K) of the dense matrix Y,
! one column per loop iteration, using the cuSPARSE generic SpMV routines
! inside an OpenACC data region. The product for column K is directed to V(:,K).
!
! Arguments:
!   N    rows of Y; also passed as both the row and the column count of the CSR matrix
!   P    columns of Y, i.e. the number of products computed
!   NNA  number of stored nonzero values
!   IA   CSR row pointers, length N+1
!   JA   CSR column indices, length NNA
!   A    CSR nonzero values, length NNA
!   Y    dense N x P input matrix
!
! V is a local N x P array, not a dummy argument, so this routine does not
! hand the products back to its caller.
!
! NOTE(review): this is the first posted version. It is the one reported as
! failing to compile at the cusparseDnVecGetValues call; see the notes at the
! data region and at that call.


USE openacc
USE cusparse

! OpenMP conditional-compilation line (only seen when OpenMP is enabled).
!$ USE OMP_LIB

IMPLICIT NONE
	
type(cusparseHandle) :: h
type(cusparseMatDescr) :: descrA
! NOTE(review): the declarations between #if and #endif exist only when
! CUDA_VERSION >= 11000 at preprocessing time, yet bsize, buffer, matA, vecY
! and vecV are used unconditionally below - confirm the build defines it.
#if CUDA_VERSION >= 11000	
integer(8) :: bsize
type(c_devptr) :: buffer
type(cusparseSpMatDescr) :: matA
type(cusparseDnVecDescr) :: vecY, vecV
#endif

! Return code of the most recent cuSPARSE call.
integer :: status	

! Scalar arguments passed to cusparseSpMV_buffersize and cusparseSpMV.
REAL(8) :: alpha, beta

INTEGER N                   ! Rows of Y
INTEGER P                   ! Columns of Y

INTEGER NNA                 ! # of non-zero values

! sparse CSR arrays 
INTEGER IA(N+1)             ! csrRowPtrA
INTEGER JA(NNA)             ! csrColIndA
DOUBLE PRECISION A(NNA)     

! Column index of Y / V handled in the current loop iteration.
INTEGER K
	
DOUBLE PRECISION Y(N,P)
! Local result array; column K is attached to the vecV descriptor.
DOUBLE PRECISION V(N,P)

! Initialize cuSPARSE and a matrix descriptor. Only the cusparseCreate status
! is checked. descrA is configured here but is not referenced again in this
! routine.
status = cusparseCreate(h)
if (status /= CUSPARSE_STATUS_SUCCESS) &
  write(*,*) 'cusparseCreate error: ', status
status = cusparseCreateMatDescr(descrA)
status = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)
status = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)
! Give cuSPARSE the CUDA stream OpenACC reports for acc_async_sync.
status = cusparseSetStream(h, acc_get_cuda_stream(acc_async_sync))

alpha = 1.0
beta  = 0.0

! Disabled OpenMP parallelisation of the K loop (double '!' = plain comment).
!!$OMP PARALLEL DO DEFAULT(NONE) &
!!$OMP PRIVATE(K) &
!!$OMP SHARED(A,IA,JA,N,P,V,Y)

! NOTE(review): vecV appears in both the create and the copyout clause, and the
! descriptors vecY/vecV are listed as data alongside the arrays - confirm intent.
! NOTE(review): per the reply in this thread, the cuSPARSE routines expect
! device pointers, obtained with '!$acc host_data use_device(...)'. This
! version has no host_data region around the cuSPARSE calls.
!$acc data copyin(IA, JA, A, Y) create(vecY, vecV, V) copyout(vecV)
DO K = 1,P
  
  write(*,*) "K", K
  
  ! vecY = vecX in the documentation
  ! vecV = vecY in the documentation
  
  ! Descriptor for the input (multiplier) vector: column K of Y.
  status = cusparseCreateDnVec(vecY, N, Y(:,K), CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnVec: ",status
  
  ! Descriptor for the result vector: column K of V.
  status = cusparseCreateDnVec(vecV, N, V(:,K), CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnVec: ",status
  
  ! N x N CSR descriptor with 32-bit, one-based indices. It is rebuilt from the
  ! same IA/JA/A on every iteration.
  status = cusparseCreateCsr(matA, N, N, NNA, IA, JA, A, &
                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, &
                     CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) print *,"cusparseCreateCsr: ",status
  
  ! Query the workspace size for the SpMV call below.
  status = cusparseSpMV_buffersize(h, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, &
                  matA, vecY, beta, vecV, CUDA_R_64F, CUSPARSE_CSRMV_ALG1, bsize)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) print *,"cusparseSpMV_buffersize: ",status
  
  print *,"SpMV buffersize required: ",bsize
  ! buffer is allocated only when bsize > 0, but is passed to cusparseSpMV
  ! in either case.
  IF (bsize .GT. 0) buffer = acc_malloc(bsize)
  
  ! Non-transposed sparse matrix-vector product with matA, vecY (input) and
  ! vecV (output).
  status = cusparseSpMV(h, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, &
                 matA, vecY, beta, vecV, CUDA_R_64F, CUSPARSE_CSRMV_ALG1, buffer) 
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseSpMV: ",status	  
  
  ! NOTE(review): this is the call reported as "Could not resolve generic
  ! procedure cusparsednvecgetvalues". Per the reply in this thread, the second
  ! argument is expected to be a type(c_devptr); V(:,K) is a DOUBLE PRECISION
  ! array section.
  status = cusparseDnVecGetValues(vecV, V(:,K))
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseDnVecGetValues: ",status
  
  IF (bsize.gt.0) CALL acc_free(buffer)
  
!  CALL MKL_DCSRSYMV('U', N, A, IA, JA, Y(:,K), V(:,K))
END DO
!$acc end data

! Nothing in this routine destroys h, descrA, matA, vecY or vecV.

!!$OMP END PARALLEL DO

END SUBROUTINE CSRSYMMV

The Makefile:

################################################################################
#
# Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
#
# Please refer to the NVIDIA end user license agreement (EULA) associated
# with this source code for terms and conditions that govern your use of
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
#
################################################################################

# TEST is the main program source (document.f90); ADD is the file that holds
# the CSRSYMMV subroutine (csrsymmv.f90).
TEST    = document
ADD     = csrsymmv
FC      = nvfortran
EXE     = exe
FCFLAGS = -Mpreprocess -fast -acc=gpu -cuda -Mcudalib=cusparse

# None of these targets names a file that its recipe creates.
.PHONY: all build run verify clean

all: build run verify

# $< is the first prerequisite, i.e. $(TEST).f90. Recipe lines must start
# with a TAB character.
build: $(TEST).f90 $(ADD).f90
	$(FC) $(FCFLAGS) -o $(TEST).$(EXE) $(ADD).f90 $<

# RUN may be set on the command line to prefix the executable with a launcher.
run: $(TEST).$(EXE)
	$(RUN) ./$(TEST).$(EXE)

verify:

clean:
	@echo 'Cleaning up...'
	@rm -rf *.$(EXE) *.dwf *.pdb *.mod prof

Thanks,
Y

Hi, I see a few issues here. If you look at the example we ship, you will see that you need to use the OpenACC host_data use_device directive. The cuSPARSE routines expect device pointers, and that directive is how you pass the device address associated with a host symbol in OpenACC. As for cusparseDnVecGetValues(), it expects the second argument to be a type(c_devptr). This is a pretty generic function, and the type of the data can vary. It is probably easier to just read the values directly from the array that is associated with vecV. Our Fortran CUDA interfaces document lists the API calls and arguments for cuSPARSE.

I did not quite understand how to read the values directly from the array, but I tried printing them right after the '!$acc wait' directive, as you do in the example. When I compiled and ran it that way, the output was:

SpMV buffersize required: 0
0.00 0.00 0.00 0.00 0.00
SpMV buffersize required: 0
0.00 0.00 0.00 0.00 0.00
SpMV buffersize required: 0
0.00 0.00 0.00 0.00 0.00

which is different from my own calculation.
Can you give an example, if this is not what you meant?

I’m assuming you are looking at our examples, such as tcusparse3.f90 and tcusparse4.f90. In those examples, there are lines which make the device addresses available to cusparse, like these:
!$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, x, y)

1 Like

Yes, I used tcusparse4.f90. I added that line. I did not understand how to get the values. The new version below:

SUBROUTINE CSRSYMMV(N,P,NNA,IA,JA,A,Y)

! Multiplies a sparse matrix held in one-based CSR form (values A, row
! pointers IA, column indices JA) by each column Y(:,K) of the dense matrix Y,
! one column per loop iteration, using the cuSPARSE generic SpMV routines
! inside an OpenACC data region with a host_data region around the cuSPARSE
! calls. The product for column K is directed to V(:,K) and then printed.
!
! Arguments:
!   N    rows of Y; also passed as both the row and the column count of the CSR matrix
!   P    columns of Y, i.e. the number of products computed
!   NNA  number of stored nonzero values
!   IA   CSR row pointers, length N+1
!   JA   CSR column indices, length NNA
!   A    CSR nonzero values, length NNA
!   Y    dense N x P input matrix
!
! V is a local N x P array, not a dummy argument, so this routine does not
! hand the products back to its caller.
!
! NOTE(review): this is the second posted version, the one reported as
! printing all zeros; see the note at the write statement inside the loop.


USE openacc
USE cusparse

! OpenMP conditional-compilation line (only seen when OpenMP is enabled).
!$ USE OMP_LIB

IMPLICIT NONE
	
type(cusparseHandle) :: h
type(cusparseMatDescr) :: descrA
! NOTE(review): the declarations between #if and #endif exist only when
! CUDA_VERSION >= 11000 at preprocessing time, yet bsize, buffer, matA, vecY
! and vecV are used unconditionally below - confirm the build defines it.
#if CUDA_VERSION >= 11000	
integer(8) :: bsize
type(c_devptr) :: buffer
type(cusparseSpMatDescr) :: matA
type(cusparseDnVecDescr) :: vecY, vecV
#endif

! Return code of the most recent cuSPARSE call.
integer :: status	

! Scalar arguments passed to cusparseSpMV_buffersize and cusparseSpMV.
REAL(8) :: alpha, beta

INTEGER N                   ! Rows of Y
INTEGER P                   ! Columns of Y

INTEGER NNA                 ! # of non-zero values

! sparse CSR arrays 
INTEGER IA(N+1)             ! csrRowPtrA
INTEGER JA(NNA)             ! csrColIndA
DOUBLE PRECISION A(NNA)     

! Column index of Y / V handled in the current loop iteration.
INTEGER K
	
DOUBLE PRECISION Y(N,P)
! Local result array; column K is attached to the vecV descriptor.
DOUBLE PRECISION V(N,P)

! Initialize cuSPARSE and a matrix descriptor. Only the cusparseCreate status
! is checked. descrA is configured here but is not referenced again in this
! routine.
status = cusparseCreate(h)
if (status /= CUSPARSE_STATUS_SUCCESS) &
  write(*,*) 'cusparseCreate error: ', status
status = cusparseCreateMatDescr(descrA)
status = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)
status = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)
! Give cuSPARSE the CUDA stream OpenACC reports for acc_async_sync.
status = cusparseSetStream(h, acc_get_cuda_stream(acc_async_sync))

alpha = 1.0
beta  = 0.0

! Disabled OpenMP parallelisation of the K loop (double '!' = plain comment).
!!$OMP PARALLEL DO DEFAULT(NONE) &
!!$OMP PRIVATE(K) &
!!$OMP SHARED(A,IA,JA,N,P,V,Y)

! NOTE(review): V is named in both the create and the copyout clause - confirm
! that a single copyout(V) is what is intended.
!$acc data copyin(IA, JA, A, Y) create(V) copyout(V)
DO K = 1,P
  
  !write(*,*) "K", K
  
  ! vecY = vecX in the documentation
  ! vecV = vecY in the documentation
  
  ! Expose device addresses to the cuSPARSE calls up to 'end host_data'.
  ! NOTE(review): the example line quoted in this thread lists whole variables
  ! in use_device; here the array sections Y(:,K) and V(:,K) are listed -
  ! confirm the compiler accepts sections in this clause.
  !$acc host_data use_device(A,IA,JA,Y(:,K),V(:,K))
  ! Descriptor for the input (multiplier) vector: column K of Y.
  status = cusparseCreateDnVec(vecY, N, Y(:,K), CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnVec: ",status
  
  ! Descriptor for the result vector: column K of V.
  status = cusparseCreateDnVec(vecV, N, V(:,K), CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnVec: ",status
  
  ! N x N CSR descriptor with 32-bit, one-based indices. It is rebuilt from the
  ! same IA/JA/A on every iteration.
  status = cusparseCreateCsr(matA, N, N, NNA, IA, JA, A, &
                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, &
                     CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) print *,"cusparseCreateCsr: ",status
  
  ! Query the workspace size for the SpMV call below.
  status = cusparseSpMV_buffersize(h, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, &
                  matA, vecY, beta, vecV, CUDA_R_64F, CUSPARSE_CSRMV_ALG1, bsize)
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) print *,"cusparseSpMV_buffersize: ",status
  
  print *,"SpMV buffersize required: ",bsize  
  ! buffer is allocated only when bsize > 0, but is passed to cusparseSpMV
  ! in either case.
  IF (bsize .GT. 0) buffer = acc_malloc(bsize)
  
  ! Non-transposed sparse matrix-vector product with matA, vecY (input) and
  ! vecV (output).
  status = cusparseSpMV(h, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, &
                 matA, vecY, beta, vecV, CUDA_R_64F, CUSPARSE_CSRMV_ALG1, buffer) 
  IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseSpMV: ",status	  
  
  ! Disabled: the call that failed to resolve in the first version.
  !status = cusparseDnVecGetValues(vecV, V(:,K))
  !IF (status.ne.CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseDnVecGetValues: ",status
  
  IF (bsize.gt.0) CALL acc_free(buffer)
  !$acc end host_data
  
  !$acc wait
  ! NOTE(review): this prints the host copy of V while still inside the data
  ! region. Per the reply in this thread, V is a copyout array and is printed
  ! before the 'end data' directive is reached; the suggested remedies are to
  ! print after 'end data' or to add an update host directive before printing.
  write(*,'(5(1x,f7.2))') V(:,K)
  
!  CALL MKL_DCSRSYMV('U', N, A, IA, JA, Y(:,K), V(:,K))
END DO
!$acc end data

! Nothing in this routine destroys h, descrA, matA, vecY or vecV.


!!$OMP END PARALLEL DO

END SUBROUTINE CSRSYMMV

Either I made a mistake in applying the method, or write(*,'(5(1x,f7.2))') V(:,K) is not a way to read the values directly, because it printed nothing but zeros.
Is this the approach you mentioned at first?

The array V is copyout, but you are printing it before you hit the end data directive. Try printing it after that. Or, do an update host directive to move it back to the host.