Problems in the use of cusparseSpGEMM in CUDA Fortran

Thanks for your continued replies, @qanhpham !

I followed the sample code (https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_mem/spgemm_mem_example.c) and tried ALG3, but compilation already fails at the first call, cusparseSpGEMM_workEstimation, with the following errors:

NVFORTRAN-S-0155-Could not resolve generic procedure cusparsespgemm_workestimation
NVFORTRAN-S-0038-Symbol, cusparse_spgemm_alg3, has not been explicitly declared

The only change I made was replacing CUSPARSE_SPGEMM_DEFAULT with CUSPARSE_SPGEMM_ALG3.
The same code compiles without any problem when CUSPARSE_SPGEMM_DEFAULT is used.
What should I do?
Thanks.

This is my code with ALG3.

subroutine SpGEMM_ALG3
  !! Work-in-progress driver for a sparse*sparse product with cuSPARSE SpGEMM
  !! using CUSPARSE_SPGEMM_ALG3.
  !!
  !! What is active:
  !!   1. two 4x4 CSR matrices A (9 nonzeros) and B (8 nonzeros) are defined
  !!      on the host with one-based indices and copied to the device;
  !!   2. a cuSPARSE handle, CSR descriptors for A, B and an empty C, and an
  !!      SpGEMM descriptor are created;
  !!   3. cusparseSpGEMM_workEstimation is called once to obtain bufferSize1,
  !!      and buffer1 is allocated with that size;
  !!   4. every resource created here is released again.
  !!
  !! The remaining stages (second workEstimation call, estimateMemory,
  !! compute, copy, read-back of C) are kept commented out below.
  !!
  !! The subroutine takes no arguments; it reports cuSPARSE failures with
  !! print statements and does not stop on error.

use cudafor
use cusparse

Implicit none

  !!Define Matrix----------------------------------
  Integer,parameter :: A_rows=4
  Integer,parameter :: A_cols=4
  Integer,parameter :: A_nnz=9
  Integer           :: Arow(A_rows+1)
  Integer           :: Acol(A_nnz)
  Real(8)           :: Aval(A_nnz)
  Integer,device    :: Arow_d(A_rows+1)
  Integer,device    :: Acol_d(A_nnz)
  Real(8),device    :: Aval_d(A_nnz)

  Integer,parameter :: B_rows=4
  Integer,parameter :: B_cols=4
  Integer,parameter :: B_nnz=8
  Integer           :: Brow(B_rows+1)
  Integer           :: Bcol(B_nnz)
  Real(8)           :: Bval(B_nnz)
  Integer,device    :: Brow_d(B_rows+1)
  Integer,device    :: Bcol_d(B_nnz)
  Real(8),device    :: Bval_d(B_nnz)

  !! Result matrix C; only Crow_d is allocated by the active code.
  Integer   :: C_rows
  Integer   :: C_cols
  Integer   :: C_nnz
  Integer,allocatable :: Crow(:)
  Integer,allocatable :: Ccol(:)
  Real(8),allocatable :: Cval(:)
  Integer,allocatable,device  :: Crow_d(:)
  Integer,allocatable,device  :: Ccol_d(:)
  Real(8),allocatable,device  :: Cval_d(:)

  !! Reference dimensions for C (declared but not yet filled in or compared).
  Integer,parameter   :: C_rows_true=4
  Integer,parameter   :: C_cols_true=4
  Integer,parameter   :: C_nnz_true=12
  Integer :: Crow_true(C_rows_true+1)
  Integer :: Ccol_true(C_nnz_true)
  Real(8) :: Cval_true(C_nnz_true)
  !!Define Matrix----------------------------------

  !! Scalars are assigned in the executable part: initialising them in the
  !! declaration would give them the implicit SAVE attribute.
  Real(8) :: alpha,beta

  !! 64-bit receivers for cusparseSpMatGetSize (used by the commented code).
  Integer(8)   :: C_rows_dbl
  Integer(8)   :: C_cols_dbl
  Integer(8)   :: C_nnz_dbl

  !! istat holds CUDA runtime return codes, status holds cuSPARSE ones.
  Integer :: istat,status
  type(cusparseHandle) :: handle
  type(cusparseSpMatDescr) :: matA,matB,matC
  type(cusparseSpGEMMDescr) :: SpGEMMDesc

  Integer(8) :: bufferSize1
  Integer(1),pointer,device :: buffer1(:)

  Integer(8) :: bufferSize2
  Integer(1),pointer,device :: buffer2(:)

  !! ALG3
  Integer(8) :: bufferSize3
  Integer(1),pointer,device :: buffer3(:)

  Integer(8) :: num_prods
  !! NOTE(review): the C API declares chunk_fraction as float; confirm whether
  !! the Fortran interface of cusparseSpGEMM_estimateMemory expects Real(4).
  Real(8) :: chunk_fraction
  !! ALG3

  alpha=1d0
  beta=0d0
  chunk_fraction=0.2d0

  !! Give every buffer pointer a defined (disassociated) state so the
  !! associated() guards in the cleanup section are valid.
  nullify(buffer1)
  nullify(buffer2)
  nullify(buffer3)

  !!Define Matrix----------------------------------
  Arow=(/1,4,5,8,10/)
  Acol=(/1,3,4,2,1,3,4,2,4/)
  Aval=(/1d0,2d0,3d0,4d0,5d0,6d0,7d0,8d0,9d0/)

  Brow=(/1,3,5,8,9/)
  Bcol=(/1,4,2,4,1,2,3,2/)
  Bval=(/1d0,2d0,3d0,4d0,5d0,6d0,7d0,8d0/)

  !! Host -> device copies, bracketed by explicit synchronisation.
  istat=cudaDeviceSynchronize()
  Arow_d=Arow
  Acol_d=Acol
  Aval_d=Aval
  Brow_d=Brow
  Bcol_d=Bcol
  Bval_d=Bval
  istat=cudaDeviceSynchronize()
  !!Define Matrix----------------------------------

  !! C has A_rows rows, so its row-offset array has A_rows+1 entries.
  allocate(Crow_d(A_rows+1))


  ! initialize CUSPARSE and matrix descriptors
  status=cusparseCreate(handle)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseCreate error: ', status

  status=cusparseCreateCsr(matA,A_rows,A_cols,A_nnz, &
                           ARow_d,ACol_d,Aval_d,  &
                           CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, &
                           CUSPARSE_INDEX_BASE_ONE,CUDA_R_64F)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseCreateCsr error: ', status

  status=cusparseCreateCsr(matB,B_rows,B_cols,B_nnz, &
                          BRow_d,BCol_d,Bval_d,  &
                          CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, &
                          CUSPARSE_INDEX_BASE_ONE,CUDA_R_64F)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseCreateCsr error: ', status

  !! C starts with zero nonzeros and no storage attached.
  status=cusparseCreateCsr(matC,A_rows,B_cols,0, &
                           null(),null(),null(),  &
                           CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, &
                           CUSPARSE_INDEX_BASE_ONE,CUDA_R_64F)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseCreateCsr error: ', status

  !! CUDA runtime result goes into istat so the cuSPARSE status is not clobbered.
  istat=cudaDeviceSynchronize()

  !!----------------------------------------------------------------------------------------------------

  !!SpGEMM computation
  status=cusparseSpGEMM_createDescr(SpGEMMDesc)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_CreateDescr error: ', status


  !! ask bufferSize1 bytes for external memory
  !! (buffer1 is still disassociated here, so this call only queries the size).
  !! bufferSize1 is preset so it is defined even if the call fails.
  !! NOTE(review): this needs a cusparse module that exports
  !! CUSPARSE_SPGEMM_ALG3; if the compiler reports the symbol as undeclared,
  !! check the compiler/CUDA version. CUSPARSE_SPGEMM_DEFAULT is the fallback.
  bufferSize1=0
  status=cusparseSpGEMM_workEstimation(handle,&
                                       CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,&
                                       alpha,matA,matB,beta,matC,&
                                       CUDA_R_64F,CUSPARSE_SPGEMM_ALG3,&
                                       SpGEMMDesc,bufferSize1,buffer1)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_workEstimation error: ', status

  istat=cudaDeviceSynchronize()
  print *, "bufferSize1=",bufferSize1

  !! Only allocate when the size query succeeded and asked for memory.
  if(status==CUSPARSE_STATUS_SUCCESS .and. bufferSize1 > 0) allocate(buffer1(bufferSize1))

!!  !! inspect the A and B to understand the memory requirement for
!!  !! the next step
!!  status=cusparseSpGEMM_workEstimation(handle,&
!!                                       CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,&
!!                                       alpha,matA,matB,beta,matC,&
!!                                       CUDA_R_64F,CUSPARSE_SPGEMM_ALG3,&
!!                                       SpGEMMDesc,bufferSize1,buffer1)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_workEstimation error: ', status
!!
!!  !!ALG3--------------------------------------
!!
!!  status=cusparseSpGEMM_getNumProducts(SpGEMMDesc,num_prods)
!!
!!  !! ask bufferSize3 bytes for external memory
!!  !! NOTE(review): the referenced C sample appears to allocate buffer3 with
!!  !! bufferSize3 bytes and call estimateMemory a second time to obtain
!!  !! bufferSize2 before freeing buffer3; that step is missing here - confirm.
!!  nullify(buffer3)
!!  status=cusparseSpGEMM_estimateMemory(handle,&
!!                                       CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,&
!!                                       alpha,matA,matB,beta,matC,&
!!                                       CUDA_R_64F,CUSPARSE_SPGEMM_ALG3,&
!!                                       SpGEMMDesc,chunk_fraction,&
!!                                       bufferSize3,buffer3,&
!!                                       bufferSize2)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_estimateMemory error: ', status
!!
!!  istat=cudaDeviceSynchronize()
!!  print *, "bufferSize2=",bufferSize2
!!  print *, istat
!!
!!  if(bufferSize2 /= 0) allocate(buffer2(bufferSize2))
!!
!!  !! buffer3 can be safely freed to save more memory
!!  if(associated(buffer3)) deallocate(buffer3)
!!
!!  !!ALG3--------------------------------------
!!
!!
!!
!!
!!  !! compute the intermediate product of A * B
!!  status=cusparseSpGEMM_compute(handle,&
!!                               CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,&
!!                               alpha,matA,matB,beta,matC,&
!!                               CUDA_R_64F,CUSPARSE_SPGEMM_ALG3,&
!!                               SpGEMMDesc,bufferSize2,buffer2)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_compute error: ', status
!!
!!
!!  !! get matrix C non-zero entries C_nnz
!!  status=cusparseSpMatGetSize(matC,C_rows_dbl,C_cols_dbl,C_nnz_dbl)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpMatGetSize error: ', status
!!
!!  istat=cudaDeviceSynchronize()
!!  C_rows=C_rows_dbl
!!  C_cols=C_cols_dbl
!!  C_nnz=C_nnz_dbl
!!  istat=cudaDeviceSynchronize()
!!
!!  write(*,*) "A_rows",A_rows,"A_cols",A_cols,"A_nnz",A_nnz
!!  write(*,*) "B_rows",B_rows,"B_cols",B_cols,"B_nnz",B_nnz
!!  write(*,*) "C_rows",C_rows,"C_cols",C_cols,"C_nnz",C_nnz
!!
!!
!!  !! allocate matrix C
!!  if(allocated(Ccol_d)) deallocate(Ccol_d)
!!  if(allocated(Cval_d)) deallocate(Cval_d)
!!  allocate(Ccol_d(C_nnz))
!!  allocate(Cval_d(C_nnz))
!!  istat=cudaDeviceSynchronize()
!!
!!  !! update matC with the new pointers
!!  status=cusparseCsrSetPointers(matC,Crow_d,Ccol_d,Cval_d)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseCsrSetPointers error: ', status
!!
!!  !! copy the final products to the matrix C
!!  status=cusparseSpGEMM_copy(handle,&
!!                             CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,&
!!                             alpha,matA,matB,beta,matC,&
!!                             CUDA_R_64F,CUSPARSE_SPGEMM_ALG3,SpGEMMDesc)
!!  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_copy error: ', status

  !! Cleanup ---------------------------------------------------------------
  !! Release the work buffers and every cuSPARSE object created above. This
  !! is active code so that the routine does not leak them while the stages
  !! above are still disabled. The guards make it safe whether or not a
  !! buffer was actually allocated.
  if(associated(buffer1)) deallocate(buffer1)
  if(associated(buffer2)) deallocate(buffer2)

  status=cusparseSpGEMM_destroyDescr(SpGEMMDesc)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseSpGEMM_destroyDescr error: ', status
  status=cusparseDestroySpMat(matA)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseDestroySpMat error: ', status
  status=cusparseDestroySpMat(matB)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseDestroySpMat error: ', status
  status=cusparseDestroySpMat(matC)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseDestroySpMat error: ', status
  status=cusparseDestroy(handle)
  if(status/=CUSPARSE_STATUS_SUCCESS) print *, 'cusparseDestroy error: ', status

!!!======================================================
!!
!!  !! read the result back to the host and print it
!!  istat=cudaDeviceSynchronize()
!!  Crow=Crow_d
!!  Ccol=Ccol_d
!!  Cval=Cval_d
!!  istat=cudaDeviceSynchronize()
!!
!!  print *, Crow
!!  print *, " "
!!  print *, Ccol
!!  print *, " "
!!  print *, Cval

end subroutine SpGEMM_ALG3