Preconditioning with cusparseSpSV CSR

Does the cusparseSpSV CSR have any built-in preconditioner? I am attempting to use cusparseSpSV CSR along with cusparseDcsrilu02, but my code results in NaN. If I do not use cusparseDcsrilu02, I get real values but my code takes much longer. I am trying to convert from using cusparseDcsrsv2_solve and other deprecated functions, but can not find a good guide on how to mesh the new function of cusparseSpSV with a preconditioner.

Can you please report the main API calls that you are using?
Also, cusparseSpSV and cusparseDcsrsv2_solve compute the same thing. It could be that one parameter of cusparseSpSV is not correct or the matrix attributes (cusparseSpMatSetAttribute) are not set in the right way.

This is the part of my code utilizing the calls. I had a working version in 21.3 with cusparseDcsrsv2_solve, but not sure where I am going wrong in this version.

void load_lusol_cusparse2(int* sparse_offsets, int* sparse_col,
  double* sparse_val, int nc, int nzero,
  double* dense_Y)
{

// Setup cusparse
    cusparseCreate(&cusparseHandle);

// Set up M
    cusparseCreateMatDescr(&M_described);
    cusparseSetMatIndexBase(M_described, CUSPARSE_INDEX_BASE_ONE);
    cusparseSetMatType(M_described, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseCreateCsrilu02Info(&M_analyzed);


// Get algorithm buffer sizes
    cusparseDcsrilu02_bufferSize(cusparseHandle, nc, nzero,
        M_described, sparse_val, sparse_offsets, sparse_col,
        M_analyzed, &Mbuf_size);

// Allocate buffers
    cudaMalloc((void**)&Mbuffer, Mbuf_size);

//Analyze M
    cusparseDcsrilu02_analysis(cusparseHandle, nc, nzero,
        M_described, sparse_val, sparse_offsets, sparse_col,
        M_analyzed, M_policy, Mbuffer);

    cusparseXcsrilu02_zeroPivot(cusparseHandle,M_analyzed, &struct_zero);
    
// Create the sparse matrixes
    cusparseCreateCsr(&L_SparseA, nc, nc, nzero,
      (void*)sparse_offsets, (void*)sparse_col, (void*)sparse_val,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
    
    cusparseCreateCsr(&U_SparseA, nc, nc, nzero,
        (void*)sparse_offsets, (void*)sparse_col, (void*)sparse_val,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
    
// Create the dense vectors
    cusparseCreateDnVec(&Dense_Y, nc,(void*)dense_Y, CUDA_R_64F);
    
// Initialize the data structures
    cusparseSpSV_createDescr(&L_spsv_described);
    cusparseSpSV_createDescr(&U_spsv_described);
    
// Set filling mode types
    cusparseFillMode_t L_fill_mode = CUSPARSE_FILL_MODE_LOWER;
    cusparseSpMatSetAttribute(L_SparseA, CUSPARSE_SPMAT_FILL_MODE,
      &L_fill_mode, sizeof(L_fill_mode));
    
    cusparseFillMode_t U_fill_mode = CUSPARSE_FILL_MODE_UPPER;
    cusparseSpMatSetAttribute(U_SparseA, CUSPARSE_SPMAT_FILL_MODE,
      &U_fill_mode, sizeof(U_fill_mode));
    
// Set diagonal unit types
    cusparseDiagType_t L_type_diag = CUSPARSE_DIAG_TYPE_UNIT;
    cusparseSpMatSetAttribute(L_SparseA, CUSPARSE_SPMAT_DIAG_TYPE,
        &L_type_diag, sizeof(L_type_diag));
    cusparseDiagType_t U_type_diag = CUSPARSE_DIAG_TYPE_NON_UNIT;
    cusparseSpMatSetAttribute(U_SparseA, CUSPARSE_SPMAT_DIAG_TYPE,
        &U_type_diag, sizeof(U_type_diag));

// allocate buffers
    cusparseSpSV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        &alpha, L_SparseA, DenseY, DenseY, CUDA_R_64F,
        CUSPARSE_SPSV_ALG_DEFAULT, L_spsv_described,&L_SpSv_bufferSize);
    
    cusparseSpSV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        &alpha, U_SparseA, DenseY, DenseY, CUDA_R_64F,
        CUSPARSE_SPSV_ALG_DEFAULT, U_spsv_described,&U_SpSv_bufferSize);
    
    cudaMalloc(&L_SpSv_Buffer, L_SpSv_bufferSize);
    cudaMalloc(&U_SpSv_Buffer, U_SpSv_bufferSize);
    
// Analyze the Sparse Matrixes
    //Analyze L
    cusparseSpSV_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
      &alpha, L_SparseA, DenseY, DenseY, CUDA_R_64F,
      CUSPARSE_SPSV_ALG_DEFAULT, L_spsv_described, L_SpSv_Buffer);
    //Analyze U
    cusparseSpSV_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        &alpha, U_SparseA, DenseY, DenseY, CUDA_R_64F,
        CUSPARSE_SPSV_ALG_DEFAULT, U_spsv_described, U_SpSv_Buffer);

 // Set preconditioner (M=LU)
    cusparseDcsrilu02(cusparseHandle, nc, nzero,
        M_described, sparse_val, sparse_offsets,
        sparse_col, M_analyzed, M_policy, Mbuffer);

    cusparseDestroySpMat(L_SparseA);
    cusparseDestroySpMat(U_SparseA);

    // Create the sparse matrixes
    cusparseCreateCsr(&L_SparseA, nr, nc, nzero,
      (void*)sparse_offsets, (void*)sparse_col, (void*)sparse_val,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
    
    cusparseCreateCsr(&U_SparseA, nr, nc, nzero,
        (void*)sparse_offsets, (void*)sparse_col, (void*)sparse_val,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);

    // Set filling mode types
    cusparseSpMatSetAttribute(L_SparseA, CUSPARSE_SPMAT_FILL_MODE,
      &L_fill_mode, sizeof(L_fill_mode));
    
    cusparseSpMatSetAttribute(U_SparseA, CUSPARSE_SPMAT_FILL_MODE,
      &U_fill_mode, sizeof(U_fill_mode));
    
    // Set diagonal unit types
    cusparseSpMatSetAttribute(L_SparseA, CUSPARSE_SPMAT_DIAG_TYPE,
        &L_type_diag, sizeof(L_type_diag));
    cusparseSpMatSetAttribute(U_SparseA, CUSPARSE_SPMAT_DIAG_TYPE,
        &U_type_diag, sizeof(U_type_diag));
    
    cudaDeviceSynchronize();
}

void lusol_cusparse2(double* dense_Y)
{
  // dense_Y = values for the dense vector Y and X

  // Set values of the dense vector Y
  void* temp=(void*)dense_Y;
  cusparseDnVecSetValues(DenseY,(void*)dense_Y);

  // Forward solve (Lx=y)
  cusparseSpSV_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
   &alpha, L_SparseA, DenseY, DenseY, CUDA_R_64F,
   CUSPARSE_SPSV_ALG_DEFAULT, L_spsv_described);


  // Backward solve (Uy=x)
  cusparseSpSV_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
      &alpha, U_SparseA, DenseY, DenseY, CUDA_R_64F,
      CUSPARSE_SPSV_ALG_DEFAULT, U_spsv_described);

  // Get the values of the dense vector into dense_Y
  cusparseDnVecGetValues(DenseY,&temp);

  dense_Y = (double*)temp;
  cudaDeviceSynchronize();
}
  • Miko

The workflow seems correct. Can you please provide a minimal working code and the input matrix? in this way, we can better identify the problem

Thank you for the help!

I was most concerned if my workflow was correct. Since it appears to be, I now have a better idea of where to find my potential error. Between my two versions, I had to make a small change to my main code, which in all likelihood is the error. I had to pass in variables in a slightly different way, and this may be the culprit. I will post back if it is not the case.

  • Miko

We did a further investigation. cusparseSpSV does not support in-place computation d_X == d_Y (DenseYin your case).

cusparseSpSV_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
   &alpha, L_SparseA, DenseY, DenseY, CUDA_R_64F,
   CUSPARSE_SPSV_ALG_DEFAULT, L_spsv_described);

I have updated my code as follows, but I think I narrowed my problem down to not copying or updating dense_Y correctly between my C and Fortran code. I have included my Fortran call code.

C:

void lusol_cusparse2(double* dense_Y)
{
  // dense_Y = values for the dense vector Y and X
    
  // Set values of the dense vector Y
  
  cusparseDnVecSetValues(DenseY,(void*)dense_Y);
  cusparseDnVecSetValues(DenseX,(void*)dense_Y);

  // Forward solve (Lx=y)
  cusparseSpSV_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
   &alpha, L_SparseA, DenseX, DenseY, CUDA_R_64F,
   CUSPARSE_SPSV_ALG_DEFAULT, L_spsv_described);
    
  void* temp=(void*)dense_Y;
  cusparseDnVecGetValues(DenseY,&temp);
  cusparseDnVecSetValues(DenseX,temp);
    
  // Backward solve (Uy=x)
  cusparseSpSV_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
      &alpha, U_SparseA, DenseX, DenseY, CUDA_R_64F,
      CUSPARSE_SPSV_ALG_DEFAULT, U_spsv_described);

  // Get the values of the dense vector into dense_Y
  cusparseDnVecGetValues(DenseY,&temp);
    
  dense_Y = (double*)temp;
  cudaDeviceSynchronize();
}

Fortran:

c#######################################################################
      subroutine prec_inv (dense_Y)
c
c-----------------------------------------------------------------------
c
      use ....
      implicit none
      real(r_typ), dimension(N) :: dense_Y
c
c-----------------------------------------------------------------------
c
!$acc host_data use_device(dense_Y)
c
      call lusol_cusparse2(C_LOC(dense_Y(1)))
!$acc end host_data

c
      return
      end

Actually that is not problem. Sorry

  • Miko

Hello. I finally found my bug. It was caused by the order of my functions. I had to set the preconditioner (M=LU) before doing any analysis on L and U.My code works fine with d_X==d_Y. If anyone has an issue similar to mine, here is my pseudocode:

void load_lusol_cusparse(int* sparse_offsets, int* sparse_col,
  double* sparse_val, int nc, int nzero, double* dense_Y)
{
  // Setup cusparse.
  // Set up M
  // Get algorithm buffer size for M
  // Allocate buffer for M
  // Analyze M
  // Set preconditioner (M=LU)

  ////////////////////////////////////////////////////////////////////////////////////
  //
  // Set up L
  //
  ////////////////////////////////////////////////////////////////////////////////////
  // Creat the sparse matrix
  // Initialize the data structure
  // Set filling mode types
  // Set diagonal unit types

  ////////////////////////////////////////////////////////////////////////////////////
  //
  // Set up U
  //
  ////////////////////////////////////////////////////////////////////////////////////
  // Creat the sparse matrix
  // Initialize the data structure
  // Set filling mode types
  // Set diagonal unit types

  ////////////////////////////////////////////////////////////////////////////////////

  // Set up Dense Vector  
  // Get algorithm buffer sizes for L and U
  // Allocate buffers for L and U
  // Analyze L
  // Analyze U

  cudaDeviceSynchronize();

}


void lusol_cusparse(double* dense_Y)
{
  cusparseDnVecSetValues(DenseVec,(void*)dense_Y);
  // Forward solve (Lx=y)
  // Backward solve (Uy=x)

  cudaDeviceSynchronize();
}

perfect. Happy that the problem has been fixed