Sparse triangular matrix solvers in MKL and cuSparse produce different results?

MoeSparse · July 23, 2023, 3:17pm

In my research, I need to use SSOR as the preconditioner for the PCG method. I created a CSR matrix whose L and U parts contain the SSOR information. I actually use two SpSV (sparse triangular solve) routines for solving the preconditioned system.

I created the L mat in the following manner:

// spMat is a struct encapsulating the information for a CSR matrix; csrMat is the original linear operator.

  spMat<T> L{nullptr, csrMat.rowOffsetsPtr, csrMat.colIndPtr, nullptr};

// Copy the values of SSOR matrix from the host. 
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&L.valuesPtr), nnz * sizeof(T)));
  CHECK_CUDA_ERROR(cudaMemcpy(reinterpret_cast<void *>(L.valuesPtr), reinterpret_cast<void *>(ssorValues), nnz * sizeof(T), cudaMemcpyHostToDevice));
  
// Create the mat.
  CHECK_CUDA_ERROR(cusparseCreateCsr(&L.descr, static_cast<int64_t>(size), static_cast<int64_t>(size), static_cast<int64_t>(nnz), reinterpret_cast<void *>(L.rowOffsetsPtr), reinterpret_cast<void *>(L.colIndPtr), reinterpret_cast<void *>(L.valuesPtr), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cuTraits<T>::valueType));
  
// Set attributes, note for L mat, the diagonal entries are treated as one.
  cusparseFillMode_t fill_mode{CUSPARSE_FILL_MODE_LOWER};
  cusparseDiagType_t diag_type{CUSPARSE_DIAG_TYPE_UNIT};
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(L.descr, CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(L.descr, CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));

Then, I create U mat in the following manner:

// The U mat shares the same sparsity pattern as the L mat and the original mat.
  spMat<T> U{nullptr, csrMat.rowOffsetsPtr, csrMat.colIndPtr, L.valuesPtr};

// Create the U mat.
  CHECK_CUDA_ERROR(cusparseCreateCsr(&U.descr, static_cast<int64_t>(size), static_cast<int64_t>(size), static_cast<int64_t>(nnz), reinterpret_cast<void *>(U.rowOffsetsPtr), reinterpret_cast<void *>(U.colIndPtr), reinterpret_cast<void *>(U.valuesPtr), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cuTraits<T>::valueType));

// Set attributes.
  fill_mode = CUSPARSE_FILL_MODE_UPPER;
  diag_type = CUSPARSE_DIAG_TYPE_NON_UNIT;
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(U.descr, CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(U.descr, CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));

Prepare the triangular matrix solvers for the L and U mat:

// Create the description.
   cusparseSpSVDescr_t spsvDescrL{nullptr}, spsvDescrU{nullptr};
  CHECK_CUDA_ERROR(cusparseSpSV_createDescr(&spsvDescrL));
  CHECK_CUDA_ERROR(cusparseSpSV_createDescr(&spsvDescrU));

// Malloc the buffer.
  void  *bufferSV{nullptr};
  size_t bufferSizeL{0}, bufferSizeU{0};
  alpha = 1;
  CHECK_CUDA_ERROR(cusparseSpSV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, &bufferSizeL));
  CHECK_CUDA_ERROR(cusparseSpSV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, &bufferSizeU));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSV), std::max(bufferSizeL, bufferSizeU)));

// Do the analysis routine.
  CHECK_CUDA_ERROR(cusparseSpSV_analysis(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, bufferSV));
  CHECK_CUDA_ERROR(cusparseSpSV_analysis(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, bufferSV));

The following codes show the solving process:

// Reset the memory for the solution vector.
   CHECK_CUDA_ERROR(cudaMemset(reinterpret_cast<void *>(aux.ptr), 0, size * sizeof(T)));

// Solve.
  CHECK_CUDA_ERROR(cusparseSpSV_solve(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL));

// Reset.
  CHECK_CUDA_ERROR(cudaMemset(reinterpret_cast<void *>(z.ptr), 0, size * sizeof(T)));

// Solve.
  CHECK_CUDA_ERROR(cusparseSpSV_solve(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU));

MoeSparse · July 23, 2023, 3:21pm

However, cuSPARSE seems to produce different results from the almost identical routine that I used in MKL.

Are there any behavioral differences between cuSPARSE and MKL regarding the sparse triangular solvers?

MoeSparse · July 24, 2023, 3:32am

Moreover, it is weird that if I use SpMV to check the solution of (L aux = r):

// Malloc the MV buffer.
  void    *bufferMVtemp{nullptr};
  size_t   buffterMVtempSizeL{0};
  T        minusOne = -1, one = 1;
  dnVec<T> rhs{nullptr, reinterpret_cast<T *>(compBuffer)};
  CHECK_CUDA_ERROR(cusparseCreateDnVec(&rhs.descr, static_cast<int64_t>(size), reinterpret_cast<void *>(rhs.ptr), cuTraits<T>::valueType));
  CHECK_CUDA_ERROR(cusparseSpMV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minusOne, L.descr, aux.descr, &one, rhs.descr, cuTraits<T>::valueType, CUSPARSE_SPMV_ALG_DEFAULT, &buffterMVtempSizeL));
  CHECK_CUDA_ERROR(cudaMalloc((&bufferMVtemp), static_cast<size_t>(buffterMVtempSizeL)));

// Copy the right-hand side r into a vector named rhs, so that the SpMV below computes rhs = r - L * aux.
  CHECK_CUDA_ERROR(cudaMemcpy(rhs.ptr, r.ptr, size * sizeof(T), cudaMemcpyDeviceToDevice));

// Perform the SpMV.
  CHECK_CUDA_ERROR(cusparseSpMV(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minusOne, L.descr, aux.descr, &one, rhs.descr, cuTraits<T>::valueType, CUSPARSE_SPMV_ALG_DEFAULT, bufferMVtemp));
  T rhsNorm = 0;

// Check the residual again.
  cublasNorm(blasHandle, size, &rhs.ptr[0], &rhsNorm);
  std::printf("rhs error=%.6e\n", rhsNorm);
  cuFreeMod(bufferMVtemp);

The residual is far from zero.

malmasri · July 24, 2023, 7:11am

Hi @MoeSparse
You need to create a buffer for L and a different buffer for U. The external buffer cannot be shared between the L and U parts.

 void  *bufferSVL{nullptr}, *bufferSVU{nullptr};
 CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSVL), bufferSizeL));
 CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSVU), bufferSizeU));

Please have a look at the usage of SpSV in CUDA samples: CUDA Samples

Thanks

MoeSparse · July 24, 2023, 1:58pm

Thanks, it works.

system · August 7, 2023, 1:59pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
cuSparse incomplete LU decomposition as preconditioner GPU-Accelerated Libraries	9	2901	September 9, 2016
cusparseSpSV_solve function extremely slow GPU-Accelerated Libraries	4	44	November 19, 2024
cusparseScsrilu02 breaks with large matrices GPU-Accelerated Libraries cublas , cusolver , cusparse	10	1155	June 1, 2022
Preconditioning with cusparseSpSV CSR GPU-Accelerated Libraries cusparse	10	1529	October 12, 2021
cuSPARSE BSR Matrix Solver GPU-Accelerated Libraries cuda , cusolver , cusparse	2	60	October 23, 2024
Some questions for the function "cusparseDcsrsv2_solve" nvc, nvc++ and nvfortran cuda	12	57	August 9, 2024
cusparseDcsrsv_solve does not even write in all result vector positions GPU-Accelerated Libraries	1	1006	June 13, 2014
Problem with "cusparseDcsrsv" GPU-Accelerated Libraries	5	2104	May 23, 2013
Sparse LU decomposition got wrong results for large matrices CUDA Programming and Performance	2	633	July 20, 2018
cusparseScsrmv transpose mode is not working CUDA Programming and Performance	17	1504	July 9, 2018

Sparse triangular matrix solvers in MKL and cuSparse produce different results?

Related topics