Sparse triangular matrix solvers in MKL and cuSparse produce different results?

In my research, I need to use SSOR as the preconditioner for the PCG method. I created a CSR matrix, in which the L and the U parts contain the SSOR information. I actually use two SpMV routines for solving the preconditioned system.

I created the L mat in the following manner:

// A struct encapsulated the information for a CSR matrix, and csrMat is the original linear operator. 

  spMat<T> L{nullptr, csrMat.rowOffsetsPtr, csrMat.colIndPtr, nullptr};

// Copy the values of SSOR matrix from the host. 
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&L.valuesPtr), nnz * sizeof(T)));
  CHECK_CUDA_ERROR(cudaMemcpy(reinterpret_cast<void *>(L.valuesPtr), reinterpret_cast<void *>(ssorValues), nnz * sizeof(T), cudaMemcpyHostToDevice));
  
// Create the mat.
  CHECK_CUDA_ERROR(cusparseCreateCsr(&L.descr, static_cast<int64_t>(size), static_cast<int64_t>(size), static_cast<int64_t>(nnz), reinterpret_cast<void *>(L.rowOffsetsPtr), reinterpret_cast<void *>(L.colIndPtr), reinterpret_cast<void *>(L.valuesPtr), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cuTraits<T>::valueType));
  
// Set attributes, note for L mat, the diagonal entries are treated as one.
  cusparseFillMode_t fill_mode{CUSPARSE_FILL_MODE_LOWER};
  cusparseDiagType_t diag_type{CUSPARSE_DIAG_TYPE_UNIT};
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(L.descr, CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(L.descr, CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));

Then, I create U mat in the following manner:

// U mat shared the same sparsity pattern with the L and the original mat.
  spMat<T> U{nullptr, csrMat.rowOffsetsPtr, csrMat.colIndPtr, L.valuesPtr};

// Create the U mat.
  CHECK_CUDA_ERROR(cusparseCreateCsr(&U.descr, static_cast<int64_t>(size), static_cast<int64_t>(size), static_cast<int64_t>(nnz), reinterpret_cast<void *>(U.rowOffsetsPtr), reinterpret_cast<void *>(U.colIndPtr), reinterpret_cast<void *>(U.valuesPtr), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cuTraits<T>::valueType));

// Set attributes.
  fill_mode = CUSPARSE_FILL_MODE_UPPER;
  diag_type = CUSPARSE_DIAG_TYPE_NON_UNIT;
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(U.descr, CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  CHECK_CUDA_ERROR(cusparseSpMatSetAttribute(U.descr, CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));

Prepare the triangular matrix solvers for the L and U mat:

// Create the description.
   cusparseSpSVDescr_t spsvDescrL{nullptr}, spsvDescrU{nullptr};
  CHECK_CUDA_ERROR(cusparseSpSV_createDescr(&spsvDescrL));
  CHECK_CUDA_ERROR(cusparseSpSV_createDescr(&spsvDescrU));

// Malloc the buffer.
  void  *bufferSV{nullptr};
  size_t bufferSizeL{0}, bufferSizeU{0};
  alpha = 1;
  CHECK_CUDA_ERROR(cusparseSpSV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, &bufferSizeL));
  CHECK_CUDA_ERROR(cusparseSpSV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, &bufferSizeU));
  CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSV), std::max(bufferSizeL, bufferSizeU)));

// Do the analysis routine.
  CHECK_CUDA_ERROR(cusparseSpSV_analysis(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL, bufferSV));
  CHECK_CUDA_ERROR(cusparseSpSV_analysis(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU, bufferSV));

The following codes show the solving process:

// Reset the memory for the solution vector.
   CHECK_CUDA_ERROR(cudaMemset(reinterpret_cast<void *>(aux.ptr), 0, size * sizeof(T)));

// Solve.
  CHECK_CUDA_ERROR(cusparseSpSV_solve(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, L.descr, r.descr, aux.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrL));

// Reset.
  CHECK_CUDA_ERROR(cudaMemset(reinterpret_cast<void *>(z.ptr), 0, size * sizeof(T)));

// Solve.
  CHECK_CUDA_ERROR(cusparseSpSV_solve(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, U.descr, aux.descr, z.descr, cuTraits<T>::valueType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescrU));

However, it seems cuSPARSE produced different results with the almost same routine that I used in mkl.

Are there any different behaviors between cuSPARSE and mkl regarding the sparse triangular solvers?

Moreover, it is wired that if I use SpMV to check the solution of (L aux = r):

// Malloc the MV buffer.
  void    *bufferMVtemp{nullptr};
  size_t   buffterMVtempSizeL{0};
  T        minusOne = -1, one = 1;
  dnVec<T> rhs{nullptr, reinterpret_cast<T *>(compBuffer)};
  CHECK_CUDA_ERROR(cusparseCreateDnVec(&rhs.descr, static_cast<int64_t>(size), reinterpret_cast<void *>(rhs.ptr), cuTraits<T>::valueType));
  CHECK_CUDA_ERROR(cusparseSpMV_bufferSize(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minusOne, L.descr, aux.descr, &one, rhs.descr, cuTraits<T>::valueType, CUSPARSE_SPMV_ALG_DEFAULT, &buffterMVtempSizeL));
  CHECK_CUDA_ERROR(cudaMalloc((&bufferMVtemp), static_cast<size_t>(buffterMVtempSizeL)));

// Copy the solution into a vector named rhs.
  CHECK_CUDA_ERROR(cudaMemcpy(rhs.ptr, r.ptr, size * sizeof(T), cudaMemcpyDeviceToDevice));

// Perform the SpMV.
  CHECK_CUDA_ERROR(cusparseSpMV(sprHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minusOne, L.descr, aux.descr, &one, rhs.descr, cuTraits<T>::valueType, CUSPARSE_SPMV_ALG_DEFAULT, bufferMVtemp));
  T rhsNorm = 0;

// Check the residual again.
  cublasNorm(blasHandle, size, &rhs.ptr[0], &rhsNorm);
  std::printf("rhs error=%.6e\n", rhsNorm);
  cuFreeMod(bufferMVtemp);

The residual is far from zero.

Hi @MoeSparse
You need to create a buffer for L and a different buffer for U. The external buffer cannot be shared between the L and U parts.

 void  *bufferSVL{nullptr}, *bufferSVU{nullptr};
 CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSVL), bufferSizeL));
 CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(&bufferSVU), bufferSizeU));

Please have a look at the usage of SpSV in CUDA samples: CUDA Samples

Thanks

1 Like

Thanks, It works.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.