Hey,
I try to solve a linear equation system coming from FEM algorithm with cuSparse. I use the example from the cuSparse documentation with LU decomposition (my matrix is non-symmetric) and solve the system with cusparseDcsrsm2_solve.
The problem is: I compare the solution from cuSpase with the solution calculated on CPU (Host), but the solution differs from the host solution (calculated with intel pardiso from the mkl lib).
Can anybody help me? Is there a bug in the code?
Here is my code:
cusparseStatus_t status;
cusparseHandle_t handle = 0;
cusparseMatDescr_t descr_M = 0;
cusparseMatDescr_t descr_L = 0;
cusparseMatDescr_t descr_U = 0;
csrilu02Info_t info_M = 0;
csrsv2Info_t info_L = 0;
csrsv2Info_t info_U = 0;
int pBufferSize_M;
int pBufferSize_L;
int pBufferSize_U;
int pBufferSize;
void *pBuffer = 0;
int structural_zero;
int numerical_zero;
const double alpha = 1.;
const cusparseSolvePolicy_t policy_M = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
const cusparseSolvePolicy_t policy_L = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
const cusparseSolvePolicy_t policy_U = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t trans_L = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseOperation_t trans_U = CUSPARSE_OPERATION_NON_TRANSPOSE;
// -- init cuSPARSE
cusparseCreate(&handle);
/******************************************************/
/* SETUP DESCRIPTOR FOR MATRIX A AND LU DECOMPOSITION */
/******************************************************/
// - matrix M is base-1
// - matrix L is base-1
// - matrix L is lower triangular
// - matrix L has unit diagonal
// - matrix U is base-1
// - matrix U is upper triangular
// - matrix U has non-unit diagonal
cusparseCreateMatDescr(&descr_M);
cusparseSetMatIndexBase(descr_M, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseCreateMatDescr(&descr_L);
cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
cusparseCreateMatDescr(&descr_U);
cusparseSetMatIndexBase(descr_U, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
/*****************************************/
/* STEP 2: create a empty info structure */
/*****************************************/
// we need one info for csrilu02 and two info's for csrsv2
cusparseCreateCsrilu02Info(&info_M);
cusparseCreateCsrsv2Info(&info_L);
cusparseCreateCsrsv2Info(&info_U);
/**************************************************************************************/
/* STEP 3: query how much memory used in csrilu02 and csrsv2, and allocate the buffer */
/**************************************************************************************/
cusparseDcsrilu02_bufferSize(handle, ldrsd, nnz,
descr_M, d_acsr, d_iacsr, d_jacsr, info_M, &pBufferSize_M);
cusparseDcsrsv2_bufferSize(handle, trans_L, ldrsd, nnz,
descr_L, d_acsr, d_iacsr, d_jacsr, info_L, &pBufferSize_L);
cusparseDcsrsv2_bufferSize(handle, trans_U, ldrsd, nnz,
descr_U, d_acsr, d_iacsr, d_jacsr, info_U, &pBufferSize_U);
pBufferSize = max(pBufferSize_M, max(pBufferSize_L, pBufferSize_U));
// pBuffer returned by cudaMalloc is automatically aligned to 128 bytes.
cudaMalloc((void**)&pBuffer, pBufferSize);
/*******************************************************/
// STEP 4: perform analysis of incomplete Cholesky on M
// perform analysis of triangular solve on L
// perform analysis of triangular solve on U
/*******************************************************/
// The lower(upper) triangular part of M has the same sparsity pattern as L(U),
// we can do analysis of csrilu0 and csrsv2 simultaneously.
cusparseDcsrilu02_analysis(handle, ldrsd, nnz, descr_M,
d_acsr, d_iacsr, d_jacsr, info_M,
policy_M, pBuffer);
status = cusparseXcsrilu02_zeroPivot(handle, info_M, &structural_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){
printf("A(%d,%d) is missing\n", structural_zero, structural_zero);
}
cusparseDcsrsv2_analysis(handle, trans_L, ldrsd, nnz, descr_L,
d_acsr, d_iacsr, d_jacsr,
info_L, policy_L, pBuffer);
cusparseDcsrsv2_analysis(handle, trans_U, ldrsd, nnz, descr_U,
d_acsr, d_iacsr, d_jacsr,
info_U, policy_U, pBuffer);
/********************/
// step 5: M = L * U /
/********************/
cusparseDcsrilu02(handle, ldrsd, nnz, descr_M,
d_acsr, d_iacsr, d_jacsr, info_M, policy_M, pBuffer);
status = cusparseXcsrilu02_zeroPivot(handle, info_M, &numerical_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){
printf("U(%d,%d) is zero\n", numerical_zero, numerical_zero);
}
/************************/
// step 6: solve L*z = x
/************************/
cusparseDcsrsv2_solve(handle, trans_L, ldrsd, nnz, &alpha, descr_L,
d_acsr, d_iacsr, d_jacsr, info_L,
d_rsd, d_X, policy_L, pBuffer);
/************************/
// step 7: solve U*y = z
/************************/
double *d_z; cudaMalloc(&d_z, ldrsd * sizeof(double));
cusparseDcsrsv2_solve(handle, trans_U, ldrsd, nnz, &alpha, descr_U,
d_acsr, d_iacsr, d_jacsr, info_U,
d_X, d_z, policy_U, pBuffer);
cudaFree(pBuffer);
cusparseDestroyMatDescr(descr_M);
cusparseDestroyMatDescr(descr_L);
cusparseDestroyMatDescr(descr_U);
cusparseDestroyCsrilu02Info(info_M);
cusparseDestroyCsrsv2Info(info_L);
cusparseDestroyCsrsv2Info(info_U);
cusparseDestroy(handle);