Sparse LU decomposition got wrong results for large matrices

I use scipy.sparse to feed csr sparse matrices to CUDA C. Here’s an example:

A = [[ 10123., 253453., 0.]
[ 0., 588678., 38676.]
[ 0., 1867535., 13834.]],
b = [153.0, 2383.0, 1430.0].

scipy.sparse.linalg.spsolve got result of x = [0.00638599, 0.0003486, 0.05630843].
cuda lu solver got result of x = [0.00638599, 0.0003486, 0.05630843].

It works fine for small matrices, but when I pass some big matrices to it, I just got wrong result. I’ve been trying to solve this for the whole day, but I just can’t figure out what went wrong.

Below is the code I used.

#include <iostream>
#include <vector>
#include <algorithm>

#include <cuda_runtime.h>
#include <cusparse_v2.h>

cusparseHandle_t cusparse_handle;

cusparseMatDescr_t  descrA = 0;
cusparseMatDescr_t  descr_L = 0;
cusparseMatDescr_t  descr_U = 0;

csrilu02Info_t info_A = 0;
csrsv2Info_t info_L = 0;
csrsv2Info_t info_U = 0;

void *pBuffer = 0;

//------------------------------
// SETUP DESCRIPTOR FUNCTION
//------------------------------
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase)
{
	cusparseCreateMatDescr(&descrA);
	cusparseSetMatType(descrA, matrixType);
	cusparseSetMatIndexBase(descrA, indexBase);
}

//-------------------------------------------------
// SETUP DESCRIPTOR FUNCTION FOR LU DECOMPOSITION
//-------------------------------------------------
void setUpDescriptorLU(cusparseMatDescr_t &descrLU, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase, cusparseFillMode_t fillMode, cusparseDiagType_t diagType)
{
	cusparseCreateMatDescr(&descrLU);
	cusparseSetMatType(descrLU, matrixType);
	cusparseSetMatIndexBase(descrLU, indexBase);
	cusparseSetMatFillMode(descrLU, fillMode);
	cusparseSetMatDiagType(descrLU, diagType);
}

//-------------------------------------------------
// MEMORY QUERY FUNCTION FOR LU DECOMPOSITION
//-------------------------------------------------
void memoryQueryLU(csrilu02Info_t &info_A, csrsv2Info_t &info_L, csrsv2Info_t &info_U, cusparseHandle_t cusparse_handle, const int N, const int nnz, cusparseMatDescr_t descrA, cusparseMatDescr_t descr_L,
	cusparseMatDescr_t descr_U, float *d_A, int *d_A_RowPtr, int *d_A_ColInd, cusparseOperation_t matrixOperation, void **pBuffer) {

	cusparseCreateCsrilu02Info(&info_A);
	cusparseCreateCsrsv2Info(&info_L);
	cusparseCreateCsrsv2Info(&info_U);

	int pBufferSize_M, pBufferSize_L, pBufferSize_U;
	cusparseScsrilu02_bufferSize(cusparse_handle, N, nnz, descrA, d_A, d_A_RowPtr, d_A_ColInd, info_A, &pBufferSize_M);
	cusparseScsrsv2_bufferSize(cusparse_handle, matrixOperation, N, nnz, descr_L, d_A, d_A_RowPtr, d_A_ColInd, info_L, &pBufferSize_L);
	cusparseScsrsv2_bufferSize(cusparse_handle, matrixOperation, N, nnz, descr_U, d_A, d_A_RowPtr, d_A_ColInd, info_U, &pBufferSize_U);

	int pBufferSize = std::max(pBufferSize_M, std::max(pBufferSize_L, pBufferSize_U));
	cudaMalloc((void**)pBuffer, pBufferSize);
}

//-------------------------------------------------
// ANALYSIS FUNCTION FOR LU DECOMPOSITION
//-------------------------------------------------
void analysisLUDecomposition(csrilu02Info_t &info_A, csrsv2Info_t &info_L, csrsv2Info_t &info_U, cusparseHandle_t cusparse_handle, const int N, const int nnz, cusparseMatDescr_t descrA, cusparseMatDescr_t descr_L,
	cusparseMatDescr_t descr_U, float *d_A, int *d_A_RowPtr, int *d_A_ColInd, cusparseOperation_t matrixOperation, cusparseSolvePolicy_t solvePolicy1, cusparseSolvePolicy_t solvePolicy2, void *pBuffer)
{
	int structural_zero;

	cusparseScsrilu02_analysis(cusparse_handle, N, nnz, descrA, d_A, d_A_RowPtr, d_A_ColInd, info_A, solvePolicy1, pBuffer);

	cusparseStatus_t status = cusparseXcsrilu02_zeroPivot(cusparse_handle, info_A, &structural_zero);

	if (CUSPARSE_STATUS_ZERO_PIVOT == status) { printf("A(%d,%d) is missing\n", structural_zero, structural_zero); }
	
	cusparseScsrsv2_analysis(cusparse_handle, matrixOperation, N, nnz, descr_L, d_A, d_A_RowPtr, d_A_ColInd, info_L, solvePolicy1, pBuffer);

	cusparseScsrsv2_analysis(cusparse_handle, matrixOperation, N, nnz, descr_U, d_A, d_A_RowPtr, d_A_ColInd, info_U, solvePolicy2, pBuffer);
}

//-------------------------------------------------
// COMPUTE LU DECOMPOSITION FOR SPARSE MATRICES
//-------------------------------------------------
void computeSparseLU(csrilu02Info_t &info_A, cusparseHandle_t cusparse_handle, const int N, const int nnz, cusparseMatDescr_t descrA,
	float *d_A, int *d_A_RowPtr, int *d_A_ColInd, cusparseSolvePolicy_t solutionPolicy, void *pBuffer)
{

	int numerical_zero;
	
	cusparseScsrilu02(cusparse_handle, N, nnz, descrA, d_A, d_A_RowPtr, d_A_ColInd, info_A, solutionPolicy, pBuffer);
	
	cusparseStatus_t status = cusparseXcsrilu02_zeroPivot(cusparse_handle, info_A, &numerical_zero);

	if (CUSPARSE_STATUS_ZERO_PIVOT == status) { printf("U(%d,%d) is zero\n", numerical_zero, numerical_zero); }
}

//------------------------------------------------------------------------------------------
// Solve Ax=b using LU decomposition (only works for small sparse matrices)
//------------------------------------------------------------------------------------------
void spsolve(int ndofs, int nnz, float *d_A_Data, int *d_A_RowPtr, int *d_A_ColInd, float *d_b, float *d_x)
{
	//---------------------------------------------
	// STEP 1: CREATE DESCRIPTORS FOR A, L AND U
	//---------------------------------------------
	setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO);
	setUpDescriptorLU(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_UNIT);
	setUpDescriptorLU(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_FILL_MODE_UPPER, CUSPARSE_DIAG_TYPE_NON_UNIT);

	//-------------------------------------------------------------------------------------------------------
	// STEP 2: QUERY HOW MUCH MEMORY USED IN LU FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS
	//-------------------------------------------------------------------------------------------------------
	memoryQueryLU(info_A, info_L, info_U, cusparse_handle, ndofs, nnz, descrA, descr_L, descr_U, d_A_Data, d_A_RowPtr, d_A_ColInd, CUSPARSE_OPERATION_NON_TRANSPOSE, &pBuffer);

	//-------------------------------------------------------------------------------------------------------
	// STEP 3: ANALYZE THE THREE PROBLEMS: LU FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS
	//-------------------------------------------------------------------------------------------------------
	analysisLUDecomposition(info_A, info_L, info_U, cusparse_handle, ndofs, nnz, descrA, descr_L, descr_U, d_A_Data, d_A_RowPtr, d_A_ColInd,
		CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_SOLVE_POLICY_NO_LEVEL, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);

	//---------------------------------------------
	// STEP 4: FACTORIZATION A = L * U
	//---------------------------------------------
	// A_Data will be overwritten
	computeSparseLU(info_A, cusparse_handle, ndofs, nnz, descrA, d_A_Data, d_A_RowPtr, d_A_ColInd, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);

	//---------------------------------------------
	// STEP 5: L * y = b
	//---------------------------------------------
	// Allocating the intermediate result vector
	float *d_y;
	cudaMalloc(&d_y, ndofs * sizeof(float));

	const float alpha = 1.;

	// It will not modify d_b
	cusparseScsrsv2_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, ndofs, nnz, &alpha, descr_L, d_A_Data, d_A_RowPtr, d_A_ColInd, info_L, d_b, d_y, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);

	//---------------------------------------------
	// STEP 5: U * x = y
	//---------------------------------------------
	// It will not modify d_y
	cusparseScsrsv2_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, ndofs, nnz, &alpha, descr_U, d_A_Data, d_A_RowPtr, d_A_ColInd, info_U, d_y, d_x, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
}

extern "C"
{
	__declspec(dllexport) float *my_func(int ndofs, int nnz, int *Cindptr, int *Cindices, float *Csrv, float *Ccsr_data)
	{
		float *Gsrv;
		cudaMalloc((void**)&Gsrv, ndofs * sizeof(float));
		cudaMemcpy(Gsrv, Csrv, ndofs * sizeof(float), cudaMemcpyHostToDevice);

		float *Gcsr_data;
		cudaMalloc((void**)&Gcsr_data, nnz * sizeof(float));
		cudaMemcpy(Gcsr_data, Ccsr_data, nnz * sizeof(float), cudaMemcpyHostToDevice);

		int *Gindices;
		cudaMalloc((void**)&Gindices, nnz * sizeof(int));
		cudaMemcpy(Gindices, Cindices, nnz * sizeof(int), cudaMemcpyHostToDevice);

		int *Gindptr;
		cudaMalloc((void**)&Gindptr, (ndofs + 1) * sizeof(int));
		cudaMemcpy(Gindptr, Cindptr, (ndofs + 1) * sizeof(int), cudaMemcpyHostToDevice);

		float *Gx;
		cudaMalloc((void**)&Gx, ndofs * sizeof(float));

		// Create library handles:
		cusparseCreate(&cusparse_handle);

		// Solve
		spsolve(ndofs, nnz, Gcsr_data, Gindptr, Gindices, Gsrv, Gx);

		// Copy x back to CPU
		float *Cx = new float[ndofs];
		cudaMemcpy(Cx, Gx, ndofs * sizeof(float), cudaMemcpyDeviceToHost);

		// Clean up
		cudaFree(Gx);
		cudaFree(Gsrv);
		cudaFree(Gcsr_data);
		cudaFree(Gindices);
		cudaFree(Gindptr);

		return Cx;
	}
}

Even I changed from float to double, it still can’t get right result for large matrices. So I guess the precision is not the reason.

Sorry, I made a mistake. This is just incomplete LU decomposition instead of LU decomposition.