cuSPARSE LU decomposition error

I’m trying to use the cusparseDbsrilu02() function to create an ILU0 preconditioner, based on the blocked csr format. cusparseXbsrilu02_zeroPivot() reported an error, the default message in the examples is ‘block U(0,0) is not invertible’. When done manually, the 3x3 block is actually invertible. It is also in invertible by numpy (Python). Am I using the library wrong? Is the actual error about something else? When making all diagonal values nonzero, it is successfull.

The cpp file:
https://gist.github.com/Tongdongq/386e3b93c099a46879100da9706f6109
I compile with ‘nvcc tmp.cpp -I/usr/local/cuda-9.2/samples/common/inc -lcusparse’ and run with ‘./a.out’.

Python test:

import numpy
m = numpy.matrix([[0, 0, 1],[2, 3, 0],[4, 5, 6]])
numpy.linalg.det(m)
-2.0
m.I
matrix([[-9. , -2.5, 1.5],
[ 6. , 2. , -1. ],
[ 1. , 0. , 0. ]])
b = m.I
m*b
matrix([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])

#include <cuda_runtime.h>
#include <algorithm> // for max()

#include "cublas_v2.h"
#include "cusparse_v2.h"

#include "helper_cuda.h" // for checkCudaErrors(), getLastCudaError(), findCudaDevice()

const int blockDim = 3;
const cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t trans  = CUSPARSE_OPERATION_NON_TRANSPOSE;

int main(){

	int NNZ = 9;
	int DIM = 3;
	int base = 0;
	int mb;
	int matrixN = DIM;
	void* pBuffer;
	cusparseHandle_t cusparseHandle;
	cusparseMatDescr_t descra, descr_M, descr_L, descr_U;
	bsrilu02Info_t info_M  = 0;
	bsrsv2Info_t info_L, info_U;
	cusparseStatus_t status1, status2, status3;

	double tval[] = {0, 0, 1, 2, 3, 0, 4, 5, 6};
	int trow[] = {0, 3, 6, 9};
	int tcol[] = {0, 1, 2, 0, 1, 2, 0, 1, 2};
	mb = (matrixN + blockDim - 1) / blockDim;

	int nnzb;
	int *h_nnzTotal = &nnzb;

	double *d_aVals;
	double *d_bValsM, *d_bVals;
	int *d_aCols, *d_aRows, *d_bCols, *d_bRows;

	checkCudaErrors(cusparseCreate(&cusparseHandle));

	checkCudaErrors(cudaMalloc((void**)&d_aVals, sizeof(double)*NNZ));
	checkCudaErrors(cudaMalloc((void**)&d_bValsM, sizeof(double)*NNZ));
	checkCudaErrors(cudaMalloc((void**)&d_aCols, sizeof(int)*NNZ));
	checkCudaErrors(cudaMalloc((void**)&d_aRows, sizeof(int)*(DIM+1)));

	printf("cudaMalloced\n");

	checkCudaErrors(cudaMemcpy(d_aVals, tval, (size_t)(NNZ * sizeof(tval[0])), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_aCols, tcol, (size_t)(NNZ * sizeof(tcol[0])), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_aRows, trow, (size_t)(DIM * sizeof(trow[0])), cudaMemcpyHostToDevice));

	printf("Copied to GPU\n");

	int pBufferSize_M, pBufferSize_L, pBufferSize_U, pBufferSize;

	// step 1: create a descriptor which contains
	cusparseCreateMatDescr(&descra);
	cusparseSetMatType(descra,CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;

	cusparseSetMatIndexBase(descra, base_type);

	getLastCudaError("descriptors created and initialized\n");

	cusparseCreateMatDescr(&descr_M);
	cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatIndexBase(descr_M, base_type);

	cusparseCreateMatDescr(&descr_L);
	cusparseSetMatIndexBase(descr_L, base_type);
	cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
	cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);

	cusparseCreateMatDescr(&descr_U);
	cusparseSetMatIndexBase(descr_U, base_type);
	cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
	cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
	getLastCudaError("descriptors created and initialized\n");

	printf("Created matrix descriptors\n");

	// step 2: create a empty info structure
	// we need one info for csrilu02 and two info's for csrsv2
	cusparseCreateBsrilu02Info(&info_M);
	cusparseCreateBsrsv2Info(&info_L);
	cusparseCreateBsrsv2Info(&info_U);
	getLastCudaError("infos created\n");

	checkCudaErrors(cudaMalloc((void**)&d_bRows, sizeof(int)*(mb+1)));
	checkCudaErrors(cusparseXcsr2bsrNnz(cusparseHandle, dir, matrixN, matrixN,
		descra, d_aRows, d_aCols, blockDim, descr_M, d_bRows, h_nnzTotal));
	if(NULL != h_nnzTotal){
		nnzb = *h_nnzTotal;
	}else{
		checkCudaErrors(cudaMemcpy(&nnzb, d_bRows+mb, sizeof(int), cudaMemcpyDeviceToHost));
		checkCudaErrors(cudaMemcpy(&base, d_bRows, sizeof(int), cudaMemcpyDeviceToHost));
		nnzb -= base;
	}
	checkCudaErrors(cudaMalloc((void**)&d_bCols, sizeof(int)*nnzb));
	checkCudaErrors(cudaMalloc((void**)&d_bVals, sizeof(double)*blockDim*blockDim*nnzb));
	checkCudaErrors(cusparseDcsr2bsr(cusparseHandle, dir, matrixN, matrixN,
		descra, d_aVals, d_aRows, d_aCols, blockDim, descr_M, d_bVals, d_bRows, d_bCols));
	printf("nnzb: %d, mb: %d\n", nnzb, mb);

	cusparseDbsrilu02_bufferSize(cusparseHandle, dir, mb, nnzb,
		descr_M, d_bVals, d_bRows, d_bCols, blockDim, info_M, &pBufferSize_M);
	cusparseDbsrsv2_bufferSize(cusparseHandle, dir, trans, mb, nnzb,
		descr_L, d_bVals, d_bRows, d_bCols, blockDim, info_L, &pBufferSize_L);
	cusparseDbsrsv2_bufferSize(cusparseHandle, dir, trans, mb, nnzb,
		descr_U, d_aVals, d_bRows, d_bCols, blockDim, info_U, &pBufferSize_U);
	getLastCudaError("buffersizes checked\n");
	pBufferSize = std::max(pBufferSize_M, std::max(pBufferSize_L, pBufferSize_U));
	checkCudaErrors(cudaMalloc((void**)&pBuffer, pBufferSize));
	
	// start the analysis
	checkCudaErrors(cusparseDbsrilu02_analysis(cusparseHandle, dir, mb, nnzb, descra,
		d_bVals, d_bRows, d_bCols, blockDim, info_M, policy, pBuffer));
	int structural_zero;
	status1 = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
	if (CUSPARSE_STATUS_ZERO_PIVOT == status1){
	   printf("WARNING A(%d,%d) is missing\n", structural_zero, structural_zero);
	}

	checkCudaErrors(cusparseDbsrsv2_analysis(cusparseHandle, dir, trans, mb, nnzb, 
		descr_L, d_bVals, d_bRows, d_bCols, 
		blockDim, info_L, policy, pBuffer));
	checkCudaErrors(cusparseDbsrsv2_analysis(cusparseHandle, dir, trans, mb, nnzb, 
		descr_U, d_bVals, d_bRows, d_bCols, 
		blockDim, info_U, policy, pBuffer));

	// create prec
	checkCudaErrors(cusparseDbsrilu02(cusparseHandle, dir, mb, nnzb, descr_M, 
				d_bValsM, d_bRows, d_bCols, 
				blockDim, info_M, policy, pBuffer));

	int numerical_zero;
	status1 = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &numerical_zero);
	if (CUSPARSE_STATUS_ZERO_PIVOT == status1){
		printf("ERROR block U(%d,%d) is not invertible\n", numerical_zero, numerical_zero);
		return 1;
	}

	printf("Created prec\n");

	return 0;
}