cusolverRf: LU factorization pivoting parameters

Hi All,
I’m trying to use the new cusolverRf methods to solve Ax=b. I get wrong results for matrices A where P and Q are not equal to the identity matrices (PAQ = L*U).

As far as I can tell this how to use cusolverRf:

  • Factorize A on the host to obtain P, Q, L, U
  • Use the cusolverRf library:
    1. Create a cusolverRfHandle
    2. Call cusolverRfSetupHost: Pass A, L, U (in CSR format), P,Q (as permuation arrays) [all host memory]
    3. Call cusolverRfAnalyze
    4. Call cusolverRfRefactor
    5. Call cusolverRfSolve: Pass P, Q, Temp, b [all device memory: there's a error in the documentation http://docs.nvidia.com/cuda/cusolver/index.html#glu-lt-t-gt-solve which says Temp and b should be on the host]
    6. Destroy the handle using cusolverRfDestroy

The following python code (requires PyCuda) wraps some cusolverRf functions and solves 3 small systems Ax=b using the procedure described above [line 173ff]. I first factorize A on the host using scipy.sparse.linalg.splu() to get L, U, P, Q and then pass them to the cusolverRf functions.
For reference purposes I solve the same systems using scipy.sparse.linalg.spsolve() and cusolverSpDcsrlsvqr().

import numpy as np
import ctypes
import scipy.sparse as sps
import scipy.sparse.linalg as spl
import matplotlib.pyplot as plt

# GPU specific imports / definitions
from pycuda.autoinit import context
from pycuda import gpuarray

###############################################################################
# Wrappers for all required cusolver/cusparse functions
###############################################################################
# cuSparse
_libcusparse = ctypes.cdll.LoadLibrary('libcusparse.so')

_libcusparse.cusparseCreate.restype = int
_libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p]

_libcusparse.cusparseDestroy.restype = int
_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p]

_libcusparse.cusparseCreateMatDescr.restype = int
_libcusparse.cusparseCreateMatDescr.argtypes = [ctypes.c_void_p]

# cuSOLVER
_libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')

_libcusolver.cusolverSpCreate.restype = int
_libcusolver.cusolverSpCreate.argtypes = [ctypes.c_void_p]

_libcusolver.cusolverSpDestroy.restype = int
_libcusolver.cusolverSpDestroy.argtypes = [ctypes.c_void_p]

_libcusolver.cusolverSpDcsrlsvqr.restype = int
_libcusolver.cusolverSpDcsrlsvqr.argtypes= [ctypes.c_void_p,
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            ctypes.c_void_p,
                                            ctypes.c_void_p,
                                            ctypes.c_void_p,
                                            ctypes.c_void_p,
                                            ctypes.c_void_p,
                                            ctypes.c_double,
                                            ctypes.c_int,
                                            ctypes.c_void_p,
                                            ctypes.c_void_p]
# cusolver-Rf
def cusolver_check_status(status):
    if status != 0:
        raise RuntimeError("CUDA returned with status " + str(status))

_libcusolver.cusolverRfCreate.restype = int
_libcusolver.cusolverRfCreate.argtypes = [ctypes.c_void_p]
def cusolverRfCreate():
    handle = ctypes.c_void_p()
    cusolver_check_status(_libcusolver.cusolverRfCreate(ctypes.byref(handle)))
    return handle.value

_libcusolver.cusolverRfDestroy.restype = int
_libcusolver.cusolverRfDestroy.argtypes = [ctypes.c_void_p]
def cusolverRfDestroy(handle):
    cusolver_check_status(_libcusolver.cusolverRfDestroy(handle))

_libcusolver.cusolverRfSetupHost.restype = int
_libcusolver.cusolverRfSetupHost.argtypes = [ctypes.c_int,     # n
                                          ctypes.c_int,     # nnz
                                          ctypes.c_void_p,  # csrRowA
                                          ctypes.c_void_p,  # csrColIndA
                                          ctypes.c_void_p,  # csrValA
                                          ctypes.c_int,     # nnzL
                                          ctypes.c_void_p,  # csrRowPtrL
                                          ctypes.c_void_p,  # csrColIndL
                                          ctypes.c_void_p,  # csrValL
                                          ctypes.c_int,     # nnzU
                                          ctypes.c_void_p,  # csrRowPtrU
                                          ctypes.c_void_p,  # csrColIndU
                                          ctypes.c_void_p,  # csrValU
                                          ctypes.c_void_p,  # P
                                          ctypes.c_void_p,  # Q
                                          ctypes.c_void_p ] # handle
def cusolverRfSetupHost(n, nnzA, csrRowPtrA, csrColIndA, csrValA,
                    nnzL, csrRowPtrL, csrColIndL, csrValL,
                    nnzU, csrRowPtrU, csrColIndU, csrValU,
                    P, Q, handle):
    cusolver_check_status(_libcusolver.cusolverRfSetupHost(
        n,
        nnzA, int(csrRowPtrA.ctypes.data), int(csrColIndA.ctypes.data), int(csrValA.ctypes.data),
        nnzL, int(csrRowPtrL.ctypes.data), int(csrColIndL.ctypes.data), int(csrValL.ctypes.data),
        nnzU, int(csrRowPtrU.ctypes.data), int(csrColIndU.ctypes.data), int(csrValU.ctypes.data),
        int(P.ctypes.data), int(Q.ctypes.data), handle))

_libcusolver.cusolverRfAnalyze.restype = int
_libcusolver.cusolverRfAnalyze.argtypes = [ctypes.c_void_p]
def cusolverRfAnalyze(handle):
    cusolver_check_status(_libcusolver.cusolverRfAnalyze(handle))

_libcusolver.cusolverRfRefactor.restype = ctypes.c_int
_libcusolver.cusolverRfRefactor.argtypes = [ctypes.c_void_p]
def cusolverRfRefactor(handle):
    cusolver_check_status(_libcusolver.cusolverRfRefactor(handle))

_libcusolver.cusolverRfSolve.restype = int
_libcusolver.cusolverRfSolve.argtypes = [ctypes.c_void_p, #handle
                                         ctypes.c_void_p, #P
                                         ctypes.c_void_p, #Q
                                         ctypes.c_int,    #nrhs
                                         ctypes.c_void_p, #Temp
                                         ctypes.c_int,    #ldt
                                         ctypes.c_void_p, #XF
                                         ctypes.c_int]    #ldxf
def cusolverRfSolve(P, Q, nrhs, Temp, ldt, XF, ldxf, handle):
    cusolver_check_status(_libcusolver.cusolverRfSolve(
        handle, int(P.gpudata), int(Q.gpudata), nrhs, int(Temp.gpudata),
        ldt, int(XF.gpudata), ldxf)
        )
###############################################################################
###############################################################################

def solve_QR(Acsr,b):
    """ Solve Ax =b using cusolverSpDcsrlsvqr """
    # prepare parameters
    dcsrVal = gpuarray.to_gpu(Acsr.data)
    dcsrColInd = gpuarray.to_gpu(Acsr.indices)
    dcsrIndPtr = gpuarray.to_gpu(Acsr.indptr)
    db = gpuarray.to_gpu(b)
    dx = gpuarray.empty_like(db)
    m = ctypes.c_int(Acsr.shape[0])
    nnz = ctypes.c_int(Acsr.nnz)
    descrA = ctypes.c_void_p()
    reorder = ctypes.c_int(0)
    tol = ctypes.c_double(1e-10)
    singularity = ctypes.c_int(99)

    #create cusparse-handle
    _cusp_handle = ctypes.c_void_p()
    status = _libcusparse.cusparseCreate(ctypes.byref(_cusp_handle))
    cusp_handle = _cusp_handle.value
    #create cusolver-handle
    _cuso_handle = ctypes.c_void_p()
    status = _libcusolver.cusolverSpCreate(ctypes.byref(_cuso_handle))
    cuso_handle = _cuso_handle.value

    #create MatDescriptor
    status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(descrA))

    stat = _libcusolver.cusolverSpDcsrlsvqr(cuso_handle,
                                     m,
                                     nnz,
                                     descrA,
                                     int(dcsrVal.gpudata),
                                     int(dcsrIndPtr.gpudata),
                                     int(dcsrColInd.gpudata),
                                     int(db.gpudata),
                                     tol,
                                     reorder,
                                     int(dx.gpudata),
                                     ctypes.byref(singularity))
    xx = dx.get()
    # destroy handle
    status = _libcusolver.cusolverSpDestroy(cuso_handle)
    status = _libcusparse.cusparseDestroy(cusp_handle)
    return xx

def solve_LURf(A,b):
    """ Solve Ax=b using cusolverRf:
    1. cusolverRfCreate
    2. cusolverRfSetupHost
    3. cusolverRfAnalyze
    4. cusolverRfRefactor
    5. cusolverRfSolve
    6. cusolverRfDestroy
    """
    A = A.tocsc()
    n = A.shape[0]
    lu_obj = spl.splu(A)
    L = lu_obj.L.tocsr()
    U = lu_obj.U.tocsr()
    A = A.tocsr()
    P = lu_obj.perm_r
    Q = lu_obj.perm_c
    print('P: ' + str(P))
    print('Q: ' + str(Q))

    dP = gpuarray.to_gpu(P)
    dQ = gpuarray.to_gpu(Q)

h = cusolverRfCreate()
    context.synchronize()
    cusolverRfSetupHost(n,
                        A.nnz, A.indptr, A.indices, A.data,
                        L.nnz, L.indptr, L.indices, L.data,
                        U.nnz, U.indptr, U.indices, U.data,
                        P, Q, h)

    context.synchronize()
    cusolverRfAnalyze(h)
    context.synchronize()
    cusolverRfRefactor(h)

    #prepare parameters
    nrhs = 1
    ldt = n
    Temp = gpuarray.to_gpu(np.zeros(n * nrhs, dtype=np.float64))
    db = gpuarray.to_gpu(b)
    ldxf = n

    context.synchronize()
    cusolverRfSolve(dP, dQ, nrhs, Temp, ldt, db, ldxf, h)
    context.synchronize()
    x = db.get()
    cusolverRfDestroy(h)
    return x

###############################################################################
# Start script
###############################################################################

# Working example
m = 3
A = sps.dia_matrix(([2*np.ones(m),np.ones(m)],[0,1]),shape=(m,m)).tocsr()
print('Working matrix A:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))

# Working example:
m = 3
A = sps.dia_matrix(([np.ones(m),2*np.ones(m),np.ones(m)],[-1,0,1]),shape=(m,m)).tocsr()
print('\nWorking matrix A2:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))

# Nonworking example: CUSOLVER_STATUS_ZERO_PIVOT
m = 3
A = sps.dia_matrix(([3*np.ones(m),2*np.ones(m),np.ones(m)],[-1,0,1]),shape=(m,m)).tocsr()
print('\nNonworking (ZERO_PIVOT) matrix A:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))

This is the output of above script, showing the matrix, the results from the different solvers as well as the P and Q vectors:

Working matrix A:
[[ 2.  1.  0.]
 [ 0.  2.  1.]
 [ 0.  0.  2.]]
Scipy: [ 0.375  0.25   0.5  ]
QR: [ 0.375  0.25   0.5  ]
P: [0 1 2]
Q: [0 1 2]
LURf: [ 0.375  0.25   0.5  ]

Working matrix A2:
[[ 2.  1.  0.]
 [ 1.  2.  1.]
 [ 0.  1.  2.]]
Scipy: [ 0.5  0.   0.5]
QR: [  5.00000000e-01   1.99045548e-16   5.00000000e-01]
P: [0 1 2]
Q: [0 1 2]
LURf: [ 0.5  0.   0.5]

Nonworking (ZERO_PIVOT) matrix A:
[[ 2.  1.  0.]
 [ 3.  2.  1.]
 [ 0.  3.  2.]]
Scipy: [ 0.  1. -1.]
QR: [  1.84752279e-16   1.00000000e+00  -1.00000000e+00]
P: [2 0 1]
Q: [0 1 2]
Traceback (most recent call last):
  File "QR_vs_LU.py", line 272, in <module>
    xLU = solve_LURf(A,b)
  File "QR_vs_LU.py", line 213, in solve_LURf
    cusolverRfRefactor(h)
  File "QR_vs_LU.py", line 113, in cusolverRfRefactor
    cusolver_check_status(_libcusolver.cusolverRfRefactor(handle))
  File "QR_vs_LU.py", line 61, in cusolver_check_status
    raise RuntimeError("CUDA returned with status " + str(status))
RuntimeError: CUDA returned with status 10

The code yields the correct result for matrix 1 and matrix 2 and fails in the cusolverRfRefactor() function with a CUSOLVER_STATUS_ZERO_PIVOT for matrix 3.
The P and Q matrices are both the identity matrix for the two working examples, which suggests there is some error in how I pass the pivoting parameters.

Does anyone have an idea of how to correctly pass the parameters P,Q to the cusolverRf library?
Thanks a lot for your help!

To answer my own question:
The permutation arrays P,Q returned by scipy.sparse.linalg.splu (which uses SuperLU internally) do not match the definitions required in cusolverRf. Replacing P with inv§ and Q with inv(Q) solves the problem.