Hi All,
I’m trying to use the new cusolverRf methods to solve Ax=b. I get wrong results for matrices A where P and Q are not equal to the identity matrices (PAQ = L*U).
As far as I can tell this how to use cusolverRf:
- Factorize A on the host to obtain P, Q, L, U
- Use the cusolverRf library:
- Create a cusolverRfHandle
- Call cusolverRfSetupHost: Pass A, L, U (in CSR format), P,Q (as permuation arrays) [all host memory]
- Call cusolverRfAnalyze
- Call cusolverRfRefactor
- Call cusolverRfSolve: Pass P, Q, Temp, b [all device memory: there's a error in the documentation http://docs.nvidia.com/cuda/cusolver/index.html#glu-lt-t-gt-solve which says Temp and b should be on the host]
- Destroy the handle using cusolverRfDestroy
The following python code (requires PyCuda) wraps some cusolverRf functions and solves 3 small systems Ax=b using the procedure described above [line 173ff]. I first factorize A on the host using scipy.sparse.linalg.splu() to get L, U, P, Q and then pass them to the cusolverRf functions.
For reference purposes I solve the same systems using scipy.sparse.linalg.spsolve() and cusolverSpDcsrlsvqr().
import numpy as np
import ctypes
import scipy.sparse as sps
import scipy.sparse.linalg as spl
import matplotlib.pyplot as plt
# GPU specific imports / definitions
from pycuda.autoinit import context
from pycuda import gpuarray
###############################################################################
# Wrappers for all required cusolver/cusparse functions
###############################################################################
# cuSparse
_libcusparse = ctypes.cdll.LoadLibrary('libcusparse.so')
_libcusparse.cusparseCreate.restype = int
_libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p]
_libcusparse.cusparseDestroy.restype = int
_libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p]
_libcusparse.cusparseCreateMatDescr.restype = int
_libcusparse.cusparseCreateMatDescr.argtypes = [ctypes.c_void_p]
# cuSOLVER
_libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')
_libcusolver.cusolverSpCreate.restype = int
_libcusolver.cusolverSpCreate.argtypes = [ctypes.c_void_p]
_libcusolver.cusolverSpDestroy.restype = int
_libcusolver.cusolverSpDestroy.argtypes = [ctypes.c_void_p]
_libcusolver.cusolverSpDcsrlsvqr.restype = int
_libcusolver.cusolverSpDcsrlsvqr.argtypes= [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_double,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p]
# cusolver-Rf
def cusolver_check_status(status):
if status != 0:
raise RuntimeError("CUDA returned with status " + str(status))
_libcusolver.cusolverRfCreate.restype = int
_libcusolver.cusolverRfCreate.argtypes = [ctypes.c_void_p]
def cusolverRfCreate():
handle = ctypes.c_void_p()
cusolver_check_status(_libcusolver.cusolverRfCreate(ctypes.byref(handle)))
return handle.value
_libcusolver.cusolverRfDestroy.restype = int
_libcusolver.cusolverRfDestroy.argtypes = [ctypes.c_void_p]
def cusolverRfDestroy(handle):
cusolver_check_status(_libcusolver.cusolverRfDestroy(handle))
_libcusolver.cusolverRfSetupHost.restype = int
_libcusolver.cusolverRfSetupHost.argtypes = [ctypes.c_int, # n
ctypes.c_int, # nnz
ctypes.c_void_p, # csrRowA
ctypes.c_void_p, # csrColIndA
ctypes.c_void_p, # csrValA
ctypes.c_int, # nnzL
ctypes.c_void_p, # csrRowPtrL
ctypes.c_void_p, # csrColIndL
ctypes.c_void_p, # csrValL
ctypes.c_int, # nnzU
ctypes.c_void_p, # csrRowPtrU
ctypes.c_void_p, # csrColIndU
ctypes.c_void_p, # csrValU
ctypes.c_void_p, # P
ctypes.c_void_p, # Q
ctypes.c_void_p ] # handle
def cusolverRfSetupHost(n, nnzA, csrRowPtrA, csrColIndA, csrValA,
nnzL, csrRowPtrL, csrColIndL, csrValL,
nnzU, csrRowPtrU, csrColIndU, csrValU,
P, Q, handle):
cusolver_check_status(_libcusolver.cusolverRfSetupHost(
n,
nnzA, int(csrRowPtrA.ctypes.data), int(csrColIndA.ctypes.data), int(csrValA.ctypes.data),
nnzL, int(csrRowPtrL.ctypes.data), int(csrColIndL.ctypes.data), int(csrValL.ctypes.data),
nnzU, int(csrRowPtrU.ctypes.data), int(csrColIndU.ctypes.data), int(csrValU.ctypes.data),
int(P.ctypes.data), int(Q.ctypes.data), handle))
_libcusolver.cusolverRfAnalyze.restype = int
_libcusolver.cusolverRfAnalyze.argtypes = [ctypes.c_void_p]
def cusolverRfAnalyze(handle):
cusolver_check_status(_libcusolver.cusolverRfAnalyze(handle))
_libcusolver.cusolverRfRefactor.restype = ctypes.c_int
_libcusolver.cusolverRfRefactor.argtypes = [ctypes.c_void_p]
def cusolverRfRefactor(handle):
cusolver_check_status(_libcusolver.cusolverRfRefactor(handle))
_libcusolver.cusolverRfSolve.restype = int
_libcusolver.cusolverRfSolve.argtypes = [ctypes.c_void_p, #handle
ctypes.c_void_p, #P
ctypes.c_void_p, #Q
ctypes.c_int, #nrhs
ctypes.c_void_p, #Temp
ctypes.c_int, #ldt
ctypes.c_void_p, #XF
ctypes.c_int] #ldxf
def cusolverRfSolve(P, Q, nrhs, Temp, ldt, XF, ldxf, handle):
cusolver_check_status(_libcusolver.cusolverRfSolve(
handle, int(P.gpudata), int(Q.gpudata), nrhs, int(Temp.gpudata),
ldt, int(XF.gpudata), ldxf)
)
###############################################################################
###############################################################################
def solve_QR(Acsr,b):
""" Solve Ax =b using cusolverSpDcsrlsvqr """
# prepare parameters
dcsrVal = gpuarray.to_gpu(Acsr.data)
dcsrColInd = gpuarray.to_gpu(Acsr.indices)
dcsrIndPtr = gpuarray.to_gpu(Acsr.indptr)
db = gpuarray.to_gpu(b)
dx = gpuarray.empty_like(db)
m = ctypes.c_int(Acsr.shape[0])
nnz = ctypes.c_int(Acsr.nnz)
descrA = ctypes.c_void_p()
reorder = ctypes.c_int(0)
tol = ctypes.c_double(1e-10)
singularity = ctypes.c_int(99)
#create cusparse-handle
_cusp_handle = ctypes.c_void_p()
status = _libcusparse.cusparseCreate(ctypes.byref(_cusp_handle))
cusp_handle = _cusp_handle.value
#create cusolver-handle
_cuso_handle = ctypes.c_void_p()
status = _libcusolver.cusolverSpCreate(ctypes.byref(_cuso_handle))
cuso_handle = _cuso_handle.value
#create MatDescriptor
status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(descrA))
stat = _libcusolver.cusolverSpDcsrlsvqr(cuso_handle,
m,
nnz,
descrA,
int(dcsrVal.gpudata),
int(dcsrIndPtr.gpudata),
int(dcsrColInd.gpudata),
int(db.gpudata),
tol,
reorder,
int(dx.gpudata),
ctypes.byref(singularity))
xx = dx.get()
# destroy handle
status = _libcusolver.cusolverSpDestroy(cuso_handle)
status = _libcusparse.cusparseDestroy(cusp_handle)
return xx
def solve_LURf(A,b):
""" Solve Ax=b using cusolverRf:
1. cusolverRfCreate
2. cusolverRfSetupHost
3. cusolverRfAnalyze
4. cusolverRfRefactor
5. cusolverRfSolve
6. cusolverRfDestroy
"""
A = A.tocsc()
n = A.shape[0]
lu_obj = spl.splu(A)
L = lu_obj.L.tocsr()
U = lu_obj.U.tocsr()
A = A.tocsr()
P = lu_obj.perm_r
Q = lu_obj.perm_c
print('P: ' + str(P))
print('Q: ' + str(Q))
dP = gpuarray.to_gpu(P)
dQ = gpuarray.to_gpu(Q)
h = cusolverRfCreate()
context.synchronize()
cusolverRfSetupHost(n,
A.nnz, A.indptr, A.indices, A.data,
L.nnz, L.indptr, L.indices, L.data,
U.nnz, U.indptr, U.indices, U.data,
P, Q, h)
context.synchronize()
cusolverRfAnalyze(h)
context.synchronize()
cusolverRfRefactor(h)
#prepare parameters
nrhs = 1
ldt = n
Temp = gpuarray.to_gpu(np.zeros(n * nrhs, dtype=np.float64))
db = gpuarray.to_gpu(b)
ldxf = n
context.synchronize()
cusolverRfSolve(dP, dQ, nrhs, Temp, ldt, db, ldxf, h)
context.synchronize()
x = db.get()
cusolverRfDestroy(h)
return x
###############################################################################
# Start script
###############################################################################
# Working example
m = 3
A = sps.dia_matrix(([2*np.ones(m),np.ones(m)],[0,1]),shape=(m,m)).tocsr()
print('Working matrix A:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))
# Working example:
m = 3
A = sps.dia_matrix(([np.ones(m),2*np.ones(m),np.ones(m)],[-1,0,1]),shape=(m,m)).tocsr()
print('\nWorking matrix A2:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))
# Nonworking example: CUSOLVER_STATUS_ZERO_PIVOT
m = 3
A = sps.dia_matrix(([3*np.ones(m),2*np.ones(m),np.ones(m)],[-1,0,1]),shape=(m,m)).tocsr()
print('\nNonworking (ZERO_PIVOT) matrix A:')
print(A.todense())
b = np.ones(m,dtype=np.float64)
xScipy = spl.spsolve(A,b)
xQR = solve_QR(A,b)
print('Scipy: ' + str(xScipy))
print('QR: ' + str(xQR))
xLU = solve_LURf(A,b)
print('LURf: ' + str(xLU))
This is the output of above script, showing the matrix, the results from the different solvers as well as the P and Q vectors:
Working matrix A:
[[ 2. 1. 0.]
[ 0. 2. 1.]
[ 0. 0. 2.]]
Scipy: [ 0.375 0.25 0.5 ]
QR: [ 0.375 0.25 0.5 ]
P: [0 1 2]
Q: [0 1 2]
LURf: [ 0.375 0.25 0.5 ]
Working matrix A2:
[[ 2. 1. 0.]
[ 1. 2. 1.]
[ 0. 1. 2.]]
Scipy: [ 0.5 0. 0.5]
QR: [ 5.00000000e-01 1.99045548e-16 5.00000000e-01]
P: [0 1 2]
Q: [0 1 2]
LURf: [ 0.5 0. 0.5]
Nonworking (ZERO_PIVOT) matrix A:
[[ 2. 1. 0.]
[ 3. 2. 1.]
[ 0. 3. 2.]]
Scipy: [ 0. 1. -1.]
QR: [ 1.84752279e-16 1.00000000e+00 -1.00000000e+00]
P: [2 0 1]
Q: [0 1 2]
Traceback (most recent call last):
File "QR_vs_LU.py", line 272, in <module>
xLU = solve_LURf(A,b)
File "QR_vs_LU.py", line 213, in solve_LURf
cusolverRfRefactor(h)
File "QR_vs_LU.py", line 113, in cusolverRfRefactor
cusolver_check_status(_libcusolver.cusolverRfRefactor(handle))
File "QR_vs_LU.py", line 61, in cusolver_check_status
raise RuntimeError("CUDA returned with status " + str(status))
RuntimeError: CUDA returned with status 10
The code yields the correct result for matrix 1 and matrix 2 and fails in the cusolverRfRefactor() function with a CUSOLVER_STATUS_ZERO_PIVOT for matrix 3.
The P and Q matrices are both the identity matrix for the two working examples, which suggests there is some error in how I pass the pivoting parameters.
Does anyone have an idea of how to correctly pass the parameters P,Q to the cusolverRf library?
Thanks a lot for your help!