Hi all,
I’m trying to use the cuSOLVER-sparse library to solve Ax=b where A (very sparse) stems from a 3D-Poisson equation discretization and I am experiencing strange problems.
The matrix A basically consists of the main diagonal and six off-diagonals at offsets (-nx*ny, -nx, -1, 0, 1, nx, nx*ny), where nx, ny, nz are the dimensions of the 3D domain (mesh). The solver I’m trying to use is cusolverSpDcsrlsvqr().
Problem description:
Solving a system of size nx*ny*nz = 32*32*16 fails: cusolverSpDcsrlsvqr() returns cusolverStatus_t 7 (CUSOLVER_STATUS_INTERNAL_ERROR). Solving a system of nx*ny*nz = 16*32*32 works fine.
Note that the nnz and dimensions of A are the same in both cases.
I suspected this to be due to the larger bandwidth of the 32*32*16 matrix and the resulting bigger fill-in, which could lead to a too-high memory consumption and consequent errors; however, this seems not to be the case.
Here’s a (rather large, sorry!) minimum working example with matrices similar to the ones I’m using. It loads the matrix from file (Python code below), measures the memory requirements using the cusolverSpXcsrqrAnalysisBatched()/cusolverSpDcsrqrBufferInfoBatched() methods and solves the system using cusolverSpDcsrlsvqr().
//test.cpp
#include <fstream>
#include <iostream>
#include <iterator>
#include <vector>
#include <string>
#include <type_traits>
#include <sstream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
//solve Ax=b using cusolverSpDcsrlsvqr()
//A is in csr format
void solve(int nnz, int m, double tol, double *dcsrVal, int *dcsrColInd,
int *dcsrRowPtr, double *db, double *dx) {
// --- create library handles:
cusolverSpHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
cusparseHandle_t cusparse_handle;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
// --- prepare solving and copy to GPU:
int reorder = 0;
int singularity = 0;
// create matrix descriptor
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
cudaDeviceSynchronize();
//solve the system
cusolver_status = cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
&singularity);
cudaDeviceSynchronize();
std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
std::cout << "singularity (should be -1): " << singularity << std::endl;
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
}
// write info about memory requirements of the qr-decomposition to stdout
void get_memInfo(int nnz, int m, double tol, double *csrVal, int *csrColInd,
int *csrRowPtr, double *b, double *x) {
int* dCol, *dRow;
double* dVal;
cudaError_t error;
//allocate device memory, copy H2D
cudaMalloc((void**)&dCol, sizeof(int)*nnz);
cudaMalloc((void**)&dRow, sizeof(int)*(m+1));
cudaMalloc((void**)&dVal, sizeof(double)*nnz);
cudaMemcpy(dCol, csrColInd, sizeof(int)*nnz, cudaMemcpyHostToDevice);
cudaMemcpy(dRow, csrRowPtr, sizeof(int)*(m+1), cudaMemcpyHostToDevice);
cudaMemcpy(dVal, csrVal, sizeof(double)*nnz, cudaMemcpyHostToDevice);
error = cudaGetLastError();
std::cout << "Error status after cudaMemcpy in getmemInfo: " << error << std::endl;
//create and initialize library handles
cusolverSpHandle_t cusolver_handle;
cusparseHandle_t cusparse_handle;
cusolverStatus_t cusolver_status;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status cusparseCreate: " << cusparse_status << std::endl;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status cusolverSpCreate: " << cusolver_status << std::endl;
//create CsrqrInfo
csrqrInfo_t info;
cusolver_status = cusolverSpCreateCsrqrInfo(&info);
std::cout << "status cusolverSpCrateCsrqrInfo: " << cusolver_status << std::endl;
//create mat descriptor
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
cusparse_status = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
cudaDeviceSynchronize();
//call SpDcsrqrAnalysisBatched.
cusolver_status = cusolverSpXcsrqrAnalysisBatched(cusolver_handle,
m,
m,
nnz,
descrA,
dRow,
dCol,
info);
std::cout << "status cusolverSpDcsrqrAnalysisBatched: " << cusolver_status << std::endl;
//get the buffer size via BufferInfoBatched
int batchsize = 1;
size_t internalDataInBytes = 99;
size_t workspaceInBytes = 99;
cusolver_status = cusolverSpDcsrqrBufferInfoBatched(cusolver_handle,
m,
m,
nnz,
descrA,
dVal,
dRow,
dCol,
batchsize,
info,
&internalDataInBytes,
&workspaceInBytes);
std::cout << "status cusolverSpDcsrqrBufferInfoBatched: " << cusolver_status << std::endl;
std::cout << "internalbuffer(Bytes): " << internalDataInBytes << std::endl;
std::cout << "workspace(Bytes): " << workspaceInBytes << std::endl;
//destroy stuff
cusolver_status = cusolverSpDestroyCsrqrInfo(info);
std::cout << "status cusolverSpDestroyCsrqrInfo: " << cusolver_status << std::endl;
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status cusparseDestroy: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status cusolverSpDestroy: " << cusolver_status << std::endl;
cudaFree(dCol);
cudaFree(dRow);
cudaFree(dVal);
}
// allocates device memory and calls the solve() function
void solve_hostpointer(int nnz, int m, double tol, double *csrVal, int *csrColInd,
int *csrRowPtr, double *b, double *x)
{
double* dVal, *db, *dx;
int* dCol, *dRow;
cudaError_t error;
cudaMalloc((void**)&dVal, sizeof(double)*nnz);
cudaMalloc((void**)&dCol, sizeof(int)*nnz);
cudaMalloc((void**)&dRow, sizeof(int)*(m+1));
cudaMalloc((void**)&db, sizeof(double)*m);
cudaMalloc((void**)&dx, sizeof(double)*m);
error = cudaGetLastError();
std::cout << "Error status after cudaMalloc: " << error << std::endl;
cudaMemcpy(dVal, csrVal, sizeof(double)*nnz, cudaMemcpyHostToDevice);
cudaMemcpy(dCol, csrColInd, sizeof(int)*nnz, cudaMemcpyHostToDevice);
cudaMemcpy(dRow, csrRowPtr, sizeof(int)*(m+1), cudaMemcpyHostToDevice);
cudaMemcpy(db, b, sizeof(double)*m, cudaMemcpyHostToDevice);
cudaMemcpy(dx, x, sizeof(double)*m, cudaMemcpyHostToDevice);
error = cudaGetLastError();
std::cout << "Error status after memcpy H2D: " << error << std::endl;
cudaDeviceSynchronize();
solve(nnz, m, tol, dVal, dCol, dRow, db, dx);
error = cudaGetLastError();
std::cout << "Error status after solve(): " << error << std::endl;
cudaMemcpy(x, dx, sizeof(double)*m, cudaMemcpyDeviceToHost);
error = cudaGetLastError();
std::cout << "Error status after memcpy D2H: " << error << std::endl;
cudaDeviceSynchronize();
cudaFree(dVal);
cudaFree(dCol);
cudaFree(dRow);
cudaFree(db);
cudaFree(dx);
}
//input/output of matrices (vectors) //////////////////////////////////////////
// Read one numeric value per line from `filename` into a vector.
// T may be a floating-point type (parsed with std::stod) or an integer type
// (parsed with std::stoi). Blank lines are skipped.
// Returns an empty vector (with a diagnostic on stderr) if the file cannot
// be opened — same contract as before, but no longer silent.
template<class T>
std::vector<T> read_vector(std::string filename) {
    std::vector<T> res;
    std::ifstream file(filename);
    if (!file) {
        // BUG FIX: a missing file used to return an empty vector silently,
        // which made nnz/m come out as 0/-1 downstream with no hint why
        std::cerr << "read_vector: could not open '" << filename << "'\n";
        return res;
    }
    std::string line;
    while (std::getline(file, line)) {
        if (line.empty()) {
            continue;  // BUG FIX: stod/stoi would throw on a blank line
        }
        if (std::is_floating_point<T>::value) {
            res.push_back(static_cast<T>(std::stod(line)));
        } else {
            res.push_back(static_cast<T>(std::stoi(line)));
        }
    }
    return res;
}
// Write the elements of `vec` to `filename`, one per line.
// BUG FIX: the iterator was ostream_iterator<double> regardless of T, so an
// int vector with values >= 10^7 was formatted with the stream's default
// 6-significant-digit precision (e.g. 1234567 -> "1.23457e+06"), corrupting
// any integer data written through this function. Using ostream_iterator<T>
// keeps double output identical and prints integers exactly.
template<class T>
void write_vector(std::string filename, std::vector<T> vec) {
    std::ofstream output_file(filename);
    std::ostream_iterator<T> output_iterator(output_file, "\n");
    std::copy(vec.begin(), vec.end(), output_iterator);
}
///////////////////////////////////////////////////////////////////////////////
// Entry point: ./a.out nx ny nz
// Loads the CSR matrix, rhs (and an output placeholder) for the given grid
// size from the text files written by create_matrices.py, prints the QR
// memory requirements, solves the system on the GPU and writes x to file.
int main(int argc, char* argv[]) {
    if (argc != 4) {
        std::cout << "Correct usage: ./a.out nx ny nz\n";
        return 1;
    }
    const int nx = std::atoi(argv[1]);
    const int ny = std::atoi(argv[2]);
    const int nz = std::atoi(argv[3]);
    // filename suffix encodes the grid dimensions, e.g. "_32x32x16.txt"
    std::stringstream suffix;
    suffix << "_" << nx << "x" << ny << "x" << nz << ".txt";
    const std::string tag = suffix.str();
    // filenames for reading the csr matrix / rhs and writing the solution
    const std::string fn_Val = "bandedVal" + tag;
    const std::string fn_Col = "bandedCol" + tag;
    const std::string fn_Row = "bandedRow" + tag;
    const std::string fn_b = "bandedb" + tag;
    const std::string fn_x = "bandedx" + tag;
    // load A (values, column indices, row pointers) and b;
    // x starts out filled with a -99 sentinel so failures are visible
    std::vector<double> csrVal = read_vector<double>(fn_Val);
    std::vector<int> csrColInd = read_vector<int>(fn_Col);
    std::vector<int> csrRowPtr = read_vector<int>(fn_Row);
    std::vector<double> b = read_vector<double>(fn_b);
    std::vector<double> x(b.size(), -99);
    // system parameters derived from the loaded data
    const int nnz = static_cast<int>(csrVal.size());
    const int m = static_cast<int>(csrRowPtr.size()) - 1;
    const double tol = 1e-10;
    std::cout << "Loaded matrix from: \n"
              << fn_Val << std::endl
              << fn_Col << std::endl
              << fn_Row << std::endl
              << fn_b << std::endl << std::endl;
    std::cout << "system info:\n";
    std::cout << "nnz: " << nnz << std::endl;
    std::cout << "m: " << m << std::endl;
    std::cout << "sizes of Val Col Row: " << csrVal.size() << " " << csrColInd.size() << " "
              << csrRowPtr.size() << std::endl;
    // report the memory requirements of the qr-decomposition
    get_memInfo(nnz, m, tol, csrVal.data(), csrColInd.data(),
                csrRowPtr.data(), b.data(), x.data());
    // solve the system on the GPU
    std::cout << "Calling solve():\n";
    solve_hostpointer(nnz, m, tol, csrVal.data(), csrColInd.data(),
                      csrRowPtr.data(), b.data(), x.data());
    std::cout << "Writing result to " << fn_x << std::endl;
    write_vector(fn_x, x);
    return 0;
}
The following small Python script creates and saves the matrices:
# create_matrices.py
# Build three banded test matrices (7-point 3D-Poisson-like stencils), solve
# them with SciPy as a CPU reference, and dump each matrix (CSR arrays), rhs
# and reference solution to text files read by test.cpp.
import numpy as np
import scipy.sparse as sps
import scipy.sparse.linalg as spl

### Create 3 banded matrices, rhs and solutions and store them in txt files ###
# case 1: 32x32x16 grid -> off-diagonals at distances 1, 32 and 32*32
# BUG FIX: m1 was hard-coded to 16385, but 32*32*16 = 16384 (off by one,
# contradicting both the filenames and the intended grid size)
m1 = 32 * 32 * 16
data1 = [np.ones(m1) for _ in range(7)]
offsets = [-32*32, -32, -1, 0, 1, 32, 32*32]
A1 = sps.dia_matrix((data1, offsets), shape=(m1, m1)).tocsr()
# case 2: 16x32x32 grid (same size and nnz, smaller bandwidth)
# -> off-diagonals at distances 1, 16 and 16*32
m2 = m1
data2 = data1
offsets = [-16*32, -16, -1, 0, 1, 16, 16*32]
A2 = sps.dia_matrix((data2, offsets), shape=(m2, m2)).tocsr()
# case 3: small 8x8x8 sanity-check grid -> off-diagonals at 1, 8 and 8*8
m3 = 8 * 8 * 8
data3 = [np.ones(m3) for _ in range(7)]
offsets = [-8*8, -8, -1, 0, 1, 8, 8*8]
A3 = sps.dia_matrix((data3, offsets), shape=(m3, m3)).tocsr()
# right-hand sides and CPU reference solutions
b1 = np.ones(m1)
b2 = b1
b3 = np.ones(m3)
x1 = spl.spsolve(A1, b1)
x2 = spl.spsolve(A2, b2)
x3 = spl.spsolve(A3, b3)
# dump everything: CSR values/indices/indptr, rhs, scipy solution
np.savetxt("bandedVal_32x32x16"+".txt", A1.data, fmt='%1.16f')
np.savetxt("bandedCol_32x32x16"+".txt", A1.indices, fmt='%10i')
np.savetxt("bandedRow_32x32x16"+".txt", A1.indptr, fmt='%10i')
np.savetxt("bandedb_32x32x16"+".txt", b1, fmt='%1.16f')
np.savetxt("bandedx_32x32x16_scipy"+".txt", x1, fmt='%1.16f')
np.savetxt("bandedVal_16x32x32"+".txt", A2.data, fmt='%1.16f')
np.savetxt("bandedCol_16x32x32"+".txt", A2.indices, fmt='%10i')
np.savetxt("bandedRow_16x32x32"+".txt", A2.indptr, fmt='%10i')
np.savetxt("bandedb_16x32x32"+".txt", b2, fmt='%1.16f')
np.savetxt("bandedx_16x32x32_scipy"+".txt", x2, fmt='%1.16f')
np.savetxt("bandedVal_8x8x8"+".txt", A3.data, fmt='%1.16f')
np.savetxt("bandedCol_8x8x8"+".txt", A3.indices, fmt='%10i')
np.savetxt("bandedRow_8x8x8"+".txt", A3.indptr, fmt='%10i')
np.savetxt("bandedb_8x8x8"+".txt", b3, fmt='%1.16f')
np.savetxt("bandedx_8x8x8_scipy"+".txt", x3, fmt='%1.16f')
To run the code:
python create_matrices.py #might take a few secs
nvcc -std=c++11 test.cpp -lcusolver -lcusparse -o test
./test 32 32 16 #for the 32*32*16 case
./test 16 32 32 #for the 16*32*32 case
The memory requirements according to above code are:
- 32*32*16: 615Mb internal buffer and 44Mb working space
- 16*32*32: 322Mb internal buffer and 44Mb working space (so the fill in is reduced)
which is well below the available device memory (5375Mb on a Tesla C2075).
Another peculiarity is that running the code using cuda-gdb does not fail in either case. This might have something to do with register use being replaced by local memory?
I do not understand what is going wrong here, do you have any suggestions/explanations?
Thanks for your help!
I’m planning to try other solvers (LU on CPU combined with cusolverRfSolve on GPU) as well; however I am still very interested in finding out what is going on here.