Hi,
I am trying to implement the ICCG from the cusparse library as outlined in https://developer.nvidia.com/content/incomplete-lu-and-cholesky-preconditioned-iterative-methods-using-cusparse-and-cublas, however I can’t seem to get the correct results, except for a 4x4 matrix. The Cholesky factor appears to be correct, so the problem seems to be in either the solve or analysis phase.
If anyone with experience with this library could take a look at the code, it would be much appreciated.
#include <iostream>
#include <iomanip>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <cusp/csr_matrix.h>
#include <cusp/array1d.h>
#include <cusp/gallery/poisson.h>
// where to perform the computation
typedef cusp::host_memory HostMemorySpace;
typedef cusp::device_memory MemorySpace;
const char* cublasGetErrorString(cublasStatus_t status);
const char* cusparseGetErrorString(cusparseStatus_t status);
int main(int argc, char **argv)
{
const double TOL = 1e-3;
const size_t MAX_ITER = 1000;
const size_t numGrid = 2; // N = numGrid*numGrid
cusp::csr_matrix<int, double, HostMemorySpace> h_A;
cusp::gallery::poisson5pt(h_A, numGrid, numGrid);
assert(h_A.num_rows == h_A.num_cols);// sanity check
const size_t N = h_A.num_rows;//Number of rows and columns (square matrix assumed)
const size_t NZ = h_A.num_entries;//Total number of non-zero entries in complete matrix
//Number of non-zero entries in lower or upper + diagonals (symmetric matrix assumed)
const size_t LOWER = 0.5*(NZ + N);
double *h_val = (double *)malloc(LOWER*sizeof(double));
int *h_col = (int *)malloc(LOWER*sizeof(int));
int *h_row = (int *)malloc((N+1)*sizeof(int));
double *h_b = (double *)malloc(N*sizeof(double));
double *h_x = (double *)malloc(N*sizeof(double));
//populate lower triangular column indices and row offsets for zero fill-in IC
h_row[N] = LOWER;
int k = 0;
for (int i = 0; i < N; i++) {
h_row[i] = k;
int numRowElements = h_A.row_offsets[i+1] - h_A.row_offsets[i];
int m = 0;
for (int j = 0; j < numRowElements; j++) {
if (!(h_A.column_indices[h_A.row_offsets[i] + j] > i)) {
h_col[h_row[i] + m] = h_A.column_indices[h_A.row_offsets[i] + j];
h_val[h_row[i] + m] = h_A.values[h_A.row_offsets[i] + j];
k++; m++;
}
}
}
// RHS vector
h_b[0] = 1.0; h_b[1] = 1.0; h_b[2] = 1.0; h_b[3] = 1.0;
// solution vector
h_x[0] = 0.0; h_x[1] = 0.0; h_x[2] = 0.0; h_x[3] = 0.0;
double *d_val;
int *d_col;
int *d_row;
double *d_b;
double *d_x;
double *d_valCholesky;
cudaMalloc((void **)&d_val, LOWER*sizeof(double));
cudaMalloc((void **)&d_col, LOWER*sizeof(int));
cudaMalloc((void **)&d_row, (N+1)*sizeof(int));
cudaMalloc((void **)&d_b, N*sizeof(double));
cudaMalloc((void **)&d_x, N*sizeof(double));
cudaMalloc(&d_valCholesky, LOWER*sizeof(double));
cudaMemcpy(d_val, h_val, LOWER*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_col, h_col, LOWER*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_row, h_row, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_valCholesky, d_val, LOWER*sizeof(double), cudaMemcpyDeviceToDevice);
//for printing to check values
thrust::device_ptr<double> d_val_devptr(d_val);
thrust::device_ptr<int> d_col_devptr(d_col);
thrust::device_ptr<int> d_row_devptr(d_row);
thrust::device_ptr<double> d_b_devptr(d_b);
thrust::device_ptr<double> d_x_devptr(d_x);
thrust::device_ptr<double> d_valCholesky_devptr(d_valCholesky);
// cudaError_t cudaError;
cublasStatus_t cublasStatus;
cusparseStatus_t cusparseStatus;
cublasHandle_t cublasHandle0 = 0;
cublasStatus = cublasCreate(&cublasHandle0);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
std::cout << "cublasCreate(&cublasHandle0) failed, status returned: " << std::endl;
std::cout << cublasGetErrorString(cublasStatus) << std::endl;
return 1;
}
// Create CUSPARSE context
cusparseHandle_t cusparseHandle0 = 0;
cusparseStatus = cusparseCreate(&cusparseHandle0);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreate(&cusparseHandle0) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
cusparseMatDescr_t descrA = 0;
cusparseStatus = cusparseCreateMatDescr(&descrA);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreateMatDescr(&descrA) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC);
cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
// create the analysis info object for the A matrix
cusparseSolveAnalysisInfo_t infoA = 0;
cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoA);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreateSolveAnalysisInfo(&infoA) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
// Perform the analysis for the Non-Transpose case
cusparseStatus = cusparseDcsrsv_analysis(cusparseHandle0, CUSPARSE_OPERATION_NON_TRANSPOSE,
N, LOWER, descrA, d_val, d_row, d_col, infoA);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrsv_analysis(infoA) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
// Generate the incomplete Cholesky factor
cusparseStatus = cusparseDcsric0(cusparseHandle0, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrA,
d_valCholesky, d_row, d_col, infoA);
cusparseDestroySolveAnalysisInfo(infoA);
// describe the lower triangular matrix of the Cholesky factor
cusparseMatDescr_t descrL = 0;
cusparseStatus = cusparseCreateMatDescr(&descrL);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreateMatDescr(&descrL) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_NON_UNIT);
cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO);
// create the analysis info object for the lower Cholesky factor
cusparseSolveAnalysisInfo_t infoL = 0;
cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoL);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreateSolveAnalysisInfo(&infoL) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
cusparseStatus = cusparseDcsrsv_analysis(cusparseHandle0, CUSPARSE_OPERATION_NON_TRANSPOSE,
N, LOWER, descrL, d_valCholesky, d_row, d_col, infoL);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrsv_analysis(infoL) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
// create the analysis info object for the upper Cholesky factor
cusparseSolveAnalysisInfo_t infoU = 0;
cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoU);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseCreateSolveAnalysisInfo(&infoU) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
cusparseStatus = cusparseDcsrsv_analysis(cusparseHandle0, CUSPARSE_OPERATION_TRANSPOSE,
N, LOWER, descrL, d_valCholesky, d_row, d_col, infoU);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrsv_analysis(infoU) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
double *d_r;
double *d_z;
double *d_t;
double *d_p;
double *d_q;
double rho;
double rhop;
double ptAp;
double alpha, negalpha;
double beta;
double normr0, normr;
size_t iter = 0;
double zero = 0.0;
double one = 1.0, negone = -1.0;
cudaMalloc((void **)&d_r, N*sizeof(double));
cudaMalloc((void **)&d_z, N*sizeof(double));
cudaMalloc((void **)&d_t, N*sizeof(double));
cudaMalloc((void **)&d_p, N*sizeof(double));
cudaMalloc((void **)&d_q, N*sizeof(double));
cudaMemcpy(d_r, d_x, N*sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_z, d_x, N*sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_t, d_x, N*sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_p, d_x, N*sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_q, d_x, N*sizeof(double), cudaMemcpyDeviceToDevice);
//for printing to check values
thrust::device_ptr<double> d_r_devptr(d_r);
thrust::device_ptr<double> d_z_devptr(d_z);
thrust::device_ptr<double> d_t_devptr(d_t);
thrust::device_ptr<double> d_p_devptr(d_p);
thrust::device_ptr<double> d_q_devptr(d_q);
//1: r <- b - A*x0 initial residual
//1a: r <- A*x0
cusparseStatus = cusparseDcsrmv(cusparseHandle0, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, LOWER,
&one, descrA, d_val, d_row, d_col, d_x, &zero, d_r);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrmv(r <- Ax0) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
//1b: r <- -r
cublasDscal(cublasHandle0, N, &negone, d_r, 1);
//1c: r <- b + (-r)
cublasDaxpy(cublasHandle0, N, &one, d_b, 1, d_r, 1);
// rho <- <r,z>
cublasDdot(cublasHandle0, N, d_r, 1, d_z, 1, &rho);
// normr0 <- sqrt(<r,r>)_0
cublasDnrm2(cublasHandle0, N, d_r, 1, &normr0);
normr = normr0;
// iterate until convergence or maximum iterations are reached
while (normr/normr0 > TOL && iter < MAX_ITER) {
//2: Solve M*z = r via L*[transpose(L)*z] = r where transpose(L)*z = t
//2a: L*t <- r
cusparseStatus = cusparseDcsrsv_solve(cusparseHandle0, CUSPARSE_OPERATION_NON_TRANSPOSE, N,
&one, descrL, d_valCholesky, d_row, d_col, infoL, d_r, d_t);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrsv_solve(LOWER) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
//2b: transpose(L)*z <- t
cusparseStatus = cusparseDcsrsv_solve(cusparseHandle0, CUSPARSE_OPERATION_TRANSPOSE, N, &one,
descrL, d_valCholesky, d_row, d_col, infoU, d_t, d_z);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrsv_solve(UPPER) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
//3: rhop <- <r,z>_i
rhop = rho;
//4: rho <- <r,z>_{i+1}
cublasDdot(cublasHandle0, N, d_r, 1, d_z, 1, &rho);
if (iter == 0) {
//5: p <- z
cublasDcopy(cublasHandle0, N, d_z, 1, d_p, 1);
} else {
//5: beta <- <r,z>_i / <r,z>_{i+1}
beta = rho/rhop;
//6: p = z + beta*p
//6a: z <- beta*p + z
cublasDaxpy(cublasHandle0, N, &beta, d_p, 1, d_z, 1);
//6b: p <- z
cublasDcopy(cublasHandle0, N, d_z, 1, d_p, 1);
}
//7: q <- A*p
cusparseStatus = cusparseDcsrmv(cusparseHandle0,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, LOWER,
&one, descrA, d_val, d_row, d_col, d_p, &zero, d_q);
if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
std::cout << "cusparseDcsrmv(q <- Ap) failed, status returned: " << std::endl;
std::cout << cusparseGetErrorString(cusparseStatus) << std::endl;
return 1;
}
//8: alpha = <r,z>_i / <p,A*p> = rho/<p,q>
//8a: ptAp <- <p,q>
cublasDdot(cublasHandle0, N, d_p, 1, d_q, 1, &ptAp);
//8b: alpha <- rho/ptAp
alpha = rho / ptAp;
//9: x <- x + alpha*p
cublasDaxpy(cublasHandle0, N, &alpha, d_p, 1, d_x, 1);
//10: r <- r - alpha*q
//10a:
negalpha = -alpha;
//10b: r <- r + negalpha
cublasDaxpy(cublasHandle0, N, &negalpha, d_q, 1, d_r, 1);
iter++;
// normr <- sqrt(<r,r>)_iter
cublasDnrm2(cublasHandle0, N, d_r, 1, &normr);
}
cublasDestroy(cublasHandle0);
cusparseDestroy(cusparseHandle0);
cusparseDestroyMatDescr(descrA);
cusparseDestroyMatDescr(descrL);
cusparseDestroySolveAnalysisInfo(infoL);
cusparseDestroySolveAnalysisInfo(infoU);
std::cout << std::endl << "N = " << N << std::endl;
std::cout << "Cholesky factor values (lower - csr storage)" << std::endl;
for (int i = 0; i < LOWER; i++) {
std::cout << std::fixed << std::setprecision(3) << d_valCholesky_devptr[i] << ", ";
}
std::cout << std::endl;
std::cout << "Cholesky factor column indices" << std::endl;
for (int i = 0; i < LOWER; i++) {
std::cout << std::fixed << std::setprecision(3) << d_col_devptr[i] << ", ";
}
std::cout << std::endl;
std::cout << "Cholesky factor row offsets" << std::endl;
for (int i = 0; i < N+1; i++) {
std::cout << std::fixed << std::setprecision(3) << d_row_devptr[i] << ", ";
}
std::cout << std::endl;
std::cout << std::endl << "d_x[i] solution vector" << std::endl;
for (int i = 0; i < N; i++) {
std::cout << std::fixed << std::setprecision(3) << d_x_devptr[i] << ", ";
}
std::cout << std::endl;
return 0;
}
const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
default: return "UNKNOWN_ERROR";
}
}
const char* cusparseGetErrorString(cusparseStatus_t status)
{
switch(status)
{
case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
default: return "UNKNOWN_ERROR";
}
}
Here are the results I am getting compared to those from the CG cusp example
cusp cg
=======
N = 4
x[i] solution vector
0.500, 0.500, 0.500, 0.500,
N = 9
x[i] solution vector
0.688, 0.875, 0.688, 0.875, 1.125, 0.875, 0.688, 0.875, 0.688,
N = 16
x[i] solution vector
0.833, 1.167, 1.167, 0.833, 1.167, 1.667, 1.667, 1.167, 1.167, 1.667, 1.667, 1.167, 0.833, 1.167, 1.167, 0.833,
cusparse iccg
=============
N = 4
d_x[i] solution vector
0.500, 0.500, 0.500, 0.500,
N = 9
d_x[i] solution vector
0.607, 0.794, 0.821, 0.634, 0.749, 1.490, 0.179, 0.080, 0.392,
N = 16
d_x[i] solution vector
0.680, 1.051, 0.816, 0.475, 0.669, 1.708, 0.738, 0.084, 0.289, 0.375, 0.345, 0.124, 0.112, 0.157, 0.142, 0.066,
Thanks,
David