Problem in basic dense to CSR format conversion using cuSPARSE

Following is my code for a sparse matrix inverse using the cuSPARSE library. Since there is no direct method to compute the inverse, I am using cusparseScsrsm_solve (A*Y = X, where A is the matrix whose inverse I intend to find, X is the identity matrix, and Y is the solution, i.e. the inverse). I am not getting the correct answer, so I added some basic debugging statements to print out the CSR form of the matrix and to check that cusparseSdense2csr works correctly.

The code

#include <cusparse_v2.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define N 18

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


int main(int argc, char *argv[]) {

    cusparseHandle_t hndl;
    cusparseStatus_t stat;
    cusparseMatDescr_t descrA;
    cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;

    int *nnzPerRow;
    int *csrRowPtrA, *csrColIndA;
    float *csrValA, *d_sparse;
    int nnzA;
    int * nnzTotalDevHostPtr = &nnzA;


    cudaMalloc((void**)&nnzPerRow, N*sizeof(int));
    cudaMalloc((void**)&csrRowPtrA, (N+1)*sizeof(int)); // CSR row pointer holds N+1 entries
    cudaMalloc((void**)&d_sparse, N*N*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");

    float h_sparse[N][N]; FILE * f;
    f = fopen("H_matrix", "r");
    int i=0; int x; // fgetc returns int; using char breaks the EOF comparison
    while((x=fgetc(f)) != EOF) {
        if ((x == '1') || (x == '0')) {
           h_sparse[i/N][i%N] = (float)(x-'0');
           i++;
           }
    }
    fclose(f);

//    for (i=0; i<N*N; i++) {
//    if (i%N == 0) printf("\n");
//    printf("%f\t", h_sparse[i/N][i%N]);
//}

    cudaMemcpy(d_sparse, h_sparse, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    CUSPARSE_CHECK(cusparseCreate(&hndl));
    stat = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
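    // NOTE: with dir = CUSPARSE_DIRECTION_COLUMN, cusparseSnnz fills
    // nnzPerRow with per-*column* counts, despite the variable's name.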
    stat = cusparseSnnz(hndl, dir, N, N,
                descrA, d_sparse, N,
                nnzPerRow, nnzTotalDevHostPtr);
    CUSPARSE_CHECK(stat);
    if (nnzTotalDevHostPtr != NULL) {
        nnzA = * nnzTotalDevHostPtr;
    }

    int h_nnzPerRow[N];
    cudaMemcpy(h_nnzPerRow, nnzPerRow, sizeof(int)*N, cudaMemcpyDeviceToHost);
    for (int i=0; i<N; i++) {
        printf("%d\n", h_nnzPerRow[i]);
    }

    cudaMalloc((void**)&csrValA, sizeof(float)*nnzA);
    cudaMalloc((void**)&csrColIndA, sizeof(int)*nnzA);
    cudaCheckErrors("cudaMalloc fail");
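    // Convert the dense matrix (assumed column-major with leading
    // dimension N) to CSR, using the counts computed by cusparseSnnz.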
    stat = cusparseSdense2csr(hndl, N, N,
                       descrA, d_sparse,
                       N, nnzPerRow,
                       csrValA, csrRowPtrA, csrColIndA);
    CUSPARSE_CHECK(stat);

    float * h_csrValA; int * h_csrRowPtrA, * h_csrColIndA;
    h_csrValA = (float *)malloc(nnzA*sizeof(float));
    h_csrColIndA = (int *)malloc(nnzA*sizeof(int));
    h_csrRowPtrA = (int *)malloc((N+1)*sizeof(int));
    cudaMemcpy(h_csrValA, csrValA, nnzA*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrColIndA, csrColIndA, nnzA*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrRowPtrA, csrRowPtrA, (N+1)*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i=0;i<nnzA;i++) {
        printf("\n%f\t%d", h_csrValA[i], h_csrColIndA[i]); }
    for (int i=0; i<N+1; i++) {
        printf("\n%d", h_csrRowPtrA[i]); }


    float h_X[N][N], *h_Y, *X, *Y;
    h_Y = (float *)malloc(N*N*sizeof(float));
    for (int i=0; i<N; i++) {
        for (int j=0; j<N; j++) {
            h_X[i][j] = (i==j) ? 1 : 0;
        }
    }

//   for (int i=0; i<N; i++) {
//       printf("\n");
//       for (int j=0; j<N; j++) {
//            printf("%f\t", h_X[i][j]);
//        }
//    }

    cudaMalloc((void**)&X, N*N*sizeof(float));
    cudaMalloc((void**)&Y, N*N*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemcpy(X, h_X, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    cusparseOperation_t operationA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseSolveAnalysisInfo_t info;
    stat = cusparseCreateSolveAnalysisInfo(&info);
    CUSPARSE_CHECK(stat);
    stat = cusparseScsrsm_analysis(hndl, operationA, N, nnzA,
                            descrA, csrValA, csrRowPtrA, csrColIndA,
                            info);
    CUSPARSE_CHECK(stat);

    float p = 1;
    const float *alpha = &p;
    stat = cusparseScsrsm_solve(hndl, operationA, N, N,
                                alpha,
                                descrA,
                                csrValA, csrRowPtrA, csrColIndA,
                                info, X, N, Y, N);
    CUSPARSE_CHECK(stat);

    cudaMemcpy(h_Y, Y, N*N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

//    for(int i=0; i<N*N; i++) {
//        if (i%N==0) printf("\n");
//        printf("%f\t", h_Y[i]);
//}
}

The result obtained was:

nnzPerRow
6
6
6
6
6
6
6
6
6
6
9
7
9
7
8
6
6
7


csrValA         csrColIndA
1.000000        0
1.000000        3
1.000000        6
1.000000        9
1.000000        11
1.000000        17
1.000000        1
1.000000        4
1.000000        7
1.000000        10
1.000000        12
1.000000        2
1.000000        2
1.000000        5
1.000000        8
1.000000        10
1.000000        11
1.000000        13
1.000000        14
1.000000        15
1.000000        16
1.000000        17
1.000000        12
1.000000        7
1.000000        1
1.000000        3
1.000000        8
1.000000        10
1.000000        13
1.000000        14
1.000000        15
1.000000        4
1.000000        6
1.000000        10
1.000000        11
1.000000        12
1.000000        16
1.000000        17
1.000000        8
1.000000        9
1.000000        15
1.000000        0
1.000000        1
1.000000        5
1.000000        6
1.000000        10
1.000000        12
1.000000        13
1.000000        14
1.000000        3
1.000000        7
1.000000        11
1.000000        13
1.000000        16
1.000000        17
1.000000        3
1.000000        6
1.000000        9
1.000000        10
1.000000        12
1.000000        14
1.000000        5
1.000000        8
1.000000        14
1.000000        15
1.000000        12
1.000000        14
1.000000        1
1.000000        5
1.000000        2
1.000000        4
1.000000        7
1.000000        10
1.000000        12
1.000000        16
1.000000        10
1.000000        0
1.000000        4
1.000000        8
1.000000        9
1.000000        11
1.000000        12
1.000000        13
1.000000        14
1.000000        15
1.000000        17
1.000000        3
1.000000        7
1.000000        1
1.000000        3
1.000000        7
1.000000        2
1.000000        2
1.000000        5
1.000000        6
1.000000        10
1.000000        13
1.000000        14
1.000000        16
1.000000        5
1.000000        0
1.000000        5
1.000000        7
1.000000        9
1.000000        11
1.000000        12
1.000000        13
1.000000        17
1.000000        6
1.000000        10
1.000000        11
1.000000        17
1.000000        2
1.000000        3
1.000000        8
1.000000        12
1.000000        14
1.000000        15
1.000000        16

csrRowPtrA
0
6
12
18
24
30
36
42
48
54
60
69
76
85
92
100
106

The H_matrix is:

1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1
0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0
0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0
0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
0,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,0
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0
0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1
0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0
0,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,0,1
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0

The result deviates at the 5th entry of the column-index values: the 5th index should be 12 but is 11, and the 6th should be 15 but is 17. The csrColIndA values are what you would get by walking the matrix in row-major order, even though the direction is set to column major. However, csrRowPtrA and nnzPerRow are perfectly correct, i.e. they follow the column-major direction. If I set the direction to CUSPARSE_DIRECTION_ROW instead, csrRowPtrA and nnzPerRow become incorrect, while csrColIndA stays the same: it is identical for both CUSPARSE_DIRECTION_ROW and CUSPARSE_DIRECTION_COLUMN.
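
For reference, here is a minimal host-side sketch (the helper is hypothetical, not part of the program above) of what happens if the row-major-filled buffer is read as column-major: the library effectively sees H^T, and the CSR column indices of H^T reproduce the output above (row 0 → 0, 3, 6, 9, 11, 17):

#include <stdio.h>

#define N 18

/* Hypothetical check: print the CSR column indices of H^T, where
 * h[][] is the array filled row-major from the file, as above.
 * Row r of H^T is column r of H, so scan h[c][r] over c. */
void print_csr_of_transpose(const float h[N][N]) {
    int nnz = 0;
    for (int r = 0; r < N; r++) {
        printf("H^T row %2d starts at %3d, cols:", r, nnz);
        for (int c = 0; c < N; c++) {
            if (h[c][r] != 0.0f) {   /* element (r,c) of H^T */
                printf(" %d", c);
                nnz++;
            }
        }
        printf("\n");
    }
}

Calling this right after the file-reading loop makes the mismatch easy to see.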

Result with dir = CUSPARSE_DIRECTION_ROW:

nnzPerRow
6
5
10
5
7
8
5
7
7
7
5
6
10
3
7
8
6
7

csrValA         csrColIndA
1.000000        0
1.000000        3
1.000000        6
1.000000        9
1.000000        11
1.000000        17
1.000000        1
1.000000        4
1.000000        7
1.000000        10
1.000000        12
1.000000        2
1.000000        5
1.000000        8
1.000000        10
1.000000        11
1.000000        13
1.000000        14
1.000000        15
1.000000        16
1.000000        17
1.000000        0
1.000000        5
1.000000        7
1.000000        9
1.000000        12
1.000000        1
1.000000        3
1.000000        8
1.000000        10
1.000000        13
1.000000        14
1.000000        15
1.000000        2
1.000000        4
1.000000        6
1.000000        10
1.000000        11
1.000000        12
1.000000        16
1.000000        17
1.000000        0
1.000000        4
1.000000        8
1.000000        9
1.000000        15
1.000000        1
1.000000        5
1.000000        6
1.000000        10
1.000000        12
1.000000        13
1.000000        14
1.000000        2
1.000000        3
1.000000        7
1.000000        11
1.000000        13
1.000000        16
1.000000        17
1.000000        0
1.000000        3
1.000000        6
1.000000        9
1.000000        10
1.000000        12
1.000000        14
1.000000        1
1.000000        5
1.000000        8
1.000000        14
1.000000        15
1.000000        2
1.000000        4
1.000000        7
1.000000        10
1.000000        12
1.000000        16
1.000000        0
1.000000        4
1.000000        8
1.000000        9
1.000000        11
1.000000        12
1.000000        13
1.000000        14
1.000000        15
1.000000        17
1.000000        1
1.000000        3
1.000000        7
1.000000        2
1.000000        5
1.000000        6
1.000000        10
1.000000        13
1.000000        14
1.000000        16
1.000000        0
1.000000        5
1.000000        7
1.000000        9
1.000000        11
1.000000        12
1.000000        13
1.000000        17
1.000000        1
1.000000        4
1.000000        6
1.000000        10
1.000000        11
1.000000        17
1.000000        2
1.000000        3
1.000000        8
1.000000        12
1.000000        14
1.000000        15
1.000000        16

csrRowPtrA
0
6
11
21
26
33
41
46
53
60
67
72
78
88
91
98
106

Can someone please help me out with the basic dense-to-CSR conversion? Also, the values I print out at the end (the final solution, i.e. the inverse) are junk. Could you help me find the mistake there too?

icono
OK, I think I've figured out your issue: your data is stored in row-major order, but cuSPARSE expects it in column-major. I didn't see this stated explicitly, but it is implied where the documentation gives the dense matrix shape as [lda, n], i.e. values are stored down each column of length lda.

This is why you get only partially correct results with both CUSPARSE_DIRECTION_ROW and CUSPARSE_DIRECTION_COLUMN. With the data in column-major order, you should be using CUSPARSE_DIRECTION_ROW.
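
If you would rather keep your array in row-major order, one option is to transpose it on the host before copying it to the device. A minimal sketch (this helper is hypothetical, not part of the attached code):

/* Hypothetical helper: repack a row-major m x n matrix into the
 * column-major layout that cuSPARSE expects (lda = m). */
void rowmaj_to_colmaj(const float *in, float *out, int m, int n) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            out[j * m + i] = in[i * n + j];   /* (i,j) -> column-major slot */
}

Alternatively, cublasSgeam with one operand transposed can do the same repacking on the device.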

I have attached my code with both the row-major and column-major data sets (I transposed your matrix). You can switch which thrust::copy line is commented out to try the two data sets and see that the column-major one returns the correct result.

Hope that helps.

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cuda_runtime.h>
#include <cusparse.h>

using namespace std;

int main(){

const int m = 18;
const int n = 18;
const int sz = m*n;

// Place holder for nnz (host)
int nnz;
int mynnz = 0;

// The matrix in question, in row major order
float mat_rowmaj[sz] = { 
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,
0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,
1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1,
0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,
0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,
1,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0,
0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,
0,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,0,
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,
0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1,
0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0,
0,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,0,1,
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0
};

// The matrix in question, in col major order
float mat_colmaj[sz] = {
1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,
0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,
0,0,1,0,0,1,0,0,1,0,1,1,0,1,1,1,1,1,
1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,
0,1,0,1,0,0,0,0,1,0,1,0,0,1,1,1,0,0,
0,0,1,0,1,0,1,0,0,0,1,1,1,0,0,0,1,1,
1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,
0,1,0,0,0,1,1,0,0,0,1,0,1,1,1,0,0,0,
0,0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,1,1,
1,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,0,0,
0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,
0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,
1,0,0,0,1,0,0,0,1,1,0,1,1,1,1,1,0,1,
0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,1,1,0,0,0,1,0,0,1,1,0,1,0,
1,0,0,0,0,1,0,1,0,1,0,1,1,1,0,0,0,1,
0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,1,
0,0,1,1,0,0,0,0,1,0,0,0,1,0,1,1,1,0
};

// Allocate some device memory
thrust::device_vector<float> mat_h(sz);
thrust::device_vector<int> nnz_perrow_h(m);

// Copy row major matrix to the device
// thrust::copy( mat_rowmaj, mat_rowmaj+sz, mat_h.begin() );

// Copy col major matrix to the device
thrust::copy( mat_colmaj, mat_colmaj+sz, mat_h.begin() );

// Set up our cuSPARSE stuff
cusparseStatus_t cusstat;
cusparseHandle_t cushand;
cusparseCreate(&cushand);

// Matrix description
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

// Get the number of non-zeros across columns (i.e. per row)
cusstat = cusparseSnnz( cushand, CUSPARSE_DIRECTION_ROW, m, n,
			descr, thrust::raw_pointer_cast(&mat_h[0]), 
			n, thrust::raw_pointer_cast(&nnz_perrow_h[0]), &nnz );

cout << "NNZ per row" << endl; 
cout << "Total nnz: " << nnz << endl;
for(int i=0; i<m; i++)
	cout << "[" << i << "] " << nnz_perrow_h[i] << endl;

// Check it ourselves
for(int i=0; i<m; i++){
	for(int j=0; j<n; j++){
		if(mat_rowmaj[i*n+j] != 0.0f)	// row i, column j of the row-major copy
			mynnz++;
	}
}

cout << "My NNZ " << mynnz << endl;

// Allocate more device memory for the CSR conversion
thrust::device_vector<int> rowos_h(m+1);
thrust::device_vector<int> colidx_h(nnz);
thrust::device_vector<float> val_h(nnz);

// Now the conversion
cusstat = cusparseSdense2csr( cushand, m, n, descr,
			thrust::raw_pointer_cast(&mat_h[0]), m, 
			thrust::raw_pointer_cast(&nnz_perrow_h[0]),
			thrust::raw_pointer_cast(&val_h[0]),
			thrust::raw_pointer_cast(&rowos_h[0]),
			thrust::raw_pointer_cast(&colidx_h[0]) );

cout << endl << "Row Info: " << endl;
for(int i=0; i<m+1; i++)
	cout << "[" << i << "] " << rowos_h[i] << endl;

cout << endl << "[Col] Val " << endl;
for(int i=0; i<nnz; i++)
	cout << "[" << colidx_h[i] << "] " << val_h[i] << endl;

// Clean up
cusparseDestroyMatDescr(descr);
cusparseDestroy(cushand);

return 0;

}

trf86
Thanks a lot for your time, and sorry for such a late reply. I would never have figured out the mistake without your help.
Your suggestion that the matrix should be stored in column-major format is in fact mentioned in the docs. That does indeed seem to be the problem with my code (although I no longer have access to a CUDA machine, so I couldn't re-run the code with the data in the correct format).
The first line of the Dense Format section 3.3.1 http://docs.nvidia.com/cuda/cusparse/#dense-format2 says that the matrix is expected to be stored in column-major format in memory, like this:

A matrix in this form:

X11 X12 X13 ... X1N
X21 X22 X23 ... X2N
.
.
.
XM1 XM2 XM3 ... XMN

should be stored in memory as:

X11 X21 X31 ... XM1 ... X1N X2N X3N ... XMN

which is nothing but the column-major format.
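
In 0-based terms, that layout means element (i, j) of an m x n dense matrix with leading dimension lda (lda >= m) lives at offset j*lda + i. A one-line sketch (the helper name is mine):

/* Column-major addressing with 0-based indices. */
static inline float colmaj_at(const float *data, int lda, int i, int j) {
    return data[j * lda + i];
}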

Also, if anyone happens to know of any free CUDA virtual server, please let me know!

Regards

icono

No problem. Thanks for pointing that out in the documentation.