Following is the code for sparse matrix inverse using CUSPARSE library. Since there is no direct method to find out the inverse, I am using cusparseScsrsm_solve (A*Y=X; where A is the matrix whose inverse I intend to find; X is identity matrix and Y is the solution or the inverse). I am not getting the correct answer. so I thought putting some basic debugging statements(77-79; 96-99) to print out the csr format of the matrix (and to ensure cusparsedense2csr works fine)
The code
#include <cusparse_v2.h>
#include <stdio.h>
#include <stdbool.h>
#define N 18
// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
int main(int argc, char *argv[]) {
cusparseHandle_t hndl;
cusparseStatus_t stat;
cusparseMatDescr_t descrA;
cusparseDirection_t dir = CUSPARSE_DIRECTION_COLUMN;
int *nnzPerRow;
int *csrRowPtrA, *csrColIndA;
float *csrValA, *d_sparse;
int nnzA;
int * nnzTotalDevHostPtr = &nnzA;
cudaMalloc((void**)&nnzPerRow, N*sizeof(int));
cudaMalloc((void**)&csrRowPtrA, N*sizeof(int));
cudaMalloc((void**)&d_sparse, N*N*sizeof(float));
cudaCheckErrors("cudaMalloc fail");
float h_sparse[N][N]; FILE * f;
f = fopen("H_matrix", "r");
int i=0; char x;
while((x=fgetc(f)) != EOF) {
if ((x == '1') | (x == '0')) {
h_sparse[i/N][i%N] = (float)(x-'0');
i++;
}
}
fclose(f);
// for (i=0; i<N*N; i++) {
// if (i%N == 0) printf("\n");
// printf("%f\t", h_sparse[i/N][i%N]);
//}
cudaMemcpy(d_sparse, h_sparse, N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
CUSPARSE_CHECK(cusparseCreate(&hndl));
stat = cusparseCreateMatDescr(&descrA);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
cusparseSnnz(hndl, dir, N, N,
descrA, d_sparse, N,
nnzPerRow, nnzTotalDevHostPtr);
if (nnzTotalDevHostPtr != NULL) {
nnzA = * nnzTotalDevHostPtr;
}
int h_nnzPerRow[N];
cudaMemcpy(h_nnzPerRow, nnzPerRow, sizeof(int)*N, cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++) {
printf("%d\n", h_nnzPerRow[i]);
}
cudaMalloc((void**)&csrValA, sizeof(float)*nnzA);
cudaMalloc((void**)&csrColIndA, sizeof(int)*nnzA);
cudaCheckErrors("cudaMalloc fail");
cusparseSdense2csr(hndl, N, N,
descrA, d_sparse,
N, nnzPerRow,
csrValA, csrRowPtrA, csrColIndA);
float * h_csrValA; int * h_csrRowPtrA, * h_csrColIndA;
h_csrValA = (float *)malloc(nnzA*sizeof(float));
h_csrColIndA = (int *)malloc(nnzA*sizeof(int));
h_csrRowPtrA = (int *)malloc(N*sizeof(int));
cudaMemcpy(h_csrValA, csrValA, nnzA*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrColIndA, csrColIndA, nnzA*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrRowPtrA, csrRowPtrA, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0;i<nnzA;i++) {
printf("\n%f\t%d", h_csrValA[i], h_csrColIndA[i]); }
for (int i =0; i<N; i++) {
printf("\n%d", h_csrRowPtrA[i]); }
float h_X[N][N], *h_Y, *X, *Y;
h_Y = (float *)malloc(N*N*sizeof(float));
for (int i=0; i<N; i++) {
for (int j=0; j<N; j++) {
h_X[i][j] = (i==j) ? 1 : 0;
}
}
// for (int i=0; i<N; i++) {
// printf("\n");
// for (int j=0; j<N; j++) {
// printf("%f\t", h_X[i][j]);
// }
// }
cudaMalloc((void**)&X, N*N*sizeof(float));
cudaMalloc((void**)&Y, N*N*sizeof(float));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(X, h_X, N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
cusparseOperation_t operationA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseSolveAnalysisInfo_t info;
stat = cusparseCreateSolveAnalysisInfo(&info);
CUSPARSE_CHECK(stat);
stat = cusparseScsrsm_analysis(hndl, operationA, N, nnzA,
descrA, csrValA, csrRowPtrA, csrColIndA,
info);
CUSPARSE_CHECK(stat);
float p = 1;
const float *alpha = &p;
stat = cusparseScsrsm_solve(hndl, operationA, N, N,
alpha,
descrA,
csrValA, csrRowPtrA, csrColIndA,
info, X, N, Y, N);
CUSPARSE_CHECK(stat);
cudaMemcpy(h_Y, Y, N*N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
// for(int i=0; i<N*N; i++) {
// if (i%N==0) printf("\n");
// printf("%f\t", h_Y[i]);
//}
}
The result obtained was:
nnzPerRow
6
6
6
6
6
6
6
6
6
6
9
7
9
7
8
6
6
7
csrValA csrColIndA
1.000000 0
1.000000 3
1.000000 6
1.000000 9
1.000000 11
1.000000 17
1.000000 1
1.000000 4
1.000000 7
1.000000 10
1.000000 12
1.000000 2
1.000000 2
1.000000 5
1.000000 8
1.000000 10
1.000000 11
1.000000 13
1.000000 14
1.000000 15
1.000000 16
1.000000 17
1.000000 12
1.000000 7
1.000000 1
1.000000 3
1.000000 8
1.000000 10
1.000000 13
1.000000 14
1.000000 15
1.000000 4
1.000000 6
1.000000 10
1.000000 11
1.000000 12
1.000000 16
1.000000 17
1.000000 8
1.000000 9
1.000000 15
1.000000 0
1.000000 1
1.000000 5
1.000000 6
1.000000 10
1.000000 12
1.000000 13
1.000000 14
1.000000 3
1.000000 7
1.000000 11
1.000000 13
1.000000 16
1.000000 17
1.000000 3
1.000000 6
1.000000 9
1.000000 10
1.000000 12
1.000000 14
1.000000 5
1.000000 8
1.000000 14
1.000000 15
1.000000 12
1.000000 14
1.000000 1
1.000000 5
1.000000 2
1.000000 4
1.000000 7
1.000000 10
1.000000 12
1.000000 16
1.000000 10
1.000000 0
1.000000 4
1.000000 8
1.000000 9
1.000000 11
1.000000 12
1.000000 13
1.000000 14
1.000000 15
1.000000 17
1.000000 3
1.000000 7
1.000000 1
1.000000 3
1.000000 7
1.000000 2
1.000000 2
1.000000 5
1.000000 6
1.000000 10
1.000000 13
1.000000 14
1.000000 16
1.000000 5
1.000000 0
1.000000 5
1.000000 7
1.000000 9
1.000000 11
1.000000 12
1.000000 13
1.000000 17
1.000000 6
1.000000 10
1.000000 11
1.000000 17
1.000000 2
1.000000 3
1.000000 8
1.000000 12
1.000000 14
1.000000 15
1.000000 16
csrRowPtrA
0
6
12
18
24
30
36
42
48
54
60
69
76
85
92
100
106
The H_matrix is:
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1
0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0
0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0
0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
0,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,0
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0
0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1
0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0
0,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,0,1
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1
0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0
The result deviates at the 5th entry itself in the ColIndex values where the column index value should be 12 instead of 11 and 15 instead of 17. The current values of csrColIndA are the values if direction was in row major order but the direction is set to column major order. However, the csrRowPtrA and nnzPerRow are perfectly correct i.e. along the column major order. I tried setting the direction to CUSPARSE_DIRECTION_ROW but now the result turns out to be incorrect for csrRowPtrA and nnzPerRow. The result for csrColIndA remains the same for both CUSPARSE_DIRECTION_ROW and CUSPARSE_DIRECTION_COLUMN.
Result if dir = CUSPARSE_DIRECTION_ROW;
6
5
10
5
7
8
5
7
7
7
5
6
10
3
7
8
6
7
1.000000 0
1.000000 3
1.000000 6
1.000000 9
1.000000 11
1.000000 17
1.000000 1
1.000000 4
1.000000 7
1.000000 10
1.000000 12
1.000000 2
1.000000 5
1.000000 8
1.000000 10
1.000000 11
1.000000 13
1.000000 14
1.000000 15
1.000000 16
1.000000 17
1.000000 0
1.000000 5
1.000000 7
1.000000 9
1.000000 12
1.000000 1
1.000000 3
1.000000 8
1.000000 10
1.000000 13
1.000000 14
1.000000 15
1.000000 2
1.000000 4
1.000000 6
1.000000 10
1.000000 11
1.000000 12
1.000000 16
1.000000 17
1.000000 0
1.000000 4
1.000000 8
1.000000 9
1.000000 15
1.000000 1
1.000000 5
1.000000 6
1.000000 10
1.000000 12
1.000000 13
1.000000 14
1.000000 2
1.000000 3
1.000000 7
1.000000 11
1.000000 13
1.000000 16
1.000000 17
1.000000 0
1.000000 3
1.000000 6
1.000000 9
1.000000 10
1.000000 12
1.000000 14
1.000000 1
1.000000 5
1.000000 8
1.000000 14
1.000000 15
1.000000 2
1.000000 4
1.000000 7
1.000000 10
1.000000 12
1.000000 16
1.000000 0
1.000000 4
1.000000 8
1.000000 9
1.000000 11
1.000000 12
1.000000 13
1.000000 14
1.000000 15
1.000000 17
1.000000 1
1.000000 3
1.000000 7
1.000000 2
1.000000 5
1.000000 6
1.000000 10
1.000000 13
1.000000 14
1.000000 16
1.000000 0
1.000000 5
1.000000 7
1.000000 9
1.000000 11
1.000000 12
1.000000 13
1.000000 17
1.000000 1
1.000000 4
1.000000 6
1.000000 10
1.000000 11
1.000000 17
1.000000 2
1.000000 3
1.000000 8
1.000000 12
1.000000 14
1.000000 15
1.000000 16
0
6
11
21
26
33
41
46
53
60
67
72
78
88
91
98
106
Can someone please help me out with basic dense to csr conversion? Also, the values I try to print out in the end (of the final solution or the inverse) are junk values. Could you help to find the mistake there too?