I am trying to test sparse matrix multiplciation using cusparseScsrmm().
The code is simple as the following:
#include <stdio.h>
#include <cuda.h>
#include "cusparse_v2.h"
#include "cublas_v2.h"
const int M = 4;
const int N = 3;
void genMatrix(int m, int n, float *A)
{
for(int i=0; i<m; i++)
for(int j=0; j<n; j++)
A[i*n+j] = i*n+j;
}
void printMat(float *A, int m, int n)
{
for(int i=0; i<m; i++){
for(int j=0; j<n; j++)
printf("%2.2f ", A[i*n+j]);
printf("\n");
}
printf("\n");
}
void printVal(float *val, int m)
{
for(int i=0; i<m; i++)
printf("%2.2f ", val[i]);
printf("\n\n");
}
void printIdx(int *idx, int m)
{
for(int i=0; i<m; i++)
printf("%2d ", idx[i]);
printf("\n\n");
}
int main(void)
{
float *hA = (float*)malloc(sizeof(float)*M*N);
float *hB = (float*)malloc(sizeof(float)*N*M);
float *hC = (float*)malloc(sizeof(float)*M*M);
genMatrix(M, N, hA);
genMatrix(N, M, hB);
float *dA, *dB, *dC;
cudaMalloc((void**)&dA, sizeof(float)*M*N);
cudaMalloc((void**)&dB, sizeof(float)*N*M);
cudaMalloc((void**)&dC, sizeof(float)*M*M);
cudaMemcpy(dA, hA, sizeof(float)*M*N, cudaMemcpyHostToDevice);
cudaMemcpy(dB, hB, sizeof(float)*N*M, cudaMemcpyHostToDevice);
float one = 1.0;
float zero = 0.0;
cusparseHandle_t cusparseHandle = 0;
cusparseMatDescr_t descr = 0;
cusparseCreate(&cusparseHandle);
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST);
int *d_nNonzPerRowA;
int nNonzA;
cudaMalloc((void**)&d_nNonzPerRowA, sizeof(int)*M);
cusparseSnnz(cusparseHandle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA, M, d_nNonzPerRowA, &nNonzA);
float *d_csrVal_A;
int *d_csrRowPtr_A, *d_csrColIdx_A;
float *h_csrVal_A;
int *h_csrRowPtr_A, *h_csrColIdx_A;
h_csrVal_A = (float*)malloc(sizeof(float)*nNonzA);
h_csrRowPtr_A = (int*)malloc(sizeof(int)*(M+1));
h_csrColIdx_A = (int*)malloc(sizeof(int)*nNonzA);
cudaMalloc((void**)&d_csrVal_A, sizeof(float)*nNonzA);
cudaMalloc((void**)&d_csrRowPtr_A, sizeof(int)*(M+1));
cudaMalloc((void**)&d_csrColIdx_A, sizeof(int)*nNonzA);
cusparseSdense2csr(cusparseHandle, M, N, descr, dA, M, d_nNonzPerRowA,
d_csrVal_A, d_csrRowPtr_A, d_csrColIdx_A);
cudaMemcpy(h_csrVal_A, d_csrVal_A, sizeof(float)*nNonzA, cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrRowPtr_A, d_csrRowPtr_A, sizeof(int)*nNonzA, cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrColIdx_A, d_csrColIdx_A, sizeof(int)*(M+1), cudaMemcpyDeviceToHost);
printf("A:\n");
printMat(hA, M, N);
printf("h_csrVal_A:\n");
printVal(h_csrVal_A, nNonzA);
printf("h_csrRowPtr_A:\n");
printIdx(h_csrRowPtr_A, M+1);
printf("h_csrColIdx_A:\n");
printIdx(h_csrColIdx_A, nNonzA);
cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, M, N, N, nNonzA, &one, descr,
d_csrVal_A, d_csrRowPtr_A, d_csrColIdx_A, dB, N, &zero, dC, M);
cudaMemcpy(hC, dC, sizeof(float)*M*M, cudaMemcpyDeviceToHost);
printf("B:\n");
printMat(hB, N, M);
printf("C:\n");
printMat(hC, M, M);
cublasHandle_t cublasHandle = 0;
cublasCreate(&cublasHandle);
return 0;
}
However, for some reasons I keep getting weird results. The following is the result I got.
A:
0.00 1.00 2.00
3.00 4.00 5.00
6.00 7.00 8.00
9.00 10.00 11.00
h_csrVal_A:
4.00 8.00 1.00 5.00 9.00 2.00 6.00 10.00 3.00 7.00 11.00
h_csrRowPtr_A:
33040128 0 28054528 0 -1437690858
h_csrColIdx_A:
1 2 0 1 2 0 33040464 0 32987456 0 0
B:
0.00 1.00 2.00 3.00
4.00 5.00 6.00 7.00
8.00 9.00 10.00 11.00
C:
20.00 23.00 26.00 29.00
56.00 68.00 80.00 92.00
92.00 113.00 134.00 155.00
-nan -nan -nan -nan
I don’t really know why it happens. What is the problem!? Thank you in advance.