cannot get proper results for cusparseSnnz

Hello,
I tried to use cusparseSnnz but I couldn’t get proper results then I try to run and compile the example cusparse.cu from the book “Professional Cuda C programming” but also I cannot get good results. I try to print the vectors of the matrix in the CSR format but they are not as I expected.
I add the version with some modifications. Thank you.

#include “…/common/common.h”
#include <stdio.h>
#include <stdlib.h>
#include <cusparse_v2.h>
#include <cuda.h>

/*

  • This is an example demonstrating usage of the cuSPARSE library to perform a
  • sparse matrix-vector multiplication on randomly generated data.
    */

/*

  • M = # of rows
  • N = # of columns
    */
    int M = 3;
    int N = 3;

/*

  • Generate a vector of length N with random single-precision floating-point

  • values between 0 and 100.
    */
    void generate_random_vector(int N, float **outX)
    {
    int i;
    double rMax = (double)RAND_MAX;
    float *X = (float *)malloc(sizeof(float) * N);

    for (i = 0; i < N; i++)
    {
    int r = rand();
    double dr = (double)r;
    X[i] = (dr / rMax) * 100.0;
    }

    *outX = X;
    }

/*

  • Generate random dense matrix A in column-major order, while rounding some

  • elements down to zero to ensure it is sparse.
    */
    int generate_random_dense_matrix(int M, int N, float **outA)
    {
    int i, j;
    double rMax = (double)RAND_MAX;
    float *A = (float *)malloc(sizeof(float) * M * N);
    int totalNnz = 0;

    for (j = 0; j < N; j++)
    {
    for (i = 0; i < M; i++)
    {
    int r = rand();
    float *curr = A + (j * M + i);

         if (r % 3 > 0)
         {
             *curr = 0.0f;
         }
         else
         {
             double dr = (double)r;
             *curr = (dr / rMax) * 100.0;
         }
    
         if (*curr != 0.0f)
         {
             totalNnz++;
         }
     }
    

    }

    *outA = A;
    return totalNnz;
    }

int main(int argc, char **argv)
{
int row;
float *A, *dA;
int *dNnzPerRow;
float *dCsrValA;
int *dCsrRowPtrA;
int *dCsrColIndA;
int totalNnz;
float alpha = 3.0f;
float beta = 4.0f;
float *dX, *X;
float *dY, *Y;
cusparseHandle_t handle = 0;
cusparseMatDescr_t descr = 0;

// Generate input
srand(9384);
int trueNnz = generate_random_dense_matrix(M, N, &A);
generate_random_vector(N, &X);
generate_random_vector(M, &Y);

// Create the cuSPARSE handle
CHECK_CUSPARSE(cusparseCreate(&handle));

// Allocate device memory for vectors and the dense form of the matrix A
CHECK(cudaMalloc((void **)&dX, sizeof(float) * N));
CHECK(cudaMalloc((void **)&dY, sizeof(float) * M));
CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N));
CHECK(cudaMalloc((void **)&dNnzPerRow, sizeof(int) * M));

// Construct a descriptor of the matrix A
CHECK_CUSPARSE(cusparseCreateMatDescr(&descr));
CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));

// Transfer the input vectors and dense matrix A to the device
CHECK(cudaMemcpy(dX, X, sizeof(float) * N, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dY, Y, sizeof(float) * M, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dA, A, sizeof(float) * M * N, cudaMemcpyHostToDevice));

int *nnzh=NULL;

// Compute the number of non-zero elements in A
CHECK_CUSPARSE(cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,
                            M, dNnzPerRow, &totalNnz));

nnzh = (int *)malloc(sizeof(int)*N);
cudaMemcpy(dNnzPerRow,nnzh, N*sizeof(int), cudaMemcpyDeviceToHost);

for( int i=0 ; i<N ;i++)
{
printf(“nnzh[%d]=%d\n”,i,nnzh[i]);
}
printf(“totalNnz=%d\n”,totalNnz);

if (totalNnz != trueNnz)
{
    fprintf(stderr, "Difference detected between cuSPARSE NNZ and true "
            "value: expected %d but got %d\n", trueNnz, totalNnz);
    return 1;
}

// Allocate device memory to store the sparse CSR representation of A
CHECK(cudaMalloc((void **)&dCsrValA, sizeof(float) * totalNnz));
CHECK(cudaMalloc((void **)&dCsrRowPtrA, sizeof(int) * (M + 1)));
CHECK(cudaMalloc((void **)&dCsrColIndA, sizeof(int) * totalNnz));

// Convert A from a dense formatting to a CSR formatting, using the GPU
CHECK_CUSPARSE(cusparseSdense2csr(handle, M, N, descr, dA, M, dNnzPerRow,
                                  dCsrValA, dCsrRowPtrA, dCsrColIndA));


// Perform matrix-vector multiplication with the CSR-formatted matrix A
CHECK_CUSPARSE(cusparseScsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                              M, N, totalNnz, &alpha, descr, dCsrValA,
                              dCsrRowPtrA, dCsrColIndA, dX, &beta, dY));

// Copy the result vector back to the host
CHECK(cudaMemcpy(Y, dY, sizeof(float) * M, cudaMemcpyDeviceToHost));

for (row = 0; row < 10; row++)
{
    printf("%2.2f\n", Y[row]);
}

printf("...\n");

free(A);
free(X);
free(Y);

CHECK(cudaFree(dX));
CHECK(cudaFree(dY));
CHECK(cudaFree(dA));
CHECK(cudaFree(dNnzPerRow));
CHECK(cudaFree(dCsrValA));
CHECK(cudaFree(dCsrRowPtrA));
CHECK(cudaFree(dCsrColIndA));

CHECK_CUSPARSE(cusparseDestroyMatDescr(descr));
CHECK_CUSPARSE(cusparseDestroy(handle));


return 0;

}

This line of code is not correct:

cudaMemcpy(dNnzPerRow,nnzh, N*sizeof(int), cudaMemcpyDeviceToHost);

If you had wrapped that in a CHECK() macro, you would see that it is throwing an error. You have the source and destination pointers reversed, it should be:

cudaMemcpy(nnzh, dNnzPerRow, N*sizeof(int), cudaMemcpyDeviceToHost);

This also doesn’t make sense:

for (row = 0; row < 10; row++)
{
printf("%2.2f\n", Y[row]);
}

Y is only allocated to a size of M which is 3. So iterating row up to 10 is out-of-bounds.