Problem multiplying two large sparse matrices in cuSPARSE

Hi,

I am new to using the cuSPARSE library for sparse matrix computations, and I have run into a problem computing the product of two large sparse matrices. I used the sample code (based on the level 3 routines) provided at: http://docs.nvidia.com/cuda/cusparse/#appendix-b-cusparse-library-c---example

The code works fine with (5, 5)x(5, 5) matrices. When I try large matrices, such as (60000, 40000)x(40000, 10000), the "convert the matrix from COO to CSR format" step completes without any problem, but the failure happens at the step that determines csrRowPtrC and the total number of nonzero elements:

status = cusparseXcsrgeamNnz(handle, m, n,
descrA, nnzA, csrRowPtrA, csrColIndA,
descrB, nnzB, csrRowPtrB, csrColIndB,
descrC, csrRowPtrC, nnzTotalDevHostPtr);

The “status” returns: CUSPARSE_STATUS_EXECUTION_FAILED

So where is the problem? Is it because the matrices exceed some maximum sparse matrix size? I used exactly the same code as for the 5x5 case.

Thanks so much for any help and suggestions!

By the way, I have double-checked that the same two large sparse matrices, e.g. (60000, 40000)x(40000, 10000), multiply without any problem in Matlab:
sps_a = sparse(ps_a(:,1),ps_a(:,2),ps_a(:,3));
sps_b = sparse(ps_b(:,1),ps_b(:,2),ps_b(:,3));
sps_ab = sps_a*sps_b;

Could someone help me with this?

If you want to provide a complete code that I can copy, paste, compile, and run, without having to add or change anything, I'll take a look. You should also provide the operating system, CUDA version, compile command line, and GPU used.

Note that cusparseXcsrgeamNnz, which you mention in your first posting, has nothing to do with sparse matrix-sparse matrix multiplication; the csrgeam routines perform sparse matrix addition, while multiplication uses the csrgemm routines.

I believe there is a good chance that a failure in cusparseXcsrgemmNnz indicates an underlying problem with the CSR matrix formatting. The fact that the COO-to-CSR conversion functions return no error does not mean the resulting CSR data is correct if the source COO data is not correct. You are expected/required to pass a properly formatted COO matrix to that function; if you don't, the behavior is undefined, and the routine is not guaranteed to catch any and all formatting errors.
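For what it's worth, here is a rough host-side sanity check of the kind I have in mind: a minimal sketch with hypothetical array names (not taken from your code) that verifies zero-based COO triplets are in-bounds and sorted by row before they are handed to cusparseXcoo2csr. Also remember that indices exported from Matlab are one-based and either need to be shifted down by one or the cusparse index base must be set accordingly.

#include <stdio.h>

/* Rough sanity check for zero-based COO triplets before COO->CSR conversion.
   cusparseXcoo2csr assumes the entries are sorted by row and in-bounds; it does
   not validate the input, so bad triplets silently produce a bad csrRowPtr. */
static int check_coo(const int *rowInd, const int *colInd, int nnz, int rows, int cols)
{
  for (int i = 0; i < nnz; i++){
    if (rowInd[i] < 0 || rowInd[i] >= rows){ printf("bad row index %d at entry %d\n", rowInd[i], i); return -1;}
    if (colInd[i] < 0 || colInd[i] >= cols){ printf("bad col index %d at entry %d\n", colInd[i], i); return -1;}
    if (i && rowInd[i] < rowInd[i-1])      { printf("entries not sorted by row at entry %d\n", i); return -1;}
  }
  return 0;
}

int main(){
  // tiny hypothetical 3x3 example; real data would come from your triplet lists,
  // with Matlab's 1-based indices shifted down by one
  int rows[] = {0, 1, 2};
  int cols[] = {0, 2, 1};
  if (check_coo(rows, cols, 3, 3, 3)) return -1;
  printf("COO data looks usable\n");
  // ...copy the row indices to the device, then e.g.
  // cusparseXcoo2csr(handle, d_cooRowInd, nnz, m, d_csrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
  return 0;
}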

The following is a complete code that performs sparse matrix multiplication of two CSR-format sparse matrices, C = A(200000, 100000) x B(100000, 100000).

compile with:

nvcc -o t727 t727.cu -lcusparse

tested on Fedora 20, CUDA 7, Quadro5000 GPU.

$ cat t727.cu
#include <cusparse_v2.h>
#include <stdio.h>

#define N 100000

// matrix generation and validation depends on these relationships:
#define SCL 2
#define K N
#define M (SCL*N)
// A: MxK  B: KxN  C: MxN

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// perform sparse-matrix multiplication C=AxB
int main(){

  cusparseStatus_t stat;
  cusparseHandle_t hndl;
  cusparseMatDescr_t descrA, descrB, descrC;
  int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
  int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC;
  float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
  int nnzA, nnzB, nnzC;
  int m,n,k;
  m = M;
  n = N;
  k = K;

// generate A, B=2I

/* A:
   |1.0 0.0 0.0 ...|
   |1.0 0.0 0.0 ...|
   |0.0 1.0 0.0 ...|
   |0.0 1.0 0.0 ...|
   |0.0 0.0 1.0 ...|
   |0.0 0.0 1.0 ...|
   ...

   B:
   |2.0 0.0 0.0 ...|
   |0.0 2.0 0.0 ...|
   |0.0 0.0 2.0 ...|
   ...                */

  nnzA = m;
  nnzB = n;
  h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
  h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
  h_csrColIndA = (int *)malloc(m*sizeof(int));
  h_csrColIndB = (int *)malloc(n*sizeof(int));
  h_csrValA  = (float *)malloc(m*sizeof(float));
  h_csrValB  = (float *)malloc(n*sizeof(float));
  if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
    {printf("malloc fail\n"); return -1;}
  for (int i = 0; i < m; i++){
    h_csrValA[i] = 1.0f;
    h_csrRowPtrA[i] = i;
    h_csrColIndA[i] = i/SCL;
    if (i < n){
      h_csrValB[i] = 2.0f;
      h_csrRowPtrB[i] = i;
      h_csrColIndB[i] = i;}
    }
  h_csrRowPtrA[m] = m;
  h_csrRowPtrB[n] = n;

// transfer data to device

  cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
  cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
  cudaMalloc(&csrColIndA, m*sizeof(int));
  cudaMalloc(&csrColIndB, n*sizeof(int));
  cudaMalloc(&csrValA, m*sizeof(float));
  cudaMalloc(&csrValB, n*sizeof(float));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(csrColIndA, h_csrColIndA, m*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(csrColIndB, h_csrColIndB, n*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(csrValA, h_csrValA, m*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(csrValB, h_csrValB, n*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");

// set cusparse matrix types
  CUSPARSE_CHECK(cusparseCreate(&hndl));
  stat = cusparseCreateMatDescr(&descrA);
  CUSPARSE_CHECK(stat);
  stat = cusparseCreateMatDescr(&descrB);
  CUSPARSE_CHECK(stat);
  stat = cusparseCreateMatDescr(&descrC);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSE_CHECK(stat);
  stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSE_CHECK(stat);
  cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;

// figure out size of C
  int baseC;
// nnzTotalDevHostPtr points to host memory
  int *nnzTotalDevHostPtr = &nnzC;
  stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
  CUSPARSE_CHECK(stat);
  cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
  cudaCheckErrors("cudaMalloc fail");
  stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
        descrA, nnzA, csrRowPtrA, csrColIndA,
        descrB, nnzB, csrRowPtrB, csrColIndB,
        descrC, csrRowPtrC, nnzTotalDevHostPtr );
  CUSPARSE_CHECK(stat);
  if (NULL != nnzTotalDevHostPtr){
    nnzC = *nnzTotalDevHostPtr;}
  else{
    cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    nnzC -= baseC;}
  cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
  cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
  cudaCheckErrors("cudaMalloc fail");
// perform multiplication C = A*B
  stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
        descrA, nnzA,
        csrValA, csrRowPtrA, csrColIndA,
        descrB, nnzB,
        csrValB, csrRowPtrB, csrColIndB,
        descrC,
        csrValC, csrRowPtrC, csrColIndC);
  CUSPARSE_CHECK(stat);

// copy result (C) back to host
  h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
  h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
  h_csrValC  = (float *)malloc(nnzC *sizeof(float));
  if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
    {printf("malloc fail\n"); return -1;}
  cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_csrColIndC, csrColIndC,  nnzC*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");

// check result, C = 2A
  if (nnzC != m) {printf("invalid matrix size C: %d, should be: %d\n", nnzC, m); return -1;}
  for (int i = 0; i < m; i++){
    if (h_csrRowPtrA[i] != h_csrRowPtrC[i]) {printf("A/C row ptr mismatch at %d, A: %d, C: %d\n", i, h_csrRowPtrA[i], h_csrRowPtrC[i]); return -1;}
    if (h_csrColIndA[i] != h_csrColIndC[i]) {printf("A/C col ind mismatch at %d, A: %d, C: %d\n", i, h_csrColIndA[i], h_csrColIndC[i]); return -1;}
    if ((h_csrValA[i]*2.0f) != h_csrValC[i]) {printf("A/C value mismatch at %d, A: %f, C: %f\n", i, h_csrValA[i]*2.0f, h_csrValC[i]); return -1;}
    }
  printf("Success!\n");
  return 0;
}
$

To verify the above (not that it needs it, but rather to show how the cusparse function calls show up in the nvprof profiling data), here is the profiler output:

==3084== NVPROF is profiling process 3084, command: ConsoleApplication1.exe
Success!
==3084== Profiling application: ConsoleApplication1.exe
==3084== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
213.80ms  69.024us                    -               -         -         -         -  800.00KB  11.590GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
213.96ms  35.712us                    -               -         -         -         -  400.00KB  11.201GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
214.14ms  68.704us                    -               -         -         -         -  800.00KB  11.644GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
214.30ms  36.031us                    -               -         -         -         -  400.00KB  11.102GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
214.47ms  69.728us                    -               -         -         -         -  800.00KB  11.473GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
217.22ms  36.000us                    -               -         -         -         -  400.00KB  11.111GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
731.58ms  11.008us                    -               -         -         -         -  800.00KB  72.675GB/s  GeForce GTX TIT         1         7  [CUDA memset]
731.59ms  9.6960us                    -               -         -         -         -  800.00KB  82.508GB/s  GeForce GTX TIT         1         7  [CUDA memset]
731.61ms  505.18us          (12500 1 1)       (128 1 1)        26  1.6240KB        0B         -           -  GeForce GTX TIT         1         7  void csrgemmNnz_kernel2<int=128, int=32, int=0, int=16>(csrgemmNnz_params) [386]
732.12ms  2.8800us                    -               -         -         -         -      128B  44.444MB/s  GeForce GTX TIT         1         7  [CUDA memset]
732.12ms  2.7520us                    -               -         -         -         -        4B  1.4535MB/s  GeForce GTX TIT         1         7  [CUDA memset]
732.13ms  6.7840us            (196 1 1)       (256 1 1)        21      256B        0B         -           -  GeForce GTX TIT         1         7  void cusparseIinclusive_localscan_core<int=256, int=4>(int, int*, int*, int*) [396]
732.14ms  3.9360us              (1 1 1)       (256 1 1)        22      264B        0B         -           -  GeForce GTX TIT         1         7  void cusparseIinclusive_scan_domino_v1_core<int=256, int=4>(int, int*, int*, int*, int*, int*) [405]
732.14ms  5.6320us            (196 1 1)       (256 1 1)        10        4B        0B         -           -  GeForce GTX TIT         1         7  void cusparseIinclusive_scan_merge_core<int=256, int=4>(int, int, int*, int*, int*) [413]
732.15ms  2.3680us                    -               -         -         -         -        4B  1.6892MB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]
732.79ms  11.264us                    -               -         -         -         -  800.00KB  71.023GB/s  GeForce GTX TIT         1         7  [CUDA memset]
732.80ms  9.6640us                    -               -         -         -         -  800.00KB  82.782GB/s  GeForce GTX TIT         1         7  [CUDA memset]
732.82ms  530.08us          (12500 1 1)       (128 1 1)        29  3.7440KB        0B         -           -  GeForce GTX TIT         1         7  void csrgemm_kernel2<float, int=128, int=32, int=0, int=16>(csrgemm_params<float>) [426]
733.46ms  64.511us                    -               -         -         -         -  800.00KB  12.401GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]
733.73ms  63.712us                    -               -         -         -         -  800.00KB  12.557GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]
734.02ms  63.872us                    -               -         -         -         -  800.00KB  12.525GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.

Thanks for posting the code sample, txbob.
While I have worked with cusparse before, it is always good to see worked examples showing how to use it.

I came across a similar problem. Everything goes well when using a small matrix (80180), but something goes wrong when I increase the matrix to (610009003).
The problem is that the cusparseScsrmm operation returns CUSPARSE_STATUS_SUCCESS, but after that, when I access GPU memory again, an 'illegal memory access' error comes out.
Can anyone help me?
By the way, there is plenty of GPU memory available.
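Not an answer, but one way to narrow that down: cusparse calls are asynchronous, so CUSPARSE_STATUS_SUCCESS only means the launch was accepted, not that the kernel completed. Below is a minimal sketch (the helper name and its placement are my own, not part of cusparse) that forces the failure to surface right after the call instead of at the next memory access:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Force any asynchronous failure from a preceding launch to surface here.
// Call this right after cusparseScsrmm (or any cusparse routine) while debugging.
static void sync_check(const char *what)
{
  cudaError_t err = cudaDeviceSynchronize();          // wait for launched kernels
  if (err == cudaSuccess) err = cudaGetLastError();   // pick up any sticky error
  if (err != cudaSuccess){
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    exit(1);
  }
}

// usage sketch (arguments omitted):
//   stat = cusparseScsrmm(handle, ...);
//   if (stat != CUSPARSE_STATUS_SUCCESS) { /* handle API error */ }
//   sync_check("cusparseScsrmm");

Running the application under cuda-memcheck should also point at the faulting access, and it is worth double-checking the ldb/ldc leading dimensions and the sizes of the dense B and C buffers passed to csrmm.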