Does the cusparseSbsrmm function work for large values of n?

cusparseStatus_t cusparseSbsrmm(cusparseHandle_t handle,
cusparseDirection_t dirA,
cusparseOperation_t transA,
cusparseOperation_t transB,
int mb,
int n,
int kb,
int nnzb,
const float* alpha,
const cusparseMatDescr_t descrA,
const float* bsrValA,
const int* bsrRowPtrA,
const int* bsrColIndA,
int blockDim,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc)

In my case n is 786432, and m and k are 16 and 144. Running cusparseSbsrmm with these m, k, n values fails with the error “CUSPARSE call failed with code : 3” (status 3 is CUSPARSE_STATUS_INVALID_VALUE). Is there an upper bound on n beyond which cusparseSbsrmm fails to work?

Do you have a complete reproducer?

Here is a minimal reproducer:

$ cat t1943.cu
#include <cusparse_v2.h>
#include <cstdlib>
#include <cstring>
#include <iostream>

int main(int argc, char *argv[]){

  int my_n = 786432;
  if (argc > 1) my_n = atoi(argv[1]);
  std::cout << "n = " << my_n << std::endl;

  cusparseStatus_t         cstat;
  cusparseHandle_t         handle;
  cusparseDirection_t      dirA = CUSPARSE_DIRECTION_ROW;
  cusparseOperation_t      transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  cusparseOperation_t      transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
  int                      mb = 16;
  int                      n = my_n;
  int                      kb = 144;
  int                      nnzb = mb;
  float                    alpha = 1.0f;
  cusparseMatDescr_t       descrA;
  float*                   bsrValA;
  int*                     bsrRowPtrA;
  int*                     bsrColIndA;
  int                      blockDim = 2;
  int                      k = kb*blockDim;
  int                      m = mb*blockDim;
  float*                   B;
  int                      ldb = k;
  float                    beta = 0.0f;
  float*                   C;
  int                      ldc = m;

  cusparseCreate(&handle);
  cusparseCreateMatDescr(&descrA);
  cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
  cudaMallocManaged(&B, sizeof(B[0])*k*n);
  cudaMallocManaged(&C, sizeof(C[0])*m*n);
  cudaMallocManaged(&bsrValA, sizeof(bsrValA[0])*nnzb*blockDim*blockDim);
  for (int i = 0; i < nnzb*blockDim*blockDim; i++) bsrValA[i] = 1.0f;
  memset(B, 0, sizeof(B[0])*k*n);
  memset(C, 0, sizeof(C[0])*m*n);
  cudaMallocManaged(&bsrColIndA, sizeof(bsrColIndA[0])*nnzb);
  cudaMallocManaged(&bsrRowPtrA, sizeof(bsrRowPtrA[0])*(mb+1));
  for (int i = 0; i < nnzb; i++) bsrColIndA[i] = i;
  for (int i = 0; i < (mb+1); i++) bsrRowPtrA[i] = i;

  cstat = cusparseSbsrmm(handle,
                         dirA,
                         transA,
                         transB,
                         mb,
                         n,
                         kb,
                         nnzb,
                         &alpha,
                         descrA,
                         bsrValA,
                         bsrRowPtrA,
                         bsrColIndA,
                         blockDim,
                         B,
                         ldb,
                         &beta,
                         C,
                         ldc);

  std::cout << "status = " <<  (int)cstat << std::endl;
}

$ nvcc -o t1943 t1943.cu -lcusparse
$ ./t1943 500000
n = 500000
status = 0
$ ./t1943 600000
n = 600000
status = 3
$

CUDA 11.4.1, V100

On my machine, the failure threshold lies between n = 524270 (passes) and n = 524280 (fails). I note that this is just below 512*1024 (524288).

(compute-sanitizer reports no issues in either passing or failing cases)

I have filed (internal) Bug 3484035 for this issue (to have documentation updated).

1 Like