Massive performance decrease in cuSPARSE spMM from v10.1 to v10.2

Hi there!

I was checking on some performance numbers again and recompiled and reran my programs for that purpose. After wondering why the results were so much worse than the ones I had before, I was able to isolate the problem to the cuSPARSE SpMM routine and the change from CUDA version 10.1 to 10.2. Depending on the exact layout of the CSR matrix, my SpMM runtime could go up by a factor of five.

Is that a (known) bug, or is it more likely that I am doing something wrong?

Here is a complete example for which I observe a roughly doubled runtime when using CUDA version 10.2.89 instead of 10.1.243, on both a V100 and a P100; the build command I use is given after the listing. (I saw an even worse runtime increase for a more complex CSR matrix, but this program is already long enough for a showcase.)

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
#include <iomanip>
#include <algorithm> // std::min, std::max
#include <cassert>
#include <limits>

int main()
{
  std::cout << "CUDART_VERSION = " << CUDART_VERSION << std::endl;

  cusparseStatus_t status;
  cudaError_t      error;

  constexpr int n   = 1024000; // size of csr matrix
  constexpr int k   =     512; // number of columns of dense matrices
  constexpr int nnz = 3*n-2*1000;

  // * * * * * raw data * * * * * //
  float alpha = 3.1415f;
  float beta  = 2.1718f;
  int*   csr_rowptr = nullptr;
  int*   csr_colidx = nullptr;
  float* csr_values = nullptr;
  float* dense_x    = nullptr;
  float* dense_y    = nullptr;

  error = cudaMallocManaged(&csr_rowptr, (n+1) * sizeof(int  )); assert(error == cudaSuccess);
  error = cudaMallocManaged(&csr_colidx,  nnz  * sizeof(int  )); assert(error == cudaSuccess);
  error = cudaMallocManaged(&csr_values,  nnz  * sizeof(float)); assert(error == cudaSuccess);
  error = cudaMallocManaged(&dense_x, n*k * sizeof(float)); assert(error == cudaSuccess);
  error = cudaMallocManaged(&dense_y, n*k * sizeof(float)); assert(error == cudaSuccess);

  error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed

  // fill arrays with values
  // three diagonals:
  // main diagonal, +1000 offset and -1000 offset
  // the two or three entries in row i are i
  int off = 0;
  for (int i(0); i < 1000; ++i)
  {
    csr_rowptr[i] = 2*i;
    csr_colidx[off  ] = i;
    csr_values[off++] = float(i);
    csr_colidx[off  ] = i+1000;
    csr_values[off++] = float(i);
  }
  assert(off == 2000);
  for (int i(1000); i < n-1000; ++i)
  {
    csr_rowptr[i] = 3*i - 1000; // rows 0..999 hold 2 entries each, rows 1000..i-1 hold 3
    csr_colidx[off  ] = i-1000;
    csr_values[off++] = float(i);
    csr_colidx[off  ] = i;
    csr_values[off++] = float(i);
    csr_colidx[off  ] = i+1000;
    csr_values[off++] = float(i);
  }
  assert(off == nnz - 2000);
  for (int i(n-1000); i < n; ++i)
  {
    csr_rowptr[i] = 2*i + (n - 2000); // back to 2 entries per row for the last 1000 rows
    csr_colidx[off  ] = i-1000;
    csr_values[off++] = float(i);
    csr_colidx[off  ] = i;
    csr_values[off++] = float(i);
  }
  assert(off == nnz);
  csr_rowptr[n] = nnz; // sentinel entry: total number of nonzeros
  for (int i(0); i < n*k; ++i)
  {
    dense_x[i] = 123.45f;
    dense_y[i] = 54.321f;
  }

  error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed

  // * * * * * create cuSPARSE matrices * * * * * //
  cusparseSpMatDescr_t descr_csr;
  cusparseDnMatDescr_t descr_dense_x, descr_dense_y;
  status = cusparseCreateCsr(&descr_csr, n, n, nnz,
                             csr_rowptr, csr_colidx, csr_values,
                             CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  assert(status == CUSPARSE_STATUS_SUCCESS);
  status = cusparseCreateDnMat(&descr_dense_x, n, k, n,
                               dense_x, CUDA_R_32F, CUSPARSE_ORDER_COL);
  assert(status == CUSPARSE_STATUS_SUCCESS);
  status = cusparseCreateDnMat(&descr_dense_y, n, k, n,
                               dense_y, CUDA_R_32F, CUSPARSE_ORDER_COL);
  assert(status == CUSPARSE_STATUS_SUCCESS);

  error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed

  // * * * * * last preparations * * * * * //
  cusparseHandle_t handle;
  status = cusparseCreate(&handle); assert(status == CUSPARSE_STATUS_SUCCESS);

  // initialize cusparse matmat routine
  size_t buffer_size = 0;
  void*  buffer      = nullptr;
  status = cusparseSpMM_bufferSize(handle,
                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                                   (const void*)&alpha,
                                   descr_csr,
                                   descr_dense_x,
                                   (const void*)&beta,
                                   descr_dense_y,
                                   CUDA_R_32F,
                                   CUSPARSE_CSRMM_ALG1,
                                   &buffer_size);
  assert(status == CUSPARSE_STATUS_SUCCESS);
  if (buffer_size > 0)
  { error = cudaMallocManaged(&buffer, buffer_size); assert(error == cudaSuccess); }

  // wrap matmat routine in a lambda
  auto spmm = [&]() {
    return cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, descr_csr, descr_dense_x, &beta, descr_dense_y,
                        CUDA_R_32F, CUSPARSE_CSRMM_ALG1, buffer);
  };

  cudaEvent_t start, stop;
  error = cudaEventCreate(&start); assert(error == cudaSuccess);
  error = cudaEventCreate(&stop); assert(error == cudaSuccess);

  float t_min   = std::numeric_limits<float>::max();
  float t_max   = 0.f;
  float t_total = 0.f;

  // * * * * * timing * * * * * //
  // warmup
  status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
  status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
  error = cudaDeviceSynchronize(); assert(error == cudaSuccess); // not needed

  for (int i(0); i < 50; ++i)
  {
    error = cudaEventRecord(start, NULL); assert(error == cudaSuccess);
    status = spmm(); assert(status == CUSPARSE_STATUS_SUCCESS);
    error = cudaEventRecord(stop, NULL); assert(error == cudaSuccess);
    error = cudaEventSynchronize(stop); assert(error == cudaSuccess);
    float msec;
    error = cudaEventElapsedTime(&msec, start, stop); assert(error == cudaSuccess);
    t_min = std::min(t_min, msec);
    t_max = std::max(t_max, msec);
    t_total += msec;
  }

  // * * * * * output * * * * * //
  // informative output of setup
  std::cout << "operation: csrmm: Y = a*A*X + b*Y" << std::endl
            << "                  A: csr matrix in R^NxN" << std::endl
            << "                  X,Y: dense matrices in R^NxK" << std::endl
            << "                  a,b: real values" << std::endl
            << "n  : " << n   << std::endl
            << "k  : " << k   << std::endl
            << "nnz: " << nnz << std::endl;
  float flop = ((long)(2)*nnz*k // Z := alpha * A*X
                + 2*n*k         // Y := Z + beta*Y
               );
  float byte = (  sizeof(float) * 3*n*k   // 2 dense loads, 1 dense store
                + sizeof(int)   * nnz     // csr_colidx
                + sizeof(int)   * (n+1)   // csr_rowptr
                + sizeof(float) * nnz     // csr_values
               );
  std::cout << "flop: " << flop << std::endl;
  std::cout << "byte: " << byte << std::endl;

  // informative output of measurement
  std::cout << "msec min/avg/max   : " << t_min << " / " << t_total/50.f << " / " << t_max << std::endl;
  std::cout << std::fixed << std::setprecision(0);
  std::cout << "GLOP/s min/avg/max : " << flop/t_min*1e-6f << " / " << flop/(t_total/50.f)*1e-6f << " / " << flop/t_max*1e-6f << std::endl;
  std::cout << "GByte/s min/avg/max: " << byte/t_min*1e-6f << " / " << byte/(t_total/50.f)*1e-6f << " / " << byte/t_max*1e-6f << std::endl;

  // * * * * * cleanup * * * * * //
  status = cusparseDestroySpMat(descr_csr    ); assert(status == CUSPARSE_STATUS_SUCCESS);
  status = cusparseDestroyDnMat(descr_dense_x); assert(status == CUSPARSE_STATUS_SUCCESS);
  status = cusparseDestroyDnMat(descr_dense_y); assert(status == CUSPARSE_STATUS_SUCCESS);
  status = cusparseDestroy(handle); assert(status == CUSPARSE_STATUS_SUCCESS);
  error = cudaEventDestroy(start); assert(error == cudaSuccess);
  error = cudaEventDestroy(stop ); assert(error == cudaSuccess);
  if (buffer != nullptr) { error = cudaFree(buffer); assert(error == cudaSuccess); }
  error = cudaFree(csr_rowptr); assert(error == cudaSuccess);
  error = cudaFree(csr_colidx); assert(error == cudaSuccess);
  error = cudaFree(csr_values); assert(error == cudaSuccess);
  error = cudaFree(dense_x   ); assert(error == cudaSuccess);
  error = cudaFree(dense_y   ); assert(error == cudaSuccess);

  return 0;
}
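
For reference, I build the example with something along these lines (the exact flags should not matter much; sm_70 targets the V100, sm_60 the P100, and the file name is just a placeholder):

nvcc -O3 -arch=sm_70 -o spmm_bench spmm_bench.cu -lcusparse

I recompile and rerun with each toolkit version and compare the reported timings.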

I cannot say anything about version 11 yet. I hope I am doing something wrong here, or that it is fixed there.

Hi,
I just moved from CUDA 10.1, where I was using the (now deprecated) cusparseZ(D)csrmm2, to CUDA 11.0, where I am using cusparseSpMM. I also see a significant performance decrease. A similar issue has also been reported in (Performance Downgrade when changing [deprecated] cusparse<t>csrmm() to cusparseSpMM()).
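For context, the change on my side looks roughly like the sketch below (double-precision path; variable names are placeholders and error checking is omitted, so this illustrates the migration rather than my exact code):

// old path (CUDA 10.1, deprecated): a single call on the raw CSR arrays
// A is m x k, B is k x n (leading dimension ldb), C is m x n (leading dimension ldc)
cusparseDcsrmm2(handle,
                CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                m, n, k, nnz, &alpha, descrA,
                csr_values, csr_rowptr, csr_colidx,
                B, ldb, &beta, C, ldc);

// new path (CUDA 11.0 generic API): create descriptors, query workspace, then SpMM
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
cusparseCreateCsr(&matA, m, k, nnz, csr_rowptr, csr_colidx, csr_values,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
cusparseCreateDnMat(&matB, k, n, ldb, B, CUDA_R_64F, CUSPARSE_ORDER_COL);
cusparseCreateDnMat(&matC, m, n, ldc, C, CUDA_R_64F, CUSPARSE_ORDER_COL);

size_t buffer_size = 0;
void*  buffer      = nullptr;
cusparseSpMM_bufferSize(handle,
                        CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, matA, matB, &beta, matC,
                        CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &buffer_size);
cudaMalloc(&buffer, buffer_size);
cusparseSpMM(handle,
             CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
             &alpha, matA, matB, &beta, matC,
             CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, buffer);

The descriptor creation and buffer-size query happen once up front; only the repeated cusparseSpMM call is timed.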
Is there any progress here?
Many thanks,
Regards,
Reto