Hello everyone!

I am currently writing a dynamic library for Mathematica that performs some sparse tensor contractions. One of the key functions that I use is cusparseSpMM (to multiply a sparse matrix in CSR form with a dense one). I kept getting the CUSPARSE_STATUS_INVALID_VALUE error every time I was calling cusparseSpMM_bufferSize (required before the actual matrix multiplication).

In an effort to identify the error, I copied the example for the sparse matrix-vector product (cusparseSpMV) from the cuSPARSE documentation, which worked flawlessly on my computer. I then modified the script such that it performs a matrix-matrix multiplication (by regarding the vectors as n-by-1 matrices). Now it also compiles fine, but I get the same CUSPARSE_STATUS_INVALID_VALUE runtime error.

I am including the code below.

// *** sp_example.c ***
// How to compile (assume CUDA is installed at /usr/local/cuda/)
//   nvcc sp_example.c -o sp_example -L/usr/local/cuda/lib64 -lcusparse -lcudart
#include <stdio.h>         // printf
#include <stdlib.h>        // EXIT_FAILURE
#include <cuda_runtime.h>  // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>      // cusparseSpMM

// Evaluates a CUDA runtime call. On any status other than cudaSuccess it
// prints the numeric status together with the current source line and makes
// the enclosing function return EXIT_FAILURE.
// The macro expands to a complete braced block, so call sites are written
// without a trailing semicolon.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed with error (%d) at line %d\n",                 \
               status, __LINE__);                                              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Evaluates a cuSPARSE call. On any status other than CUSPARSE_STATUS_SUCCESS
// it prints the numeric status together with the current source line and makes
// the enclosing function return EXIT_FAILURE.
// The macro expands to a complete braced block, so call sites are written
// without a trailing semicolon.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed with error (%d) at line %d\n",             \
               status, __LINE__);                                              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Multiplies a 4x4 sparse matrix A (CSR) by a 4x1 dense matrix X with
// cusparseSpMM and compares the product Y = alpha * A * X + beta * Y with a
// precomputed reference.
// Returns EXIT_FAILURE as soon as a CUDA or cuSPARSE call fails (via the
// CHECK_* macros), EXIT_SUCCESS otherwise; a wrong numerical result is only
// reported on stdout.
int main() {
    // Host problem definition
    const int A_num_rows = 4;
    const int A_num_cols = 4;
    const int A_num_nnz  = 9;
    int    hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
    int    hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
    double hA_Values[]     = { 1.0, 2.0, 3.0, 4.0, 5.0,
                               6.0, 7.0, 8.0, 9.0 };
    double h_X[]           = { 1.0, 2.0, 3.0, 4.0 };
    const double result[]  = { 19.0, 8.0, 51.0, 52.0 };
    double alpha = 1.0;
    double beta  = 0.0;

    // Device memory management
    int    *dA_csrOffsets, *dA_columns;
    double *dA_values, *d_X, *d_Y;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_num_nnz * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_values, A_num_nnz * sizeof(double)) )
    CHECK_CUDA( cudaMalloc((void**) &d_X, A_num_cols * sizeof(double)) )
    CHECK_CUDA( cudaMalloc((void**) &d_Y, A_num_rows * sizeof(double)) )

    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_num_nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_Values,
                           A_num_nnz * sizeof(double), cudaMemcpyHostToDevice) )
    // X has A_num_cols entries (one per column of A), matching its allocation.
    CHECK_CUDA( cudaMemcpy(d_X, h_X, A_num_cols * sizeof(double),
                           cudaMemcpyHostToDevice) )

    // cuSPARSE handle and descriptors
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matX, matY;
    void*  dBuffer    = NULL;
    size_t bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
    // Create sparse matrix A in CSR format
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F) )
    // Create dense matrix X: A_num_cols x 1, column-major, leading dimension
    // equal to its row count
    CHECK_CUSPARSE( cusparseCreateDnMat(&matX, A_num_cols, 1, A_num_cols, d_X,
                                        CUDA_R_64F, CUSPARSE_ORDER_COL) )
    // Create dense matrix Y: A_num_rows x 1, column-major
    CHECK_CUSPARSE( cusparseCreateDnMat(&matY, A_num_rows, 1, A_num_rows, d_Y,
                                        CUDA_R_64F, CUSPARSE_ORDER_COL) )

    // Allocate an external buffer if needed. SpMM takes an operation for both
    // the sparse operand (opA) and the dense operand (opB).
    // NOTE: on CUDA 10.1 this call is reported to return
    // CUSPARSE_STATUS_INVALID_VALUE for CSR matrices; the workaround given for
    // that release is to skip it and pass NULL as the buffer to cusparseSpMM.
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                                 handle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matX, &beta, matY, CUDA_R_64F,
                                 CUSPARSE_MM_ALG_DEFAULT, &bufferSize) )
    // cudaMalloc returns cudaError_t, so it is checked with CHECK_CUDA.
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )

    // execute SpMM
    CHECK_CUSPARSE( cusparseSpMM(handle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matX, &beta, matY, CUDA_R_64F,
                                 CUSPARSE_MM_ALG_DEFAULT, dBuffer) )

    // destroy matrix descriptors and the library handle
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matX) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matY) )
    CHECK_CUSPARSE( cusparseDestroy(handle) )

    // device result check
    double h_Y[A_num_rows];
    CHECK_CUDA( cudaMemcpy(h_Y, d_Y, A_num_rows * sizeof(double),
                           cudaMemcpyDeviceToHost) )

    int correct = 1;
    for (int i = 0; i < A_num_rows; i++) {
        // Inputs are small integers, so the products are exactly
        // representable and an exact comparison is adequate here.
        if (h_Y[i] != result[i]) {
            correct = 0;
            break;
        }
    }
    if (correct)
        printf("sp_example test PASSED\n");
    else
        printf("sp_example test FAILED: wrong result\n");

    // device memory deallocation
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(d_X) )
    CHECK_CUDA( cudaFree(d_Y) )
    return EXIT_SUCCESS;
}

The code was compiled with

nvcc sp_example.c -o sp_example -L/usr/local/cuda/lib64 -lcusparse -lcudart

The most probable cause is that I somehow mess up the indices when declaring the dense matrices, but I really cannot figure it out.

Thank you very much in advance!

Also, I forgot to mention that the error output is

CUSPARSE API failed with error (3) at line 84

which corresponds to the cusparseSpMM_bufferSize call.

I suggest filing a bug. The instructions are linked at the top of the CUDA programming forum.

Thanks for reporting the issue. It is a defect in the CUDA 10.1 library cusparseSpMM_bufferSize call, when the sparse matrix descriptor indicates CSR.

It should be fixed in a future CUDA release.

It happens that when the sparse matrix descriptor is CSR, cusparseSpMM (for CUDA 10.1) does not require any additional buffer memory. A workaround for this issue in CUDA 10.1 is to not call cusparseSpMM_bufferSize (if the matrix descriptor is CSR) and instead pass NULL for the corresponding pointer value to cusparseSpMM. Again, this is only applicable to the case where the sparse matrix descriptor passed to cusparseSpMM indicates CSR, and this workaround should only be used for CUDA 10.1.


Thank you very much for your reply! I ended up using the deprecated cusparseDcsrmm2, but I will change that to cusparseSpMM once this bug is fixed.