CuSparse Matrix Multiplication Fails Silently

I’m seeing what appears to be a bug in cusparseSpMM when using CUSPARSE_SPMM_CSR_ALG2 on an H100 (CUDA Toolkit 12.5). The issue happens when multiplying a small CSR matrix (13×9) with a very wide dense matrix (9×10,000,000). CUSPARSE_SPMM_CSR_ALG3 returns correct results, but ALG2 produces an output matrix full of zeros even though the call reports success. For context, though it is weird to store the first matrix as sparse this comes from an application where the first matrix might be extremely large and sparse.

The minimal reproducer is below. It builds a fixed CSR matrix A (45 nnz), creates a dense B of size 9×10,000,000 with random binary entries, and runs SpMM twice: once with ALG2 and once with ALG3. Both calls return CUSPARSE_STATUS_SUCCESS, no CUDA errors occur, and buffer sizes are computed normally.

#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>

#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",            \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("cuSPARSE API failed at line %d with error: %d\n",             \
               __LINE__, status);                                              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

int main() {
    // Matrix dimensions
    const int A_num_rows = 13;
    const int A_num_cols = 9;
    const int B_num_cols = 10000000;  // Large column count triggers the issue

    std::cout << "Testing CUSPARSE_SPMM_CSR_ALG2 vs CUSPARSE_SPMM_CSR_ALG3" << std::endl;
    std::cout << "Sparse matrix A: " << A_num_rows << " x " << A_num_cols << std::endl;
    std::cout << "Dense matrix B: " << A_num_cols << " x " << B_num_cols << std::endl;

    // Create a simple sparse matrix A (13x9) in CSR format
    // This matrix has 3 non-zeros per row (except last row which has all 9)
    std::vector<int> h_A_csrOffsets = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 45};
    std::vector<int> h_A_columns = {
        1, 2, 3,
        0, 2, 4,
        0, 1, 5,
        4, 5, 6,
        3, 5, 7,
        3, 4, 8,
        0, 7, 8,
        1, 6, 8,
        2, 6, 7,
        0, 3, 6,
        1, 4, 7,
        2, 5, 8,
        0, 1, 2, 3, 4, 5, 6, 7, 8
    };
    std::vector<float> h_A_values(45, 1.0f);  // All non-zero values are 1.0
    const int A_nnz = h_A_values.size();

    // Allocate and copy sparse matrix A to device
    int *d_A_csrOffsets, *d_A_columns;
    float *d_A_values;
    CHECK_CUDA(cudaMalloc((void**)&d_A_csrOffsets, (A_num_rows + 1) * sizeof(int)));
    CHECK_CUDA(cudaMalloc((void**)&d_A_columns, A_nnz * sizeof(int)));
    CHECK_CUDA(cudaMalloc((void**)&d_A_values, A_nnz * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_A_csrOffsets, h_A_csrOffsets.data(),
                          (A_num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_A_columns, h_A_columns.data(),
                          A_nnz * sizeof(int), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_A_values, h_A_values.data(),
                          A_nnz * sizeof(float), cudaMemcpyHostToDevice));

    // Create random dense matrix B with binary values (0 or 1)
    std::cout << "Generating random dense matrix B..." << std::endl;
    std::srand(42);  // Fixed seed for reproducibility
    std::vector<float> h_B(A_num_cols * B_num_cols);
    for (size_t i = 0; i < h_B.size(); ++i) {
        h_B[i] = (std::rand() % 2 == 0) ? 0.0f : 1.0f;
    }

    float *d_B;
    CHECK_CUDA(cudaMalloc((void**)&d_B, A_num_cols * B_num_cols * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_B, h_B.data(),
                          A_num_cols * B_num_cols * sizeof(float),
                          cudaMemcpyHostToDevice));

    // Allocate output matrices for both algorithms
    float *d_C_alg2, *d_C_alg3;
    CHECK_CUDA(cudaMalloc((void**)&d_C_alg2, A_num_rows * B_num_cols * sizeof(float)));
    CHECK_CUDA(cudaMalloc((void**)&d_C_alg3, A_num_rows * B_num_cols * sizeof(float)));
    CHECK_CUDA(cudaMemset(d_C_alg2, 0, A_num_rows * B_num_cols * sizeof(float)));
    CHECK_CUDA(cudaMemset(d_C_alg3, 0, A_num_rows * B_num_cols * sizeof(float)));

    // Create cuSPARSE handle and matrix descriptors
    cusparseHandle_t handle;
    CHECK_CUSPARSE(cusparseCreate(&handle));

    cusparseSpMatDescr_t matA;
    CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
                                     d_A_csrOffsets, d_A_columns, d_A_values,
                                     CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                     CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));

    cusparseDnMatDescr_t matB;
    CHECK_CUSPARSE(cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, B_num_cols,
                                       d_B, CUDA_R_32F, CUSPARSE_ORDER_ROW));

    cusparseDnMatDescr_t matC_alg2, matC_alg3;
    CHECK_CUSPARSE(cusparseCreateDnMat(&matC_alg2, A_num_rows, B_num_cols, B_num_cols,
                                       d_C_alg2, CUDA_R_32F, CUSPARSE_ORDER_ROW));
    CHECK_CUSPARSE(cusparseCreateDnMat(&matC_alg3, A_num_rows, B_num_cols, B_num_cols,
                                       d_C_alg3, CUDA_R_32F, CUSPARSE_ORDER_ROW));

    float alpha = 1.0f;
    float beta = 0.0f;

    // Execute SpMM with CUSPARSE_SPMM_CSR_ALG2
    std::cout << "\nRunning CUSPARSE_SPMM_CSR_ALG2..." << std::endl;
    size_t bufferSize_alg2 = 0;
    CHECK_CUSPARSE(cusparseSpMM_bufferSize(handle,
                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           &alpha, matA, matB, &beta, matC_alg2,
                                           CUDA_R_32F, CUSPARSE_SPMM_CSR_ALG2,
                                           &bufferSize_alg2));
    void* dBuffer_alg2 = nullptr;
    if (bufferSize_alg2 > 0) {
        CHECK_CUDA(cudaMalloc(&dBuffer_alg2, bufferSize_alg2));
    }
    CHECK_CUSPARSE(cusparseSpMM(handle,
                                CUSPARSE_OPERATION_NON_TRANSPOSE,
                                CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &alpha, matA, matB, &beta, matC_alg2,
                                CUDA_R_32F, CUSPARSE_SPMM_CSR_ALG2,
                                dBuffer_alg2));
    CHECK_CUDA(cudaDeviceSynchronize());

    // Execute SpMM with CUSPARSE_SPMM_CSR_ALG3
    std::cout << "Running CUSPARSE_SPMM_CSR_ALG3..." << std::endl;
    size_t bufferSize_alg3 = 0;
    CHECK_CUSPARSE(cusparseSpMM_bufferSize(handle,
                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           &alpha, matA, matB, &beta, matC_alg3,
                                           CUDA_R_32F, CUSPARSE_SPMM_CSR_ALG3,
                                           &bufferSize_alg3));
    void* dBuffer_alg3 = nullptr;
    if (bufferSize_alg3 > 0) {
        CHECK_CUDA(cudaMalloc(&dBuffer_alg3, bufferSize_alg3));
    }
    CHECK_CUSPARSE(cusparseSpMM(handle,
                                CUSPARSE_OPERATION_NON_TRANSPOSE,
                                CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &alpha, matA, matB, &beta, matC_alg3,
                                CUDA_R_32F, CUSPARSE_SPMM_CSR_ALG3,
                                dBuffer_alg3));
    CHECK_CUDA(cudaDeviceSynchronize());

    // Copy results back to host
    std::cout << "\nComparing results..." << std::endl;
    std::vector<float> h_C_alg2(A_num_rows * B_num_cols);
    std::vector<float> h_C_alg3(A_num_rows * B_num_cols);
    CHECK_CUDA(cudaMemcpy(h_C_alg2.data(), d_C_alg2,
                          A_num_rows * B_num_cols * sizeof(float),
                          cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_C_alg3.data(), d_C_alg3,
                          A_num_rows * B_num_cols * sizeof(float),
                          cudaMemcpyDeviceToHost));

    // Compare results
    size_t num_differences = 0;
    size_t num_zeros_alg2 = 0;
    size_t num_zeros_alg3 = 0;
    double sum_alg2 = 0.0;
    double sum_alg3 = 0.0;

    for (size_t i = 0; i < h_C_alg2.size(); ++i) {
        if (h_C_alg2[i] == 0.0f) num_zeros_alg2++;
        if (h_C_alg3[i] == 0.0f) num_zeros_alg3++;
        if (h_C_alg2[i] != h_C_alg3[i]) num_differences++;
        sum_alg2 += h_C_alg2[i];
        sum_alg3 += h_C_alg3[i];
    }

    size_t total = h_C_alg2.size();
    std::cout << "\nResults:" << std::endl;
    std::cout << "  ALG2 - Sum: " << sum_alg2 << ", Zeros: " << num_zeros_alg2
              << " (" << (100.0 * num_zeros_alg2 / total) << "%)" << std::endl;
    std::cout << "  ALG3 - Sum: " << sum_alg3 << ", Zeros: " << num_zeros_alg3
              << " (" << (100.0 * num_zeros_alg3 / total) << "%)" << std::endl;
    std::cout << "  Differences: " << num_differences
              << " (" << (100.0 * num_differences / total) << "%)" << std::endl;

    // Sample output
    std::cout << "\nSample (first 10 elements):" << std::endl;
    std::cout << "  ALG2: ";
    for (int i = 0; i < 10; ++i) std::cout << h_C_alg2[i] << " ";
    std::cout << std::endl;
    std::cout << "  ALG3: ";
    for (int i = 0; i < 10; ++i) std::cout << h_C_alg3[i] << " ";
    std::cout << std::endl;

    // Verdict
    std::cout << "\n*** ";
    if (num_zeros_alg2 == total && sum_alg3 > 0) {
        std::cout << "BUG CONFIRMED: ALG2 returns all zeros, ALG3 returns correct results!";
    } else if (num_differences > 0) {
        std::cout << "INCONSISTENCY: ALG2 and ALG3 produce different results!";
    } else {
        std::cout << "PASS: Both algorithms agree.";
    }
    std::cout << " ***" << std::endl;

    // Cleanup
    CHECK_CUSPARSE(cusparseDestroySpMat(matA));
    CHECK_CUSPARSE(cusparseDestroyDnMat(matB));
    CHECK_CUSPARSE(cusparseDestroyDnMat(matC_alg2));
    CHECK_CUSPARSE(cusparseDestroyDnMat(matC_alg3));
    CHECK_CUSPARSE(cusparseDestroy(handle));
    CHECK_CUDA(cudaFree(d_A_csrOffsets));
    CHECK_CUDA(cudaFree(d_A_columns));
    CHECK_CUDA(cudaFree(d_A_values));
    CHECK_CUDA(cudaFree(d_B));
    CHECK_CUDA(cudaFree(d_C_alg2));
    CHECK_CUDA(cudaFree(d_C_alg3));
    if (dBuffer_alg2) CHECK_CUDA(cudaFree(dBuffer_alg2));
    if (dBuffer_alg3) CHECK_CUDA(cudaFree(dBuffer_alg3));

    return EXIT_SUCCESS;
}

On my system (H100, NVIDIA Cuda Toolkit 12.5) this outputs

Testing CUSPARSE_SPMM_CSR_ALG2 vs CUSPARSE_SPMM_CSR_ALG3
Sparse matrix A: 13 x 9
Dense matrix B: 9 x 10000000
Generating random dense matrix B...

Running CUSPARSE_SPMM_CSR_ALG2...
Running CUSPARSE_SPMM_CSR_ALG3...

Comparing results...

Results:
  ALG2 - Sum: 0, Zeros: 130000000 (100%)
  ALG3 - Sum: 2.24982e+08, Zeros: 15025750 (11.5583%)
  Differences: 114974250 (88.4417%)

Sample (first 10 elements):
  ALG2: 0 0 0 0 0 0 0 0 0 0 
  ALG3: 2 2 1 2 2 2 0 3 2 2 

*** BUG CONFIRMED: ALG2 returns all zeros, ALG3 returns correct results! ***

Thanks in advance and happy to provide additional info if nessecary!

I suggest that you file a bug. The bug report could be fairly simple, linking back to this post if you wish.

I see the same behavior on CUDA 13, L4 GPU. I note that if I run your test case with compute-sanitizer I can get additional information about what may be going wrong.

Hi @gripper1908 . Thanks for reporting the issue. We already have a fix for it and it will be published in CUDA CTK 13.1.

Hi @Robert_Crovella and @qanhpham,

Thanks for the quick reply and support! Looking forward for the fix.

Cheers

@gripper1908 In the meantime, there’s a workaround for this if you want to use ALG2: you can break the dense matrix B into smaller ones and run batched SpMM on them.