Results divergence between cuBLAS Sgemm and cuSPARSE BLOCKED-ELL SpMM

Greetings,

I have been evaluating cuSPARSE’s BLOCKED-ELL format for accelerating block-sparse matrix multiplication. I modified the example code to compute the matrix product C = A · B, where A is a 6×6 block-sparse matrix (3×3 blocks) and B is a 6×2 dense matrix. Comparing the results with cuBLAS, I observe a significant divergence between the two libraries' results:

i       j       result_cusparse result_cublas
0       0       1338.007812     1338.000000
0       1       6338.515625     6338.000000
1       0       1338.007812     1338.000000
1       1       6338.515625     6338.000000
2       0       1338.007812     1338.000000
2       1       6338.515625     6338.000000
3       0       2530.156250     2530.000000
3       1       15337.187500    15338.200195
4       0       2530.156250     2530.000000
4       1       15337.187500    15338.200195
5       0       2530.156250     2530.000000
5       1       15337.187500    15338.200195

Is this expected, or is there an error in my code?

I am appending my code below:

/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cublas_v2.h>
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSpMM
#include <cstdio>            // printf
#include <cstdlib>           // EXIT_FAILURE

// Checks a CUDA runtime API call: on failure, prints the error string and
// line number, then makes the *enclosing* function return EXIT_FAILURE.
// NOTE: the macro deliberately expands to a bare block (not do/while(0))
// because every call site below invokes it without a trailing semicolon.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        std::printf("CUDA API failed at line %d with error: %s (%d)\n",        \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Checks a cuSPARSE API call; same early-return convention as CHECK_CUDA.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        std::printf("CUSPARSE API failed at line %d with error: %s (%d)\n",    \
               __LINE__, cusparseGetErrorString(status), status);              \
    }                                                                          \
}

// Checks a cuBLAS API call; cuBLAS has no error-string helper, so only the
// numeric status code is printed, together with file and line.
#define CHECK_CUBLAS(err)                                                      \
{                                                                              \
    cublasStatus_t err_ = (err);                                               \
    if (err_ != CUBLAS_STATUS_SUCCESS) {                                       \
        std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);   \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Distinct exit code for "GPU too old for Blocked-ELL SpMM" (needs SM70+),
// so test harnesses can tell "unsupported" apart from "failed".
const int EXIT_UNSUPPORTED = 2;

int main() {
    //--------------------------------------------------------------------------
    // Host problem definition: compute C = alpha * A * B + beta * C, where
    //   A is a 6x6 block-sparse matrix (two dense 3x3 blocks on the diagonal),
    //   B is a 6x2 dense matrix, C is a 6x2 dense matrix.
    // All dense data is column-major (the layout both cuBLAS and the dense
    // descriptors below expect).
    int   A_num_rows      = 6;
    int   A_num_cols      = 6;
    int   A_ell_blocksize = 3;   // blocks are A_ell_blocksize x A_ell_blocksize
    int   A_ell_cols      = 3;   // max number of non-zero columns per block-row
    int   lda             = A_num_rows;
    int   A_size          = lda * A_num_cols;
    int   A_num_blocks    = A_ell_cols * A_num_rows /
                           (A_ell_blocksize * A_ell_blocksize);
    int   B_num_rows      = 6;
    int   B_num_cols      = 2;
    int   ldb             = B_num_rows;
    // C is A_num_rows x B_num_cols, so its leading dimension is A_num_rows.
    // (Previously derived from B_num_rows, which only worked because A is
    // square here.)
    int   ldc             = A_num_rows;
    int   B_size          = ldb * B_num_cols;
    int   C_size          = ldc * B_num_cols;

    // A in dense format (column-major): block (0,0) and block (1,1) are dense,
    // the off-diagonal 3x3 blocks are zero.
    float hA_dense[]     = { 90.0f, 90.0f, 90.0f, 0.0f, 0.0f, 0.0f,
                             100.0f, 100.0f, 100.0f, 0.0f, 0.0f, 0.0f,
                             110.0f, 110.0f, 110.0f, 0.0f, 0.0f, 0.0f,
                             0.0f, 0.0f, 0.0f, 90.0f, 90.0f, 90.0f,
                             0.0f, 0.0f, 0.0f, 100.0f, 100.0f, 100.0f,
                             0.0f, 0.0f, 0.0f, 110.0f, 110.0f, 110.0f };
    // A in Blocked-ELL format: one block per block-row; block-row 0 holds
    // block-column 0, block-row 1 holds block-column 1. Every row of each
    // stored block is (90, 100, 110).
    int   hA_columns[]    = { 0, 1 };
    float hA_values[]    = { 90.0f, 100.0f, 110.0f,
                             90.0f, 100.0f, 110.0f,
                             90.0f, 100.0f, 110.0f,
                             90.0f, 100.0f, 110.0f,
                             90.0f, 100.0f, 110.0f,
                             90.0f, 100.0f, 110.0f };
    // B in dense format (column-major). Fixed: 8.9 was a double literal in a
    // float initializer; use the float suffix consistently.
    float hB[]           = { 2.5f,
                             4.2f,
                             6.3f,
                             7.1f,
                             8.9f,
                             9.1f,
                             10.1f,
                             20.3f,
                             30.9f,
                             40.3f,
                             50.1f,
                             60.92f };
    // C buffers (6x2 = 12 entries each), zero-initialized; with beta == 0 the
    // initial contents do not influence the result.
    float hC[]           = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float hC_cublas[]    = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                             0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float alpha           = 1.0f;
    float beta            = 0.0f;

    //--------------------------------------------------------------------------
    // Check compute capability: Blocked-ELL SpMM requires SM70+.
    cudaDeviceProp props;
    CHECK_CUDA( cudaGetDeviceProperties(&props, 0) )
    if (props.major < 7) {
      std::printf("cusparseSpMM with blocked ELL format is supported only "
                  "with compute capability at least 7.0\n");
      return EXIT_UNSUPPORTED;
    }
    //--------------------------------------------------------------------------
    // Device memory management
    int    *dA_columns;
    float *dA_dense, *dA_values, *dB, *dC, *dC_cublas;
    CHECK_CUDA( cudaMalloc((void**) &dA_dense, A_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_num_blocks * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,
                                    A_ell_cols * A_num_rows * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dB, B_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC, C_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC_cublas, C_size * sizeof(float)) )

    CHECK_CUDA( cudaMemcpy(dA_dense, hA_dense,
                           A_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns,
                           A_num_blocks * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
                           A_ell_cols * A_num_rows * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC_cublas, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    // Library handles and descriptors
    cusparseHandle_t     handle_cusparse = NULL;
    cublasHandle_t       handle_cublas = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    void*                dBuffer    = NULL;
    size_t               bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle_cusparse) )

    // Dense reference: C_cublas = alpha * A * B + beta * C_cublas via cuBLAS.
    // cuBLAS is column-major, so no transposes are needed.
    CHECK_CUBLAS( cublasCreate(&handle_cublas) )
    CHECK_CUBLAS( cublasSgemm(
        handle_cublas,
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        A_num_rows,     // m
        B_num_cols,     // n
        A_num_cols,     // k
        &alpha,
        dA_dense,
        lda,
        dB,
        ldb,
        &beta,
        dC_cublas,
        ldc
    ) )

    // Create sparse matrix A in Blocked-ELL format
    CHECK_CUSPARSE( cusparseCreateBlockedEll(
                                      &matA,
                                      A_num_rows, A_num_cols, A_ell_blocksize,
                                      A_ell_cols, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
    // Create dense matrix B
    CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    // Create dense matrix C
    CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    // Query and allocate the external workspace SpMM may need
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                                 handle_cusparse,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )

    // Execute SpMM: C = alpha * A * B + beta * C
    CHECK_CUSPARSE( cusparseSpMM(handle_cusparse,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )

    // Destroy matrix descriptors and library handles
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
    CHECK_CUSPARSE( cusparseDestroy(handle_cusparse) )
    CHECK_CUBLAS( cublasDestroy(handle_cublas) )
    //--------------------------------------------------------------------------
    // Compare cuSPARSE and cuBLAS results. The blocking cudaMemcpy also
    // synchronizes the default stream, so both GEMMs have completed here.
    CHECK_CUDA( cudaMemcpy(hC, dC, C_size * sizeof(float),
                           cudaMemcpyDeviceToHost) )
    CHECK_CUDA( cudaMemcpy(hC_cublas, dC_cublas, C_size * sizeof(float),
                           cudaMemcpyDeviceToHost) )
    // NOTE(review): small element-wise differences between the two libraries
    // are expected — they accumulate in different orders, and Blocked-ELL SpMM
    // may use reduced-precision Tensor Core paths on newer GPUs (confirm
    // against the cuSPARSE release notes for your toolkit version).
    std::printf("i\tj\tresult_cusparse\tresult_cublas\n");
    for (int i = 0; i < A_num_rows; i++) {
        for (int j = 0; j < B_num_cols; j++) {
            // Both arrays are column-major, hence the i + j * ldc indexing.
            float c_cusparse = hC[i + j * ldc];
            float c_cublas   = hC_cublas[i + j * ldc];
            std::printf("%d\t%d\t%f\t%f\n", i, j, c_cusparse, c_cublas);
        }
    }
    // Device memory deallocation
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(dA_dense) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    CHECK_CUDA( cudaFree(dC_cublas) )
    return EXIT_SUCCESS;
}

Thank you very much in advance!

Hi @user27604165. The two libraries compute the GEMM differently (different accumulation order and, on newer GPUs, possibly different internal precision), so some divergence in the results is expected. Could you try larger block sizes for cuSPARSE Blocked-ELL SpMM? The results tend to be closer for large blocks.