Greetings,
I have been evaluating cuSPARSE's Blocked-ELL format for accelerating block-sparse matrix multiplication. I modified the example code to compute the matrix multiplication C = alpha * A * B + beta * C with alpha = 1 and beta = 0 (i.e. C = A * B), and, comparing the results with cuBLAS, I observe a significant divergence between the two libraries' results:
i j result_cusparse result_cublas
0 0 1338.007812 1338.000000
0 1 6338.515625 6338.000000
1 0 1338.007812 1338.000000
1 1 6338.515625 6338.000000
2 0 1338.007812 1338.000000
2 1 6338.515625 6338.000000
3 0 2530.156250 2530.000000
3 1 15337.187500 15338.200195
4 0 2530.156250 2530.000000
4 1 15337.187500 15338.200195
5 0 2530.156250 2530.000000
5 1 15337.187500 15338.200195
Is this expected, or is there an error in my code?
I am appending my code below:
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cublas_v2.h>
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMM
#include <cstdio> // printf
#include <cstdlib> // EXIT_FAILURE
// CHECK_CUDA: evaluate a CUDA runtime API call; on failure, print the source
// line and the runtime's error string, then return EXIT_FAILURE from the
// enclosing function (so it is only usable inside functions returning int).
// NOTE(review): the macro expands to a bare compound statement (not
// do { } while (0)); call sites in this file deliberately omit the trailing
// semicolon — keep usage consistent with that.
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
std::printf("CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
// CHECK_CUSPARSE: evaluate a cuSPARSE API call; on failure, print the source
// line and cusparseGetErrorString's description, then return EXIT_FAILURE
// from the enclosing function. Same expansion style as CHECK_CUDA: a bare
// compound statement, invoked without a trailing semicolon.
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
std::printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \
__LINE__, cusparseGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
// CHECK_CUBLAS: evaluate a cuBLAS API call; on failure, print the numeric
// status code with file/line and return EXIT_FAILURE from the enclosing
// function. (Unlike the two macros above, it prints only the numeric code —
// cuBLAS's string lookup, cublasGetStatusString, is not used here.)
// Same expansion style: a bare compound statement, no trailing semicolon.
#define CHECK_CUBLAS(err) \
{ \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
}
const int EXIT_UNSUPPORTED = 2;
int main() {
    //--------------------------------------------------------------------------
    // Host problem definition.
    // A is a 6x6 block-diagonal matrix made of two dense 3x3 blocks; B is a
    // 6x2 dense matrix (column-major). C = alpha*A*B + beta*C (6x2) is
    // computed once with cuSPARSE Blocked-ELL SpMM and once with a cuBLAS
    // dense SGEMM, and the two results are printed side by side.
    int A_num_rows      = 6;
    int A_num_cols      = 6;
    int A_ell_blocksize = 3;             // dimension of each square block
    int A_ell_cols      = 3;             // ELL width per block-row, in columns
    int lda             = A_num_rows;    // leading dimension of dense A
    int A_size          = lda * A_num_cols;
    int A_num_blocks    = A_ell_cols * A_num_rows /
                          (A_ell_blocksize * A_ell_blocksize);
    int B_num_rows      = 6;
    int B_num_cols      = 2;
    int ldb             = B_num_rows;
    int ldc             = B_num_rows;
    int B_size          = ldb * B_num_cols;
    int C_size          = ldc * B_num_cols;
    // A in dense format (column-major): rows 0-2 of columns 0-2 hold one
    // block, rows 3-5 of columns 3-5 hold the other; everything else is zero.
    float hA_dense[] = { 90.0f, 90.0f, 90.0f, 0.0f, 0.0f, 0.0f,
                         100.0f, 100.0f, 100.0f, 0.0f, 0.0f, 0.0f,
                         110.0f, 110.0f, 110.0f, 0.0f, 0.0f, 0.0f,
                         0.0f, 0.0f, 0.0f, 90.0f, 90.0f, 90.0f,
                         0.0f, 0.0f, 0.0f, 100.0f, 100.0f, 100.0f,
                         0.0f, 0.0f, 0.0f, 110.0f, 110.0f, 110.0f };
    // A in blocked-ELL format: one non-zero block per block-row.
    // Block-row 0 -> block-column 0, block-row 1 -> block-column 1.
    int hA_columns[] = { 0, 1 };
    // Block values; each line below is one row of a block: (90, 100, 110).
    float hA_values[] = { 90.0f, 100.0f, 110.0f,
                          90.0f, 100.0f, 110.0f,
                          90.0f, 100.0f, 110.0f,
                          90.0f, 100.0f, 110.0f,
                          90.0f, 100.0f, 110.0f,
                          90.0f, 100.0f, 110.0f };
    // B in dense format (column-major), 6x2.
    // Fix: 8.9 was a double literal (missing 'f' suffix).
    float hB[] = { 2.5f, 4.2f, 6.3f, 7.1f, 8.9f, 9.1f,          // column 0
                   10.1f, 20.3f, 30.9f, 40.3f, 50.1f, 60.92f }; // column 1
    // Output buffers (6x2 = 12 floats, column-major), zero-initialized.
    float hC[12]        = {};
    float hC_cublas[12] = {};
    float alpha = 1.0f;
    float beta  = 0.0f;
    //--------------------------------------------------------------------------
    // Blocked-ELL SpMM requires compute capability 7.0 or higher.
    cudaDeviceProp props;
    CHECK_CUDA( cudaGetDeviceProperties(&props, 0) )
    if (props.major < 7) {
        std::printf("cusparseSpMM with blocked ELL format is supported only "
                    "with compute capability at least 7.0\n");
        return EXIT_UNSUPPORTED;
    }
    //--------------------------------------------------------------------------
    // Device memory management
    int   *dA_columns;
    float *dA_dense, *dA_values, *dB, *dC, *dC_cublas;
    CHECK_CUDA( cudaMalloc((void**) &dA_dense, A_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_num_blocks * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,
                           A_ell_cols * A_num_rows * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dB, B_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC, C_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC_cublas, C_size * sizeof(float)) )
    CHECK_CUDA( cudaMemcpy(dA_dense, hA_dense, A_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_num_blocks * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_values,
                           A_ell_cols * A_num_rows * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC_cublas, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    // cuBLAS reference result: dense SGEMM, C_cublas = alpha*A*B + beta*C.
    cublasHandle_t handle_cublas = NULL;
    CHECK_CUBLAS( cublasCreate(&handle_cublas) )
    CHECK_CUBLAS( cublasSgemm(handle_cublas, CUBLAS_OP_N, CUBLAS_OP_N,
                              A_num_rows, B_num_cols, A_num_cols,
                              &alpha, dA_dense, lda, dB, ldb,
                              &beta, dC_cublas, ldc) )
    //--------------------------------------------------------------------------
    // cuSPARSE SpMM with A in Blocked-ELL format.
    cusparseHandle_t     handle_cusparse = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    void*  dBuffer    = NULL;
    size_t bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle_cusparse) )
    // Create sparse matrix A in blocked ELL format
    CHECK_CUSPARSE( cusparseCreateBlockedEll(&matA,
                                             A_num_rows, A_num_cols,
                                             A_ell_blocksize, A_ell_cols,
                                             dA_columns, dA_values,
                                             CUSPARSE_INDEX_32I,
                                             CUSPARSE_INDEX_BASE_ZERO,
                                             CUDA_R_32F) )
    // Create dense matrices B and C
    CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    // Allocate the external workspace buffer, if the algorithm needs one
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                        handle_cusparse,
                        CUSPARSE_OPERATION_NON_TRANSPOSE,
                        CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                        CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
    // Execute SpMM.
    // NOTE(review): on SM 7.0+ the Blocked-ELL SpMM path may execute on
    // Tensor Cores with reduced-precision arithmetic (e.g. TF32 on SM 8.0+),
    // so small relative differences (~1e-3) versus the cuBLAS FP32 result
    // are plausible rather than a correctness bug — confirm against the
    // cuSPARSE documentation for your toolkit version.
    CHECK_CUSPARSE( cusparseSpMM(handle_cusparse,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
    // Destroy matrix descriptors and library handles
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
    CHECK_CUSPARSE( cusparseDestroy(handle_cusparse) )
    CHECK_CUBLAS( cublasDestroy(handle_cublas) )
    //--------------------------------------------------------------------------
    // Copy both results back (blocking cudaMemcpy on the default stream
    // synchronizes with the preceding work) and print them side by side.
    CHECK_CUDA( cudaMemcpy(hC, dC, C_size * sizeof(float),
                           cudaMemcpyDeviceToHost) )
    CHECK_CUDA( cudaMemcpy(hC_cublas, dC_cublas, C_size * sizeof(float),
                           cudaMemcpyDeviceToHost) )
    std::printf("i\tj\tresult_cusparse\tresult_cublas\n");
    for (int i = 0; i < A_num_rows; i++) {
        for (int j = 0; j < B_num_cols; j++) {
            // Column-major indexing; the redundant static_cast<float> on
            // already-float values was removed.
            std::printf("%d\t%d\t%f\t%f\n", i, j,
                        hC[i + j * ldc], hC_cublas[i + j * ldc]);
        }
    }
    //--------------------------------------------------------------------------
    // Device memory deallocation
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(dA_dense) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    CHECK_CUDA( cudaFree(dC_cublas) )
    return EXIT_SUCCESS;
}
Thank you very much in advance!
