Hi,
I followed the blog and the GitHub example, trying to reproduce the experiment described in the blog.
However, I could not get the speedup stated there: in my run the dense GEMM kernel takes ~1.13 ms vs. ~1.23 ms for the sparse GEMM. I am wondering whether my configuration is wrong, so that the throughput gain does not show up. I attach my code and results below; the code is modified from the GitHub example. Thanks.
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparseLt.h>       // cusparseLt header
#include <cublas_v2.h>        // cuBLAS header
#include <cstdio>             // printf
#include <cstdlib>            // malloc, free, EXIT_*
#include <cstring>            // memset
#include <random>             // std::mt19937, std::normal_distribution
// Define some error checking macros.
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
    if (stat != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
    }
}

#define cublasErrCheck(stat) { cublasErrCheck_((stat), __FILE__, __LINE__); }
void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS Error: %d %s %d\n", stat, file, line);
    }
}
#define CHECK_CUDA(func)                                                   \
{                                                                          \
    cudaError_t status = (func);                                           \
    if (status != cudaSuccess) {                                           \
        printf("CUDA API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cudaGetErrorString(status), status);              \
        return EXIT_FAILURE;                                               \
    }                                                                      \
}

#define CHECK_CUSPARSE(func)                                               \
{                                                                          \
    cusparseStatus_t status = (func);                                      \
    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",     \
               __LINE__, cusparseGetErrorString(status), status);          \
        return EXIT_FAILURE;                                               \
    }                                                                      \
}
constexpr int EXIT_UNSUPPORTED = 2;
// Host problem definition, row-major order
constexpr int m = 3072;
constexpr int n = 3072;
constexpr int k = 10240;
auto order = CUSPARSE_ORDER_ROW;
auto opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto type = CUDA_R_16F;
auto compute_type_cublas_tc = CUBLAS_COMPUTE_16F; // cuBLAS compute type (tensor cores)
auto compute_type_sparse = CUSPARSE_COMPUTE_16F;
bool is_rowmajor = (order == CUSPARSE_ORDER_ROW);
bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE);
bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE);
auto num_A_rows = (isA_transposed) ? k : m;
auto num_A_cols = (isA_transposed) ? m : k;
auto num_B_rows = (isB_transposed) ? n : k;
auto num_B_cols = (isB_transposed) ? k : n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = (is_rowmajor) ? num_A_cols : num_A_rows;
auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows;
auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows;
auto A_height = (is_rowmajor) ? num_A_rows : num_A_cols;
auto B_height = (is_rowmajor) ? num_B_rows : num_B_cols;
auto C_height = (is_rowmajor) ? num_C_rows : num_C_cols;
auto A_size = A_height * lda * sizeof(__half);
auto B_size = B_height * ldb * sizeof(__half);
auto C_size = C_height * ldc * sizeof(__half);
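// For these sizes: A_size = 3072 * 10240 * 2 B = 60 MiB,
// B_size = 10240 * 3072 * 2 B = 60 MiB, C_size = 3072 * 3072 * 2 B = 18 MiB.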
int sparse_matmul(
        __half *hA, __half *hB, __half *hC, float alpha, float beta) {
    //--------------------------------------------------------------------------
    // Device memory management
    __half *dA, *dB, *dC, *dD, *dA_compressed;
    int *d_valid;
    CHECK_CUDA( cudaMalloc((void**) &dA, A_size) )
    CHECK_CUDA( cudaMalloc((void**) &dB, B_size) )
    CHECK_CUDA( cudaMalloc((void**) &dC, C_size) )
    CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) )
    dD = dC; // D reuses C's buffer (in-place result)
    CHECK_CUDA( cudaMemcpy(dA, hA, A_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size, cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    cusparseLtHandle_t handle;
    cusparseLtMatDescriptor_t matA, matB, matC;
    cusparseLtMatmulDescriptor_t matmul;
    cusparseLtMatmulAlgSelection_t alg_sel;
    cusparseLtMatmulPlan_t plan;
    cudaStream_t stream = nullptr;
    CHECK_CUSPARSE( cusparseLtInit(&handle) )
    // matrix descriptor initialization
    CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit(
                        &handle, &matA, num_A_rows, num_A_cols, lda, alignment,
                        type, order, CUSPARSELT_SPARSITY_50_PERCENT) )
    CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
                        &handle, &matB, num_B_rows, num_B_cols, ldb, alignment,
                        type, order) )
    CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
                        &handle, &matC, num_C_rows, num_C_cols, ldc, alignment,
                        type, order) )
    // matmul, algorithm selection, and plan initialization
    CHECK_CUSPARSE( cusparseLtMatmulDescriptorInit(
                        &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC,
                        compute_type_sparse) )
    CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionInit(
                        &handle, &alg_sel, &matmul,
                        CUSPARSELT_MATMUL_ALG_DEFAULT) )
    int alg = 0; // algorithm id; no kernel search is performed below
    CHECK_CUSPARSE( cusparseLtMatmulAlgSetAttribute(
                        &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID,
                        &alg, sizeof(alg)) )
    size_t workspace_size = 0; // initialized: it is read by PlanInit before GetWorkspace sets it
    CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel,
                                             workspace_size) )
    //--------------------------------------------------------------------------
    // Prune the A matrix (in-place) and check the correctness
    CHECK_CUSPARSE( cusparseLtSpMMAPrune(&handle, &matmul, dA, dA,
                                         CUSPARSELT_PRUNE_SPMMA_TILE, stream) )
    CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck(&handle, &matmul, dA,
                                              d_valid, stream) )
    int is_valid;
    CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int),
                                cudaMemcpyDeviceToHost, stream) )
    CHECK_CUDA( cudaStreamSynchronize(stream) )
    if (is_valid != 0) {
        std::printf("!!!! The matrix has been pruned in a wrong way. "
                    "cusparseLtMatmul will not provide correct results\n");
        return EXIT_FAILURE;
    }
    //--------------------------------------------------------------------------
    // Compress the A matrix
    size_t compressed_size;
    CHECK_CUSPARSE( cusparseLtSpMMACompressedSize(&handle, &plan,
                                                  &compressed_size) )
    CHECK_CUDA( cudaMalloc((void**) &dA_compressed, compressed_size) )
    CHECK_CUSPARSE( cusparseLtSpMMACompress(&handle, &plan, dA,
                                            dA_compressed, stream) )
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // Re-initialize the plan with the selected algorithm and allocate workspace.
    // Note: despite the section marker kept from the original example, no kernel
    // search happens here; the CONFIG_ID 0 set above is used as-is.
    void* d_workspace = nullptr;
    int num_streams = 0;
    cudaStream_t* streams = nullptr;
    CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel,
                                             workspace_size) )
    CHECK_CUSPARSE( cusparseLtMatmulGetWorkspace(&handle, &plan,
                                                 &workspace_size) )
    CHECK_CUDA( cudaMalloc((void**) &d_workspace, workspace_size) )
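    // Optional: instead of hard-coding CONFIG_ID 0 above, cuSPARSELt (>= 0.2)
    // can search for the fastest kernel for this problem; a minimal sketch,
    // using the buffers already initialized above (left commented out so it
    // does not affect the measured run):
    // CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, &plan, &alpha,
    //                                        dA_compressed, dB, &beta,
    //                                        dC, dD, d_workspace,
    //                                        streams, num_streams) )
    // The algorithm it finds is stored in the plan and then used by
    // cusparseLtMatmul automatically.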
    // Perform the matrix multiplication
    CHECK_CUSPARSE( cusparseLtMatmul(&handle, &plan, &alpha, dA_compressed, dB,
                                     &beta, dC, dD, d_workspace, streams,
                                     num_streams) )
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // destroy plan and handle
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matA) )
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matB) )
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matC) )
    CHECK_CUSPARSE( cusparseLtMatmulPlanDestroy(&plan) )
    CHECK_CUSPARSE( cusparseLtDestroy(&handle) )
    //--------------------------------------------------------------------------
    // copy results back to host (matrix A has been pruned in place)
    CHECK_CUDA( cudaMemcpy(hA, dA, A_size, cudaMemcpyDeviceToHost) )
    CHECK_CUDA( cudaMemcpy(hC, dC, C_size, cudaMemcpyDeviceToHost) )
    //--------------------------------------------------------------------------
    // device memory deallocation
    CHECK_CUDA( cudaFree(dA_compressed) )
    CHECK_CUDA( cudaFree(dA) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    CHECK_CUDA( cudaFree(d_valid) )
    CHECK_CUDA( cudaFree(d_workspace) )
    return 0;
}
int dense_tensor_core_matmul(
        __half *hA, __half *hB, __half *hC, __half alpha, __half beta) {
    //--------------------------------------------------------------------------
    // Device memory management
    __half *dA, *dB, *dC;
    CHECK_CUDA( cudaMalloc((void**) &dA, A_size) )
    CHECK_CUDA( cudaMalloc((void**) &dB, B_size) )
    CHECK_CUDA( cudaMalloc((void**) &dC, C_size) )
    CHECK_CUDA( cudaMemcpy(dA, hA, A_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size, cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    cublasHandle_t cublasHandle;
    cublasErrCheck( cublasCreate(&cublasHandle) );
    // Use tensor cores (deprecated since CUDA 11, where this is the default)
    cublasErrCheck( cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH) );
    // Perform the matrix multiplication with cuBLAS.
    // Note: cuBLAS assumes column-major storage, while the host buffers were
    // filled row-major; the FLOP count (and thus the timing) is the same, but
    // the numerical result will not match the cuSPARSELt path.
    cublasErrCheck( cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 m, n, k,
                                 &alpha,
                                 dA, type, m,
                                 dB, type, k,
                                 &beta,
                                 dC, type, m,
                                 compute_type_cublas_tc,
                                 CUBLAS_GEMM_DFALT_TENSOR_OP) );
    cublasErrCheck( cublasDestroy(cublasHandle) );
    CHECK_CUDA( cudaFree(dA) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    return 0;
}
int main(void) {
    int major_cc, minor_cc;
    CHECK_CUDA( cudaDeviceGetAttribute(&major_cc,
                                       cudaDevAttrComputeCapabilityMajor, 0) )
    CHECK_CUDA( cudaDeviceGetAttribute(&minor_cc,
                                       cudaDevAttrComputeCapabilityMinor, 0) )
    if (!(major_cc == 8 && minor_cc == 0) &&
        !(major_cc == 8 && minor_cc == 6)) {
        std::printf("\ncusparseLt is supported only on GPU devices with"
                    " compute capability == 8.0 or 8.6; current: %d.%d\n\n",
                    major_cc, minor_cc);
        return EXIT_UNSUPPORTED;
    }
    printf("Matmul size: %dx%dx%d\n", m, k, n);
    std::random_device rd;   // used to seed the random number engine
    std::mt19937 gen(rd());  // standard mersenne_twister_engine seeded with rd()
    std::normal_distribution<> normal{0, 1}; // normal distribution, mean 0, std 1
    // allocate memory in host DRAM
    __half *hA, *hB, *hC;
    hA = (__half*) malloc(sizeof(__half) * m * k);
    hB = (__half*) malloc(sizeof(__half) * k * n);
    hC = (__half*) malloc(sizeof(__half) * m * n); // result buffer
    memset(hC, 0, sizeof(__half) * m * n);
    for (int i = 0; i < m * k; i++)
        hA[i] = static_cast<__half>(normal(gen));
    for (int i = 0; i < k * n; i++)
        hB[i] = static_cast<__half>(normal(gen));
    __half alpha = 1.0f;
    __half beta  = 0.0f;
#ifdef SPARSE_MATMUL
    printf("\nRunning with cuSPARSELt...\n");
    sparse_matmul(hA, hB, hC,
                  static_cast<float>(alpha), static_cast<float>(beta));
#else
    printf("\nRunning with cuBLAS...\n");
    dense_tensor_core_matmul(hA, hB, hC, alpha, beta);
#endif
    free(hA);
    free(hB);
    free(hC);
    return EXIT_SUCCESS;
}
}
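By the way, the timings below come from nsys rather than from the program itself. If in-program timing is preferable, a CUDA-event harness roughly like the following could wrap the cusparseLtMatmul / cublasGemmEx calls and average over several launches (a sketch of my own, not from the blog or the GitHub example; time_with_events is a hypothetical helper):

// Times `iters` launches of the given callable with CUDA events and returns
// the mean time in ms. The first call serves as a warm-up.
template <typename F>
float time_with_events(F launch, int iters = 10) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    launch();                        // warm-up launch
    cudaEventRecord(start);
    for (int i = 0; i < iters; i++)  // average over several launches
        launch();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;               // mean time per launch in ms
}

Wrapping only the matmul calls (rather than the whole functions, which also include cudaMalloc and the host-to-device copies) keeps the comparison to the kernels themselves.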
The results from nsys nvprof:
### cublasGemmEx ###
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------------------- --- ---- ---------------------------------------------
...
11231207716 1133892 3495 12 24 1 256 1 1 162 0.049 0.098 NVIDIA A100-PCIE-40GB (0) 1 7 ampere_h16816gemm_256x128_ldg8_stages_64x3_nn
...
### cusparseLtMatmul ###
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------------------- --- ---- ----------------------------------------------------------------------------------------------------
...
10055888674 1226852 1285 24 48 1 128 1 1 168 0.000 0.070 NVIDIA A100-PCIE-40GB (0) 1 7 sm80_xmma_sparse_gemm_f16f16_f16f32_f32_tt_t_tilesize128x64x64_stage4_warpsize2x2x1_sptensor16x8x32…
...
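If I convert these two kernel durations into effective throughput, each GEMM is 2*m*n*k = 2 * 3072 * 3072 * 10240 ≈ 193.3 GFLOP, so the dense kernel reaches roughly 193.3 / 1.134e-3 ≈ 170 TFLOP/s and the sparse kernel roughly 158 TFLOP/s (counting dense FLOPs in both cases), against the A100's nominal FP16 tensor-core peaks of 312 TFLOP/s dense and 624 TFLOP/s with 2:4 sparsity, if I read the spec sheet right. So the sparse kernel is not just missing the expected speedup; it is slightly slower than the dense one.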