I am working on a modified version of the cuSparse CSR sparse-dense matmul example in here.
The problem is, my code sometimes works and sometimes fails with CUDA API failed at line 234 with error: an illegal memory access was encountered (700) error for sparse-dense (768, 3072) x (3072, 1024) matmul test. You can see from my output terminal that sometimes the test passes with elapsed time 0.22 msec and in other times the test fails (outputs 0.0001msec, and I do not know where the “1” is coming from).
I am using CUDA 11.8 and A100 80GB PCIe GPU. Does anyone know why this is happening?
My code is as below:
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMM
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
#include <iostream>
// #include <cstdlib>
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
printf("CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \
__LINE__, cusparseGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
// Function to initialize a matrix with random data
void initMatrix(float* mat, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
mat[i * numCols + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}
}
}
void printMatrix(float* mat, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
std::cout << mat[i * numCols + j] << std::endl;
}
}
}
void initMatrix(float **mat, int numRows, int numCols) {
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numCols; j++) {
mat[i][j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}
}
}
// AB (A sparse weight, B activation)
// A (M x K)
// B (K x N)
float run_spmm(
const float sparsity,
const int A_num_rows,
const int A_num_cols,
const int B_num_cols,
int num_tests=10000
) {
const int A_nnz = (1-sparsity) * A_num_rows * A_num_cols;
const int B_num_rows = A_num_cols;
int ldb = B_num_rows;
int ldc = A_num_rows;
int B_size = ldb * B_num_cols;
int C_size = ldc * B_num_cols;
float hA_values[A_nnz];
int hA_csrOffsets[A_num_rows + 1];
int hA_columns[A_nnz];
float *hB = new float[B_size];
float *hC = new float[C_size];
// Initialize sparse A
{
// Initialize values
initMatrix(hA_values, A_nnz, 1);
// Initialize the column indices with random indices (for demonstration)
for (int i = 0; i < A_nnz; ++i) {
hA_columns[i] = rand() % A_num_cols;
}
// Initialize the row pointers in a way that the non-zeros are roughly evenly distributed
// (This is just for demonstration purposes)
hA_csrOffsets[0] = 0;
for (int i = 1; i <= A_num_rows; ++i) {
hA_csrOffsets[i] = hA_csrOffsets[i - 1] + (A_nnz / A_num_rows);
}
}
// Initialize dense B & C
{
initMatrix(hB, B_size, 1);
initMatrix(hC, C_size, 1);
// printMatrix(hB, B_size, 1);
// std::cout << "print finished for Mat B" << std::endl;
// printMatrix(hC, C_size, 1);
// std::cout << "print finished for Mat C" << std::endl;
}
float alpha = 1.0f;
float beta = 0.0f;
//--------------------------------------------------------------------------
// Device memory management
int *dA_csrOffsets, *dA_columns;
float *dA_values;
CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
(A_num_rows + 1) * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void**) &dA_values, A_nnz * sizeof(float)) )
float *dB, *dC;
CHECK_CUDA( cudaMalloc((void**) &dB, B_size * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dC, C_size * sizeof(float)) )
CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
(A_num_rows + 1) * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),
cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),
cudaMemcpyHostToDevice) )
//--------------------------------------------------------------------------
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
void* dBuffer = NULL;
size_t bufferSize = 0;
CHECK_CUSPARSE( cusparseCreate(&handle) )
// Create sparse matrix A in CSR format
CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
dA_csrOffsets, dA_columns, dA_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
// Create dense matrix B
CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
CUDA_R_32F, CUSPARSE_ORDER_COL) )
// Create dense matrix C
CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
CUDA_R_32F, CUSPARSE_ORDER_COL) )
// allocate an external buffer if needed
CHECK_CUSPARSE( cusparseSpMM_bufferSize(
handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, matB, &beta, matC, CUDA_R_32F,
CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
float milliseconds = 0;
cudaEvent_t start, stop;
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
////////////////////////////////////////////////////////////////////////////
// WARMUP
////////////////////////////////////////////////////////////////////////////
for (int i = 0; i < 100; i++){
CHECK_CUSPARSE( cusparseSpMM(handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, matB, &beta, matC, CUDA_R_32F,
CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
}
cudaDeviceSynchronize();
////////////////////////////////////////////////////////////////////////////
// RUN TEST
////////////////////////////////////////////////////////////////////////////
// Record the start event
CHECK_CUDA(cudaEventRecord(start));
// execute SpMM
for (int i = 0; i < num_tests; i++){
CHECK_CUSPARSE( cusparseSpMM(handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, matB, &beta, matC, CUDA_R_32F,
CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
}
// Record the end event
CHECK_CUDA(cudaEventRecord(stop));
// Wait for the stop event to complete
CHECK_CUDA(cudaEventSynchronize(stop));
// Calculate the elapsed time between the start and stop events
CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
/////////////////////////////////////////////////////
// CLEANUP
/////////////////////////////////////////////////////
delete hB;
delete hC;
hB=hC=NULL;
// destroy matrix/vector descriptors
CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
CHECK_CUSPARSE( cusparseDestroy(handle) )
// //--------------------------------------------------------------------------
// // device memory deallocation
// CHECK_CUDA( cudaFree(dBuffer) )
CHECK_CUDA( cudaFree(dA_csrOffsets) )
CHECK_CUDA( cudaFree(dA_columns) )
CHECK_CUDA( cudaFree(dA_values) )
CHECK_CUDA( cudaFree(dB) )
CHECK_CUDA( cudaFree(dC) )
return milliseconds;
}
int main(void) {
int M = 768; // dff for FFN1
int N = 2048; // num_tokens
int K = M * 4; // dmodel for FFN1
float sparsity = 0.98;
int num_tests = 10000;
float milliseconds = 0.0;
for (int n = 1024 ; n < N ; n *= 2){
std::cout << "Running for tokens : " << n << std::endl;
milliseconds = run_spmm(sparsity, K, M, n, num_tests) / num_tests;
std::cout << "Tokens " << n << " SpMM: " << milliseconds << " milliseconds" << std::endl;
}
return EXIT_SUCCESS;
}
