[CuSparse] Inconsistent illegal memory access error

I am working on a modified version of the cuSparse CSR sparse-dense matmul example in here.

The problem is, my code sometimes works and sometimes fails with CUDA API failed at line 234 with error: an illegal memory access was encountered (700) error for sparse-dense (768, 3072) x (3072, 1024) matmul test. You can see from my output terminal that sometimes the test passes with elapsed time 0.22 msec and in other times the test fails (outputs 0.0001msec, and I do not know where the “1” is coming from).

I am using CUDA 11.8 and A100 80GB PCIe GPU. Does anyone know why this is happening?
My code is as below:

#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSpMM
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE
#include <iostream>
// #include <cstdlib>
  
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
  
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
  
// Function to initialize a matrix with random data
void initMatrix(float* mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            mat[i * numCols + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        }
    }
}
  
void printMatrix(float* mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            std::cout << mat[i * numCols + j] << std::endl;
        }
    }
}
  
void initMatrix(float **mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            mat[i][j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        }
    }
}
  
// AB (A sparse weight, B activation)
// A (M x K)
// B (K x N)
float run_spmm(
    const float sparsity,
    const int A_num_rows,
    const int A_num_cols,
    const int B_num_cols,
    int num_tests=10000
) {
  
    const int   A_nnz           = (1-sparsity) * A_num_rows * A_num_cols;
    const int   B_num_rows      = A_num_cols;
  
    int   ldb             = B_num_rows;
    int   ldc             = A_num_rows;
    int   B_size          = ldb * B_num_cols;
    int   C_size          = ldc * B_num_cols;
  
    float hA_values[A_nnz];
    int   hA_csrOffsets[A_num_rows + 1];
    int   hA_columns[A_nnz];
  
    float *hB = new float[B_size];
    float *hC = new float[C_size];
  
    // Initialize sparse A
    {
        // Initialize values
        initMatrix(hA_values, A_nnz, 1);
  
        // Initialize the column indices with random indices (for demonstration)
        for (int i = 0; i < A_nnz; ++i) {
            hA_columns[i] = rand() % A_num_cols;
        }
  
        // Initialize the row pointers in a way that the non-zeros are roughly evenly distributed
        // (This is just for demonstration purposes)
        hA_csrOffsets[0] = 0;
        for (int i = 1; i <= A_num_rows; ++i) {
            hA_csrOffsets[i] = hA_csrOffsets[i - 1] + (A_nnz / A_num_rows);
        }
    }
  
    // Initialize dense B & C
    {
        initMatrix(hB, B_size, 1);
        initMatrix(hC, C_size, 1);
  
        // printMatrix(hB, B_size, 1);
        // std::cout << "print finished for Mat B" << std::endl;
        // printMatrix(hC, C_size, 1);
        // std::cout << "print finished for Mat C" << std::endl;
    }
  
    float alpha           = 1.0f;
    float beta            = 0.0f;
    //--------------------------------------------------------------------------
    // Device memory management
    int   *dA_csrOffsets, *dA_columns;
    float *dA_values;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int))    )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,  A_nnz * sizeof(float))  )
  
    float *dB, *dC;
    CHECK_CUDA( cudaMalloc((void**) &dB,         B_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC,         C_size * sizeof(float)) )
  
    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    // CUSPARSE APIs
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    void*                dBuffer    = NULL;
    size_t               bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
    // Create sparse matrix A in CSR format
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
    // Create dense matrix B
    CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    // Create dense matrix C
    CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
  
    // allocate an external buffer if needed
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                                 handle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
  
    float milliseconds = 0;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    
    ////////////////////////////////////////////////////////////////////////////
    // WARMUP
    ////////////////////////////////////////////////////////////////////////////
    for (int i = 0; i < 100; i++){
        CHECK_CUSPARSE( cusparseSpMM(handle,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                    CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
    }
    cudaDeviceSynchronize();
    
  
    ////////////////////////////////////////////////////////////////////////////
    // RUN TEST
    ////////////////////////////////////////////////////////////////////////////
  
    // Record the start event
    CHECK_CUDA(cudaEventRecord(start));
  
    // execute SpMM
    for (int i = 0; i < num_tests; i++){
        CHECK_CUSPARSE( cusparseSpMM(handle,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                    CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
    }
  
    // Record the end event
    CHECK_CUDA(cudaEventRecord(stop));
  
    // Wait for the stop event to complete
    CHECK_CUDA(cudaEventSynchronize(stop));
  
    // Calculate the elapsed time between the start and stop events
    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
  
  
    /////////////////////////////////////////////////////
    // CLEANUP
    /////////////////////////////////////////////////////
  
    delete hB;
    delete hC;
    hB=hC=NULL;
  
    // destroy matrix/vector descriptors
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
    CHECK_CUSPARSE( cusparseDestroy(handle) )
  
    // //--------------------------------------------------------------------------
    // // device memory deallocation
    // CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
  
    return milliseconds;
}
  
  
int main(void) {
    int M = 768; // dff for FFN1
    int N = 2048; // num_tokens
    int K = M * 4; // dmodel for FFN1
    float sparsity = 0.98;
    int num_tests = 10000;
    float milliseconds = 0.0;
  
    for (int n = 1024 ; n < N ; n *= 2){
  
        std::cout << "Running for tokens : " << n << std::endl;
        milliseconds = run_spmm(sparsity, K, M, n, num_tests) / num_tests;
        std::cout << "Tokens " << n << " SpMM: " << milliseconds << " milliseconds" << std::endl;
    }
  
    return EXIT_SUCCESS;
}

Hi @taehyunzzz . I checked your code, there are a few places that look suspicious to me:

// Initialize values
initMatrix(hA_values, A_nnz, 1);

It seems you are using the initMatrix function for dense matrix on a sparse matrix.

hA_columns[i] = rand() % A_num_cols;

This can lead to duplicated columns in a single row. Our SpMM routine can work in that case but that may not be what you want. But the fact that you are getting errors randomly makes me think this could be the cause.

hA_csrOffsets[i] = hA_csrOffsets[i - 1] + (A_nnz / A_num_rows);

Because of the down-rounded division, this can lead to hA_csrOffsets not covering all the non-zeros, i.e.
hA_csrOffsets[A_num_rows] < A_nnz.

Can you fix the above potential issue and see whether you still get errors?

You are in fact correct. My fix is as below :
As always, coding is fun but difficult!

    // Initialize sparse A
    {
        // Initialize values
        initMatrix(hA_values, A_nnz, 1);

        // reset hA
        for (int i=0; i<A_size; i++){
            hA[i] = 0;
        }

        int nonzero_checked = 0;
        while (1){
            int row=rand() % A_num_rows;
            int col=rand() % A_num_cols;
            if (hA[row * A_num_cols + col] == 0){
                nonzero_checked++;
                hA[row * A_num_cols + col] = 1;
            }
            if (nonzero_checked == A_nnz){
                break;
            }
        }

        nonzero_checked = 0;
        for (int i=0; i<A_size; i++){
            if (hA[i] == 1){
                int row = i / A_num_cols;
                int col = i % A_num_cols;
                hA_columns[nonzero_checked++]=col;
                hA_csrOffsets[row]++;
            }
        }
    }

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.