[CuSparse] Inconsistent illegal memory access error

taehyunzzz · January 22, 2024, 9:30am

I am working on a modified version of the cuSparse CSR sparse-dense matmul example in here.

The problem is, my code sometimes works and sometimes fails with CUDA API failed at line 234 with error: an illegal memory access was encountered (700) error for sparse-dense (768, 3072) x (3072, 1024) matmul test. You can see from my output terminal that sometimes the test passes with elapsed time 0.22 msec and in other times the test fails (outputs 0.0001msec, and I do not know where the “1” is coming from).

I am using CUDA 11.8 and A100 80GB PCIe GPU. Does anyone know why this is happening?
My code is as below:

#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSpMM
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE
#include <iostream>
// #include <cstdlib>
  
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
  
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}
  
// Function to initialize a matrix with random data
void initMatrix(float* mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            mat[i * numCols + j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        }
    }
}
  
void printMatrix(float* mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            std::cout << mat[i * numCols + j] << std::endl;
        }
    }
}
  
void initMatrix(float **mat, int numRows, int numCols) {
    for (int i = 0; i < numRows; i++) {
        for (int j = 0; j < numCols; j++) {
            mat[i][j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        }
    }
}
  
// AB (A sparse weight, B activation)
// A (M x K)
// B (K x N)
float run_spmm(
    const float sparsity,
    const int A_num_rows,
    const int A_num_cols,
    const int B_num_cols,
    int num_tests=10000
) {
  
    const int   A_nnz           = (1-sparsity) * A_num_rows * A_num_cols;
    const int   B_num_rows      = A_num_cols;
  
    int   ldb             = B_num_rows;
    int   ldc             = A_num_rows;
    int   B_size          = ldb * B_num_cols;
    int   C_size          = ldc * B_num_cols;
  
    float hA_values[A_nnz];
    int   hA_csrOffsets[A_num_rows + 1];
    int   hA_columns[A_nnz];
  
    float *hB = new float[B_size];
    float *hC = new float[C_size];
  
    // Initialize sparse A
    {
        // Initialize values
        initMatrix(hA_values, A_nnz, 1);
  
        // Initialize the column indices with random indices (for demonstration)
        for (int i = 0; i < A_nnz; ++i) {
            hA_columns[i] = rand() % A_num_cols;
        }
  
        // Initialize the row pointers in a way that the non-zeros are roughly evenly distributed
        // (This is just for demonstration purposes)
        hA_csrOffsets[0] = 0;
        for (int i = 1; i <= A_num_rows; ++i) {
            hA_csrOffsets[i] = hA_csrOffsets[i - 1] + (A_nnz / A_num_rows);
        }
    }
  
    // Initialize dense B & C
    {
        initMatrix(hB, B_size, 1);
        initMatrix(hC, C_size, 1);
  
        // printMatrix(hB, B_size, 1);
        // std::cout << "print finished for Mat B" << std::endl;
        // printMatrix(hC, C_size, 1);
        // std::cout << "print finished for Mat C" << std::endl;
    }
  
    float alpha           = 1.0f;
    float beta            = 0.0f;
    //--------------------------------------------------------------------------
    // Device memory management
    int   *dA_csrOffsets, *dA_columns;
    float *dA_values;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int))    )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,  A_nnz * sizeof(float))  )
  
    float *dB, *dC;
    CHECK_CUDA( cudaMalloc((void**) &dB,         B_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC,         C_size * sizeof(float)) )
  
    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
                           (A_num_rows + 1) * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),
                           cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    // CUSPARSE APIs
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    void*                dBuffer    = NULL;
    size_t               bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
    // Create sparse matrix A in CSR format
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F) )
    // Create dense matrix B
    CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
    // Create dense matrix C
    CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
                                        CUDA_R_32F, CUSPARSE_ORDER_COL) )
  
    // allocate an external buffer if needed
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
                                 handle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
  
    float milliseconds = 0;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    
    ////////////////////////////////////////////////////////////////////////////
    // WARMUP
    ////////////////////////////////////////////////////////////////////////////
    for (int i = 0; i < 100; i++){
        CHECK_CUSPARSE( cusparseSpMM(handle,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                    CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
    }
    cudaDeviceSynchronize();
    
  
    ////////////////////////////////////////////////////////////////////////////
    // RUN TEST
    ////////////////////////////////////////////////////////////////////////////
  
    // Record the start event
    CHECK_CUDA(cudaEventRecord(start));
  
    // execute SpMM
    for (int i = 0; i < num_tests; i++){
        CHECK_CUSPARSE( cusparseSpMM(handle,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    &alpha, matA, matB, &beta, matC, CUDA_R_32F,
                                    CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
    }
  
    // Record the end event
    CHECK_CUDA(cudaEventRecord(stop));
  
    // Wait for the stop event to complete
    CHECK_CUDA(cudaEventSynchronize(stop));
  
    // Calculate the elapsed time between the start and stop events
    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
  
  
    /////////////////////////////////////////////////////
    // CLEANUP
    /////////////////////////////////////////////////////
  
    delete hB;
    delete hC;
    hB=hC=NULL;
  
    // destroy matrix/vector descriptors
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
    CHECK_CUSPARSE( cusparseDestroy(handle) )
  
    // //--------------------------------------------------------------------------
    // // device memory deallocation
    // CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
  
    return milliseconds;
}
  
  
int main(void) {
    int M = 768; // dff for FFN1
    int N = 2048; // num_tokens
    int K = M * 4; // dmodel for FFN1
    float sparsity = 0.98;
    int num_tests = 10000;
    float milliseconds = 0.0;
  
    for (int n = 1024 ; n < N ; n *= 2){
  
        std::cout << "Running for tokens : " << n << std::endl;
        milliseconds = run_spmm(sparsity, K, M, n, num_tests) / num_tests;
        std::cout << "Tokens " << n << " SpMM: " << milliseconds << " milliseconds" << std::endl;
    }
  
    return EXIT_SUCCESS;
}

qanhpham · January 22, 2024, 6:42pm

Hi @taehyunzzz . I checked your code, there are a few places that look suspicious to me:

// Initialize values
initMatrix(hA_values, A_nnz, 1);

It seems you are using the initMatrix function for dense matrix on a sparse matrix.

hA_columns[i] = rand() % A_num_cols;

This can lead to duplicated columns in a single row. Our SpMM routine can work in that case but that may not be what you want. But the fact that you are getting errors randomly makes me think this could be the cause.

hA_csrOffsets[i] = hA_csrOffsets[i - 1] + (A_nnz / A_num_rows);

Because of the down-rounded division, this can lead to hA_csrOffsets not covering all the non-zeros, i.e.
hA_csrOffsets[A_num_rows] < A_nnz.

Can you fix the above potential issue and see whether you still get errors?

taehyunzzz · January 23, 2024, 7:35am

You are in fact correct. My fix is as below :
As always, coding is fun but difficult!

    // Initialize sparse A
    {
        // Initialize values
        initMatrix(hA_values, A_nnz, 1);

        // reset hA
        for (int i=0; i<A_size; i++){
            hA[i] = 0;
        }

        int nonzero_checked = 0;
        while (1){
            int row=rand() % A_num_rows;
            int col=rand() % A_num_cols;
            if (hA[row * A_num_cols + col] == 0){
                nonzero_checked++;
                hA[row * A_num_cols + col] = 1;
            }
            if (nonzero_checked == A_nnz){
                break;
            }
        }

        nonzero_checked = 0;
        for (int i=0; i<A_size; i++){
            if (hA[i] == 1){
                int row = i / A_num_cols;
                int col = i % A_num_cols;
                hA_columns[nonzero_checked++]=col;
                hA_csrOffsets[row]++;
            }
        }
    }

system · February 6, 2024, 7:36am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
CUSPARSE_STATUS_INVALID_VALUE when using cusparseSpMM GPU-Accelerated Libraries	3	2265	July 14, 2019
[Solved]Cusparse illegal memory access unless I increase the size of the matrices for large matrices CUDA Programming and Performance	0	1057	July 24, 2014
cusparseSpMM fails with complex data GPU-Accelerated Libraries cusparse	3	434	June 6, 2023
Upgrading to CUDA 12.4 broke down the application GPU-Accelerated Libraries cublas , cusparse	13	995	July 21, 2024
Performance characteristics of cusparseSpMM GPU-Accelerated Libraries nvbugs	1	668	June 28, 2023
Illegal memory access, __global__ read of size 4 CUDA Programming and Performance	1	1371	March 7, 2018
cusparseSpMatDescr_t lifecycle and resource management GPU-Accelerated Libraries cusparse	7	1237	March 14, 2022
an illegal memory access was encountered CUDA Programming and Performance	7	63263	November 10, 2017
Bugs when trying to perform tranpose of a matrix using cuSPARSE GPU-Accelerated Libraries	2	716	October 12, 2021
cusparseLtMatmul is slower than cublasGemmEx GPU-Accelerated Libraries cublas , cusparse	0	613	April 21, 2023

[CuSparse] Inconsistent illegal memory access error

Related topics