Hi,
I use the cuSPARSELt matmul sample from NVIDIA/CUDALibrarySamples on GitHub (cuSPARSELt/matmul/matmul_example.cpp at master) for my experiments.
I noticed that with FP8, matrix A (the matrix that gets compressed) cannot have more than 2^31 - 1 elements.
The same sizes work fine with FP16, however.
The problem comes from cusparseLtSpMMACompressedSize. When matrix A has more than 2^31 elements (I used m = n = k = 65536), the function does not return an error; instead it sets compressedSize[out] to 18446744072098938880, just below SIZE_MAX.
That triggers an allocation of about 17 x 10^9 GB (roughly 17 exabytes), which of course ends with a CUDA out-of-memory error.
Here is the error:
compressed_size: 18446744072098938880, compressed_buffer_size: 49216
CUDA API failed at line 266 with error: out of memory (2)
And here is the corresponding entry in the cuSPARSELt log:
[2026-01-30 23:25:48][CUSPARSELT][268854][Api][cusparseLtSpMMACompressedSize] plan[in]={ptr=0x7fffffffaca0, matmul={ptr=0x7fffffffa8a0, m=65536, n=65536, k=65536, opA=NON_TRANSPOSE, opB=TRANSPOSE, matA={ptr=0x7fffffffa2a0, format=Structured, rows=65536, cols=65536, ld=65536, alignment=16, valuesType=CUDA_R_8F_E4M3}, matB={ptr=0x7fffffffa4a0, format=Dense, rows=65536, cols=65536, ld=65536, alignment=16, valuesType=CUDA_R_8F_E4M3}, matC={ptr=0x7fffffffa6a0, format=Dense, rows=65536, cols=65536, ld=65536, alignment=16, valuesType=CUDA_R_16F}, computeType=COMPUTE_32F, is_A_sparse=true, activation=NONE, GeLUScaling=1, reluThreshold=0, reluUpperBound=3.4028235e+38, hasBias=false, alphaVector=false, betaVector=false, biasPointer=0x0, biasStride=0, batchSize=1, batchStride=0}, algSel={ptr=0x7fffffffaaa0, alg=MATMUL_ALG_DEFAULT, algId=0, numAlgorithms=4, searchIters=5, splitKInfo={splitK=-1, splitKMode=CUSPARSELT_HEURISTIC, splitKNumBuffers=0}}}, compressedSize[out]=18446744072098938880
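For what it's worth, the returned value decodes exactly as a 32-bit signed overflow sign-extended to 64 bits. I obviously cannot see the internals, so the layout below (one byte per kept FP8 value plus two metadata bits per kept value, i.e. 5/8 of the dense size) is only my assumption, but it reproduces the logged number exactly:

// Speculative reconstruction of the bogus size; the 5/8-of-dense layout is
// my assumption, not documented cuSPARSELt behavior.
#include <cstdint>
#include <cstdio>
int main() {
    int64_t m = 65536, k = 65536;
    int64_t values_bytes = m * k / 2;  // 2:4 sparsity keeps half the values, FP8 = 1 byte each
    int64_t meta_bytes   = m * k / 8;  // assuming 2 metadata bits per kept value
    int64_t total        = values_bytes + meta_bytes;    // 2684354560 = 0xA0000000
    int32_t truncated    = static_cast<int32_t>(total);  // wraps to -1610612736
    size_t  reported     = static_cast<size_t>(static_cast<int64_t>(truncated));
    std::printf("%zu\n", reported);  // prints 18446744072098938880, matching the log
}

So it looks as if the size is computed or stored in a signed 32-bit intermediate somewhere before being widened to size_t.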
Here is the exact code I use:
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparseLt.h> // cusparseLt header
#include <cstdio> // printf
#include <cstdlib> // std::rand
#include <cuda_fp8.h>
#define FP16 1000
#define INT8 1001
#define FP8 1002
/*
* Choose your data type for matrices A and B
*/
// #define AB_TYPE FP16
#define AB_TYPE FP8
// #define AB_TYPE INT8
#if AB_TYPE == FP8
using AB_t = __nv_fp8_e4m3;
using C_t = __half;
using COMPUTE_t = float;
#elif AB_TYPE == FP16
using AB_t = __half;
using C_t = __half;
using COMPUTE_t = float;
#elif AB_TYPE == INT8
using AB_t = int8_t;
using C_t = int8_t; // can also be __half, __nv_bfloat16, int
using COMPUTE_t = int;
#endif
template <typename value_t>
struct cuda_type { };
template <>
struct cuda_type <__half> {
static constexpr cudaDataType value = CUDA_R_16F;
};
template <>
struct cuda_type <__nv_bfloat16> {
static constexpr cudaDataType value = CUDA_R_16BF;
};
template <>
struct cuda_type <__nv_fp8_e4m3> {
static constexpr cudaDataType value = CUDA_R_8F_E4M3;
};
template <>
struct cuda_type <int8_t> {
static constexpr cudaDataType value = CUDA_R_8I;
};
template <>
struct cuda_type <int> {
static constexpr cudaDataType value = CUDA_R_32I;
};
template <typename value_t>
struct cusparse_compute_type { };
template <>
struct cusparse_compute_type<float> {
static constexpr cusparseComputeType value = CUSPARSE_COMPUTE_32F;
};
template <>
struct cusparse_compute_type<int> {
static constexpr cusparseComputeType value = CUSPARSE_COMPUTE_32I;
};
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
printf("CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \
__LINE__, cusparseLtGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
constexpr int EXIT_UNSUPPORTED = 2;
int main(void) {
setenv("CUSPARSELT_LOG_LEVEL", "5", 1);
setenv("CUSPARSELT_LOG_FILE", "cusparseLt.log", 1);
int major_cc, minor_cc;
CHECK_CUDA( cudaDeviceGetAttribute(&major_cc,
cudaDevAttrComputeCapabilityMajor, 0) )
CHECK_CUDA( cudaDeviceGetAttribute(&minor_cc,
cudaDevAttrComputeCapabilityMinor, 0) )
if (!(major_cc == 8 && minor_cc == 0) &&
!(major_cc == 8 && minor_cc == 6) &&
!(major_cc == 8 && minor_cc == 7) &&
!(major_cc == 8 && minor_cc == 9) &&
!(major_cc == 9 && minor_cc == 0) &&
!(major_cc == 10 && minor_cc == 0) &&
!(major_cc == 10 && minor_cc == 1) &&
!(major_cc == 11 && minor_cc == 0) &&
!(major_cc == 12 && minor_cc == 0) &&
!(major_cc == 12 && minor_cc == 1)) {
std::printf("\ncusparseLt is supported only on GPU devices with"
" compute capability == 8.0, 8.6, 8.7, 8.9, 9.0, 10.0, 10.1, 11.0, 12.0, 12.1; current: %d.%d\n\n",
major_cc, minor_cc);
return EXIT_UNSUPPORTED;
}
// Host problem definition, row-major order
// bigger sizes may require dynamic allocations
constexpr int m = 65536;
constexpr int n = 65536;
constexpr int k = 65536;
auto order = CUSPARSE_ORDER_ROW;
auto opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto opB = CUSPARSE_OPERATION_TRANSPOSE;
auto type_AB = cuda_type<AB_t>::value;
auto type_C = cuda_type<C_t>::value;
auto compute_type = cusparse_compute_type<COMPUTE_t>::value;
bool matmul_search = true;
bool is_rowmajor = (order == CUSPARSE_ORDER_ROW);
bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE);
bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE);
auto num_A_rows = (isA_transposed) ? k : m;
auto num_A_cols = (isA_transposed) ? m : k;
auto num_B_rows = (isB_transposed) ? n : k;
auto num_B_cols = (isB_transposed) ? k : n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = (is_rowmajor) ? num_A_cols : num_A_rows;
auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows;
auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows;
auto A_height = (is_rowmajor) ? num_A_rows : num_A_cols;
auto B_height = (is_rowmajor) ? num_B_rows : num_B_cols;
auto C_height = (is_rowmajor) ? num_C_rows : num_C_cols;
size_t A_size = (size_t)A_height * lda * sizeof(AB_t);
size_t B_size = (size_t)B_height * ldb * sizeof(AB_t);
size_t C_size = (size_t)C_height * ldc * sizeof(C_t);
auto hA = new AB_t[(size_t)A_height * lda];
auto hB = new AB_t[(size_t)B_height * ldb];
auto hC = new C_t[(size_t)C_height * ldc];
for (size_t i = 0; i < (size_t)m * k; i++)
hA[i] = static_cast<AB_t>(static_cast<float>(std::rand() % 5 - 2)); // -2 ~ 2
for (size_t i = 0; i < (size_t)k * n; i++)
hB[i] = static_cast<AB_t>(static_cast<float>(std::rand() % 5 - 2));
for (size_t i = 0; i < (size_t)m * n; i++)
hC[i] = static_cast<C_t>(static_cast<float>(std::rand() % 5 - 2));
float alpha = 1.0f;
float beta = 1.0f;
//--------------------------------------------------------------------------
// Device memory management
AB_t* dA, *dB, *dA_compressed;
C_t* dC, *dD;
int *d_valid;
CHECK_CUDA( cudaMalloc((void**) &dA, A_size) )
CHECK_CUDA( cudaMalloc((void**) &dB, B_size) )
CHECK_CUDA( cudaMalloc((void**) &dC, C_size) )
CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) )
dD = dC;
CHECK_CUDA( cudaMemcpy(dA, hA, A_size, cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dB, hB, B_size, cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dC, hC, C_size, cudaMemcpyHostToDevice) )
//--------------------------------------------------------------------------
cusparseLtHandle_t handle;
cusparseLtMatDescriptor_t matA, matB, matC;
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
cudaStream_t stream = nullptr;
CHECK_CUSPARSE( cusparseLtInit(&handle) )
// matrix descriptor initialization
CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit(
&handle, &matA, num_A_rows,
num_A_cols, lda, alignment,
type_AB, order,
CUSPARSELT_SPARSITY_50_PERCENT) )
CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
&handle, &matB, num_B_rows,
num_B_cols, ldb, alignment,
type_AB, order) )
CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
&handle, &matC, num_C_rows,
num_C_cols, ldc, alignment,
type_C, order) )
// matmul, algorithm selection, and plan initialization
CHECK_CUSPARSE( cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB,
&matA, &matB, &matC, &matC,
compute_type) )
CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionInit(
&handle, &alg_sel, &matmul,
CUSPARSELT_MATMUL_ALG_DEFAULT) )
CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel))
CHECK_CUSPARSE(cusparseLtMatmulDescSetAttribute(&handle,
&matmul,
CUSPARSELT_MATMUL_SPARSE_MAT_POINTER,
&dA,
sizeof(dA)));
//--------------------------------------------------------------------------
// Prune the A matrix (in-place) and check the correctness
CHECK_CUSPARSE( cusparseLtSpMMAPrune(&handle, &matmul, dA, dA,
CUSPARSELT_PRUNE_SPMMA_TILE, stream) )
CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck(&handle, &matmul, dA,
d_valid, stream) )
int is_valid;
CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int),
cudaMemcpyDeviceToHost, stream) )
CHECK_CUDA( cudaStreamSynchronize(stream) )
if (is_valid != 0) {
std::printf("!!!! The matrix has been pruned in a wrong way. "
"cusparseLtMatmul will not provide correct results\n");
return EXIT_FAILURE;
}
//--------------------------------------------------------------------------
// Compress the A matrix
size_t compressed_size, compressed_buffer_size;
void* dA_compressedBuffer;
CHECK_CUSPARSE( cusparseLtSpMMACompressedSize(&handle, &plan,
&compressed_size,
&compressed_buffer_size) )
printf("compressed_size: %zu, compressed_buffer_size: %zu\n", compressed_size, compressed_buffer_size);
CHECK_CUDA( cudaMalloc((void**) &dA_compressed, compressed_size) )
CHECK_CUDA( cudaMalloc((void**) &dA_compressedBuffer,
compressed_buffer_size) )
CHECK_CUSPARSE( cusparseLtSpMMACompress(&handle, &plan, dA, dA_compressed,
dA_compressedBuffer, stream) )
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Search the best kernel
int num_streams = 0;
cudaStream_t* streams = nullptr;
if (matmul_search) {
CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, &plan, &alpha,
dA_compressed, dB, &beta,
dC, dD, nullptr,
streams, num_streams) )
// dC accumulates so reset dC for correctness check
CHECK_CUDA( cudaMemcpy(dC, hC, C_size, cudaMemcpyHostToDevice) )
}
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
size_t workspace_size;
CHECK_CUSPARSE( cusparseLtMatmulGetWorkspace(&handle, &plan,
&workspace_size))
void* d_workspace;
CHECK_CUDA( cudaMalloc((void**) &d_workspace, workspace_size) )
// Perform the matrix multiplication
CHECK_CUSPARSE( cusparseLtMatmul(&handle, &plan, &alpha, dA_compressed, dB,
&beta, dC, dD, d_workspace, streams,
num_streams) )
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// destroy plan and handle
CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matA) )
CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matB) )
CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matC) )
CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionDestroy(&alg_sel) )
CHECK_CUSPARSE( cusparseLtMatmulPlanDestroy(&plan) )
CHECK_CUSPARSE( cusparseLtDestroy(&handle) )
// //--------------------------------------------------------------------------
// // device result check
// // matrix A has been pruned
// CHECK_CUDA( cudaMemcpy(hA, dA, A_size, cudaMemcpyDeviceToHost) )
// bool A_std_layout = (is_rowmajor != isA_transposed);
// bool B_std_layout = (is_rowmajor != isB_transposed);
// // host computation
// C_t* hC_result = new C_t[C_height * ldc];
// for (int i = 0; i < m; i++) {
// for (int j = 0; j < n; j++) {
// COMPUTE_t sum = static_cast<COMPUTE_t>(0);
// for (int k1 = 0; k1 < k; k1++) {
// auto posA = (A_std_layout) ? static_cast<size_t>(i) * lda + k1 : i + k1 * lda;
// auto posB = (B_std_layout) ? static_cast<size_t>(k1) * ldb + j : k1 + j * ldb;
// sum += static_cast<COMPUTE_t>(hA[posA]) * // [i][k]
// static_cast<COMPUTE_t>(hB[posB]); // [k][j]
// }
// auto posC = (is_rowmajor) ? i * ldc + j : i + j * ldc;
// hC_result[posC] = static_cast<C_t>(alpha * sum + beta * static_cast<float>(hC[posC])); // [i][j]
// }
// }
// // reuse hC for device results
// CHECK_CUDA( cudaMemcpy(hC, dD, C_size, cudaMemcpyDeviceToHost) )
// // host-device comparison
// int correct = 1;
// for (int i = 0; i < m; i++) {
// for (int j = 0; j < n; j++) {
// auto pos = (is_rowmajor) ? i * ldc + j : i + j * ldc;
// auto device_value = hC[pos];
// auto host_value = hC_result[pos];
// if (device_value != host_value) {
// // direct floating point comparison is not reliable
// std::printf("(%d, %d):\t%3.0f vs. %3.0f\n",
// i, j, static_cast<float>(host_value), static_cast<float>(device_value));
// correct = 0;
// break;
// }
// }
// }
// if (correct) {
// std::printf("matmul_example test PASSED\n");
// }
// else {
// std::printf("matmul_example test FAILED: wrong result\n");
// }
//--------------------------------------------------------------------------
// host memory deallocation
delete[] hA;
delete[] hB;
delete[] hC;
// delete[] hC_result;
//--------------------------------------------------------------------------
// device memory deallocation
CHECK_CUDA( cudaFree(dA_compressed) )
CHECK_CUDA( cudaFree(dA) )
CHECK_CUDA( cudaFree(dB) )
CHECK_CUDA( cudaFree(dC) )
CHECK_CUDA( cudaFree(d_valid) )
CHECK_CUDA( cudaFree(d_workspace) )
CHECK_CUDA( cudaFree(dA_compressedBuffer) )
return EXIT_SUCCESS;
}
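To keep the repro from dying on the bogus allocation, I now guard the cudaMalloc of dA_compressed with a sanity check (my own addition, not part of the listing above): a 2:4-compressed matrix can never be larger than the dense A, so anything bigger is clearly garbage.

// Defensive guard inserted after cusparseLtSpMMACompressedSize (my addition):
// a 2:4-compressed matrix is never larger than the dense one.
size_t dense_A_bytes = static_cast<size_t>(num_A_rows) * num_A_cols * sizeof(AB_t);
if (compressed_size > dense_A_bytes) {
    std::printf("bogus compressed_size %zu (dense A is only %zu bytes)\n",
                compressed_size, dense_A_bytes);
    return EXIT_FAILURE;
}

This avoids the 17-exabyte cudaMalloc and makes the failure explicit.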
I’m on Blackwell GPUs.
I use CUDA 13.1 with cuSPARSELt 0.8.1+cu13.
Here is the compile command:
nvcc -O3 -std=c++17 \
-I.. \
matmul_example.cpp \
-lcusparseLt -lcudart -lcublas \
-arch=sm_120 \
-o gemm
Since I don't have access to the source of this function, I don't see how this result is possible.
Please let me know whether this is actually a bug or whether I did something wrong.
Thank you in advance.