Warning: Cuda API error detected: cuModuleLoadFatBinary returned (0xd1)

I am using Visual Studio Code (VSCode) on Windows 11 to connect to WSL2 (Ubuntu 22.04) and encountering an error when attempting to debug with cuda-gdb. Strangely, the compiled executable runs flawlessly when executed directly.

NVIDIA (R) CUDA Debugger
CUDA Toolkit 12.1 release
Portions Copyright (C) 2007-2023 NVIDIA Corporation
GNU gdb (GDB) 12.1
Copyright (C) 2022 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.

warning: Cuda API error detected: cuModuleLoadFatBinary returned (0xd1)

warning: Cuda API error detected: cuModuleLoadFatBinary returned (0xd1)

My setup is as follows:

==============NVSMI LOG==============

Timestamp                                 : Tue Apr 30 00:09:53 2024
Driver Version                            : 552.22
CUDA Version                              : 12.4

Attached GPUs                             : 1
GPU 00000000:01:00.0
    Product Name                          : NVIDIA GeForce RTX 3060 Laptop GPU
    Product Brand                         : GeForce
    Product Architecture                  : Ampere
    Display Mode                          : Enabled
    Display Active                        : Enabled
    Persistence Mode                      : Enabled
    Addressing Mode                       : N/A
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : WDDM
        Pending                           : WDDM
    Serial Number                         : N/A
    GPU UUID                              : GPU-39ea9d63-045b-112d-1973-27cc4470dba9
    Minor Number                          : N/A
    VBIOS Version                         : 94.06.19.00.3e
    MultiGPU Board                        : No
    Board ID                              : 0x100
    Board Part Number                     : N/A
    GPU Part Number                       : 2520-775-A1
    FRU Part Number                       : N/A
    Module ID                             : 1
    Inforom Version
        Image Version                     : G001.0000.03.03
        OEM Object                        : 2.0
        ECC Object                        : N/A
        Power Management Object           : N/A
    Inforom BBX Object Flush
        Latest Timestamp                  : N/A
        Latest Duration                   : N/A
    GPU Operation Mode
        Current                           : N/A
        Pending                           : N/A
    GPU C2C Mode                          : N/A
    GPU Virtualization Mode
        Virtualization Mode               : None
        Host VGPU Mode                    : N/A
        vGPU Heterogeneous Mode           : N/A
    GPU Reset Status
        Reset Required                    : No
        Drain and Reset Recommended       : N/A
    GSP Firmware Version                  : N/A
    IBMNPU
        Relaxed Ordering Mode             : N/A
    PCI
        Bus                               : 0x01
        Device                            : 0x00
        Domain                            : 0x0000
        Device Id                         : 0x252010DE
        Bus Id                            : 00000000:01:00.0
        Sub System Id                     : 0x380117AA
        GPU Link Info
            PCIe Generation
                Max                       : 3
                Current                   : 3
                Device Current            : 3
                Device Max                : 4
                Host Max                  : 3
            Link Width
                Max                       : 16x
                Current                   : 8x
        Bridge Chip
            Type                          : N/A
            Firmware                      : N/A
        Replays Since Reset               : 0
        Replay Number Rollovers           : 0
        Tx Throughput                     : 2 KB/s
        Rx Throughput                     : 4 KB/s
        Atomic Caps Inbound               : N/A
        Atomic Caps Outbound              : N/A
    Fan Speed                             : N/A
    Performance State                     : P8
    Clocks Event Reasons
        Idle                              : Active
        Applications Clocks Setting       : Not Active
        SW Power Cap                      : Not Active
        HW Slowdown                       : Not Active
            HW Thermal Slowdown           : Not Active
            HW Power Brake Slowdown       : Not Active
        Sync Boost                        : Not Active
        SW Thermal Slowdown               : Not Active
        Display Clock Setting             : Not Active
    Sparse Operation Mode                 : N/A
    FB Memory Usage
        Total                             : 6144 MiB
        Reserved                          : 148 MiB
        Used                              : 1239 MiB
        Free                              : 4758 MiB
    BAR1 Memory Usage
        Total                             : 8192 MiB
        Used                              : 1 MiB
        Free                              : 8191 MiB
    Conf Compute Protected Memory Usage
        Total                             : N/A
        Used                              : N/A
        Free                              : N/A
    Compute Mode                          : Default
    Utilization
        Gpu                               : 5 %
        Memory                            : 16 %
        Encoder                           : 0 %
        Decoder                           : 0 %
        JPEG                              : 0 %
        OFA                               : 0 %
    Encoder Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    FBC Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    ECC Mode
        Current                           : N/A
        Pending                           : N/A
    ECC Errors
        Volatile
            SRAM Correctable              : N/A
            SRAM Uncorrectable Parity     : N/A
            SRAM Uncorrectable SEC-DED    : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
        Aggregate
            SRAM Correctable              : N/A
            SRAM Uncorrectable Parity     : N/A
            SRAM Uncorrectable SEC-DED    : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
            SRAM Threshold Exceeded       : N/A
        Aggregate Uncorrectable SRAM Sources
            SRAM L2                       : N/A
            SRAM SM                       : N/A
            SRAM Microcontroller          : N/A
            SRAM PCIE                     : N/A
            SRAM Other                    : N/A
    Retired Pages
        Single Bit ECC                    : N/A
        Double Bit ECC                    : N/A
        Pending Page Blacklist            : N/A
    Remapped Rows                         : N/A
    Temperature
        GPU Current Temp                  : 39 C
        GPU T.Limit Temp                  : N/A
        GPU Shutdown Temp                 : 105 C
        GPU Slowdown Temp                 : 102 C
        GPU Max Operating Temp            : 105 C
        GPU Target Temperature            : 87 C
        Memory Current Temp               : N/A
        Memory Max Operating Temp         : N/A
    GPU Power Readings
        Power Draw                        : 13.17 W
        Current Power Limit               : 75.00 W
        Requested Power Limit             : N/A
        Default Power Limit               : 60.00 W
        Min Power Limit                   : 1.00 W
        Max Power Limit                   : 75.00 W
    GPU Memory Power Readings 
        Power Draw                        : N/A
    Module Power Readings
        Power Draw                        : N/A
        Current Power Limit               : N/A
        Requested Power Limit             : N/A
        Default Power Limit               : N/A
        Min Power Limit                   : N/A
        Max Power Limit                   : N/A
    Clocks
        Graphics                          : 210 MHz
        SM                                : 210 MHz
        Memory                            : 405 MHz
        Video                             : 555 MHz
    Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Default Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Deferred Clocks
        Memory                            : N/A
    Max Clocks
        Graphics                          : 2100 MHz
        SM                                : 2100 MHz
        Memory                            : 6001 MHz
        Video                             : 1950 MHz
    Max Customer Boost Clocks
        Graphics                          : N/A
    Clock Policy
        Auto Boost                        : N/A
        Auto Boost Default                : N/A
    Voltage
        Graphics                          : 643.750 mV
    Fabric
        State                             : N/A
        Status                            : N/A
        CliqueId                          : N/A
        ClusterUUID                       : N/A
        Health
            Bandwidth                     : N/A
    Processes                             : None

The debugging target is a simple matrix multiplication example using cuSparseLt, specifically the matmul_example.cpp file. The issue arises during step-by-step debugging at the line CHECK_CUSPARSE( cusparseLtInit(&handle) ), leading me to suspect a problem with cuSparseLt. My task.json and launch.json configurations are configured as such:
task.json

{
	"version": "2.0.0",
	"tasks": [
		{
            "label": "mynvcc",
            "type": "shell",
            "command": "nvcc",
            "args": ["-lcusparse","-g","-G","-o","${fileDirname}/test","${file}"]
        },
        {
            "label": "mynvcc2",
            "type": "shell",
            "command": "nvcc",
            "args": ["-lcublas","-g","-G","-o","${fileDirname}/test","${file}"]
        },
        {
            "label": "cusparseLtNvcc",
            "type": "shell",
            "command": "nvcc",
            "args": ["-lcusparseLt","-lcusparse","-ldl","-gencode","arch=compute_80,code=sm_80","-g","-G","-o","${fileDirname}/test","${file}"]
        },
        {
            "label": "cusparseLtNvccStatic",
            "type": "shell",
            "command": "nvcc",
            "args": ["-lcusparseLt","-lcusparse","-lcudart","-lcuda","-Xlinker=/home/whh/workspace/env/cusparseLt/lib64/libcusparseLt_static.a","-ldl","-gencode","arch=compute_80,code=sm_80","-g","-G","-o","${fileDirname}/test_static","${file}"]
        }
	]
}

launch.json

{
    // 使用 IntelliSense 了解相关属性。 
    // 悬停以查看现有属性的描述。
    // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Cusparse C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
            "program": "${fileDirname}/test",
            "preLaunchTask": "mynvcc"
        },
        {
            "name": "Cusparse C++: Attach",
            "type": "cuda-gdb",
            "request": "attach"
        },
        {
            "name": "Cublas C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
            "program": "${fileDirname}/test",
            "preLaunchTask": "mynvcc2"
        },
        {
            "name": "Cublas C++: Attach",
            "type": "cuda-gdb",
            "request": "attach"
        },
        {
            "name": "CusparseLt: Launch",
            "type": "cuda-gdb",
            "debuggerPath": "/usr/local/cuda/bin/cuda-gdb",
            "request": "launch",
            "program": "${fileDirname}/test",
            "preLaunchTask": "cusparseLtNvcc"
        },
        {
            "name": "CusparseLt: Attach",
            "type": "cuda-gdb",
            "debuggerPath": "/usr/local/cuda/bin/cuda-gdb",
            "request": "attach",
            "program": "${fileDirname}/test",
            "preLaunchTask": "cusparseLtNvcc"
        },
        {
            "name": "CusparseLtStatic: Launch",
            "type": "cuda-gdb",
            "debuggerPath": "/usr/local/cuda/bin/cuda-gdb",
            "request": "launch",
            "program": "${fileDirname}/test_static",
            "preLaunchTask": "cusparseLtNvccStatic"
        },
    ]
}

Regarding cuSparseLt, it’s peculiar that after downloading version 0.6.0, there isn’t a distinct cuSparseLt folder, whereas the official tutorials reference paths like ${CUSPARSELT_DIR}/include. This discrepancy is confusing. The error message I receive during debugging corresponds to:

CUDA_ERROR_NO_BINARY_FOR_GPU = 209
This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration.

but I’m at a loss for how to address this particular issue. Any guidance or insights from the CUDA community would be greatly appreciated.

By the way, my cuda driver version is :

NVIDIA-SMI 550.76.01              Driver Version: 552.22         CUDA Version: 12.4

my cuda_toolkit version is :

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Feb__7_19:32:13_PST_2023
Cuda compilation tools, release 12.1, V12.1.66
Build cuda_12.1.r12.1/compiler.32415258_0

Hi, @892516165

Sorry for the issue you met !
Can you help to isolate it by

  1. Debug another CUDA sample in VSCode, will the same issue happen ?
  2. Enter into WSL2, and debug this sample using cuda-gdb directly, will the same issue happen ?
  3. In WSL2, debug another simple CUDA sample

Upon entering WSL2 and attempting to debug a sample directly using cuda-gdb , the same issue arises. Conversely, if cuda-gdb is utilized to debug simpler samples that do not incorporate the cusparseLt library, this problem does not manifest.

Hi @892516165
Thank you very much for the report! To help us debug the problem, could you collect and share additional logs with us?

  • Add NVLOG_CONFIG_FILE variable pointing the nvlog.config file (attached). E.g.: NVLOG_CONFIG_FILE=${HOME}/nvlog.config
    nvlog.config (539 Bytes)
  • Run the debugging session.
  • You should see the /tmp/debugger.log file created - could you share it with us?

Hello, I followed your instructions, but I haven’t found the /tmp/debugger.log file.

Could you try with the new config (attached): nvlog.local.config (544 Bytes)

Please place the nvlog.local.config file in your home directory, define the env variable, and run the CLI debugging session:

export NVLOG_CONFIG_FILE=${HOME}/nvlog.local.config
cuda-gdb ./app/to/debug

The debugger.log file should be generated in the same directory where you run cuda-gdb command.

If possible could you also capture the cuda-gdb output and share it as well?

debugger.log (1.2 MB)
this is the debugger.log file

Hi @892516165
Can you continue the debugging session after this warning is printed? This can be a normal cusparse library behavior (it’s expected to have CUDA_ERROR_NO_BINARY_FOR_GPU during the library initialization):

  • without the debugger, this error is handled by cusparse
  • when running with cuda-gdb the debugger would intercept and report this error

Similar issue: Nsight Compute get stuck at cusparse function calling: CUDA_ERROR_NO_BINARY_FOR_GPU(209) - #4 by jmarusarz

How should I proceed with debugging? When using vscode, I haven’t found a way to continue debugging. Neither have I located where to set “Break on error.”

Could you try setting onAPIError attribute to ignore. E.g.:

        {
            "name": "Cusparse C++: Attach",
            "type": "cuda-gdb",
            "onAPIError": "ignore",
            "request": "attach"
        }

for all cuda-gdb configurations.

More details: Getting Started with the CUDA Debugger :: NVIDIA Nsight VSCE Documentation (search for onAPIError here)

i change the launch to this:

 {
            "name": "CusparseLt: Launch",
            "type": "cuda-gdb",
            "debuggerPath": "/usr/local/cuda/bin/cuda-gdb",
            "onAPIError": "ignore",
            "logFile": "${fileDirname}/log.txt",
            "request": "launch",
            "program": "${fileDirname}/test",
            "preLaunchTask": "cusparseLtNvcc"
        },

and the log.txt:
log.txt (1.3 MB)

Hi @892516165 , thank you for the updated log. Could you please clarify a few more things:

  • Can you run the application without the debugger? Does it produce the expected result (e.g. does cusparseLtInit return success)?

  • When you run the debugging session in cuda-gdb directly (without VS Code), can you continue the debugging after this warning is printed? This warning is likely harmless (coming from underlying library internals) and can be ignored. If you cannot continue the debugging session after this warning is printed (e.g. the debugger hangs or quits), could you share the cuda-gdb output (which commands you input and the cuda-gdb output).

  • When debugging in VS Code, what exactly happens when you step over cusparseLtInit - does the app exit (or does the step never complete)?

  1. If the compiled executable is run directly, there should be no issues.
  2. When I attempt to execute cusparseLtInit using cuda-gdb from the command line, it consistently fails with errors, as seen in the debugger.log. Do I need to set onAPIError somewhere when using cuda-gdb via the command line?
  3. While debugging in Visual Studio Code and reaching cusparseInit, the process never halts; instead, the error is logged in log.txt.
    debugger.log (821.1 KB)
    log.txt (2.1 MB)

Hi @892516165
Thank you for the updated logs, we will try to reproduce the issue locally. I will update this post as soon as we manage to identify the issue.

Hi, @892516165

Now we know the API error CUDA_ERROR_NO_BINARY_FOR_GPU is expected, and you can ignore. now the problem is

  1. Can you continue to debug ?
  2. You mentioned “It consistently fails with errors”, which error do you refer here ? Please provide the cuda-gdb output.

When I’m debugging, this error keeps popping up: “warning: Cuda API error detected: cuModuleLoadFatBinary returned (0xd1)”, preventing me from proceeding to the next step. I can’t continue to debug.
debugger.log (1017.3 KB)

OK. Got it.

I see you are using NVIDIA GeForce RTX 3060 Laptop GPU, so can you use “arch=compute_86,code=sm_86” to compile the sample ?

Also it seems you are using cuda-gdb from CUDA 12.1 and the driver is 552.22 for 12.4.
Can you update both to latest release ? (We recently have cuda12.5 released)

If you still encounter the issue, please provide us a minimal repro (for example: matmul_example.cpp). Thanks!

I update cuda driver and toolkit to 12.5 and use “arch=compute_86,code=sm_86” to compile the sample, but the error still keeps popping up.
debugger.log (607.9 KB)

Please share the source code for repro. Thanks !

The following code is from within this code repository.

/*
 * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparseLt.h>       // cusparseLt header
#include <cstdio>             // printf
#include <cstdlib>            // std::rand

#define CHECK_CUDA(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

constexpr int EXIT_UNSUPPORTED = 2;

int main(void) {
    int major_cc, minor_cc;
    CHECK_CUDA( cudaDeviceGetAttribute(&major_cc,
                                       cudaDevAttrComputeCapabilityMajor, 0) )
    CHECK_CUDA( cudaDeviceGetAttribute(&minor_cc,
                                       cudaDevAttrComputeCapabilityMinor, 0) )
    if (!(major_cc == 8 && minor_cc == 0) &&
        !(major_cc == 8 && minor_cc == 6) &&
        !(major_cc == 8 && minor_cc == 9)) {
        std::printf("\ncusparseLt is supported only on GPU devices with"
                    " compute capability == 8.0, 8.6, 8.9 current: %d.%d\n\n",
                     major_cc, minor_cc);
        return EXIT_UNSUPPORTED;
    }
    // Host problem definition, row-major order
    // bigger sizes may require dynamic allocations
    constexpr int m            = 32;
    constexpr int n            = 32;
    constexpr int k            = 32;
    auto          order        = CUSPARSE_ORDER_ROW;
    auto          opA          = CUSPARSE_OPERATION_NON_TRANSPOSE;
    auto          opB          = CUSPARSE_OPERATION_NON_TRANSPOSE;
    auto          type         = CUDA_R_16F;
    auto          compute_type = CUSPARSE_COMPUTE_32F;

    bool     is_rowmajor    = (order == CUSPARSE_ORDER_ROW);
    bool     isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE);
    bool     isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE);
    auto     num_A_rows     = (isA_transposed) ? k : m;
    auto     num_A_cols     = (isA_transposed) ? m : k;
    auto     num_B_rows     = (isB_transposed) ? n : k;
    auto     num_B_cols     = (isB_transposed) ? k : n;
    auto     num_C_rows     = m;
    auto     num_C_cols     = n;
    unsigned alignment      = 16;
    auto     lda            = (is_rowmajor) ? num_A_cols : num_A_rows;
    auto     ldb            = (is_rowmajor) ? num_B_cols : num_B_rows;
    auto     ldc            = (is_rowmajor) ? num_C_cols : num_C_rows;
    auto     A_height       = (is_rowmajor) ? num_A_rows : num_A_cols;
    auto     B_height       = (is_rowmajor) ? num_B_rows : num_B_cols;
    auto     C_height       = (is_rowmajor) ? num_C_rows : num_C_cols;
    auto     A_size         = A_height * lda * sizeof(__half);
    auto     B_size         = B_height * ldb * sizeof(__half);
    auto     C_size         = C_height * ldc * sizeof(__half);
    __half hA[m * k];
    __half hB[k * n];
    __half hC[m * n] = {};
    for (int i = 0; i < m * k; i++)
        hA[i] = static_cast<__half>(static_cast<float>(std::rand() % 10));
    for (int i = 0; i < k * n; i++)
        hB[i] = static_cast<__half>(static_cast<float>(std::rand() % 10));
    float alpha = 1.0f;
    float beta  = 0.0f;

    //--------------------------------------------------------------------------
    // Device memory management
    __half *dA, *dB, *dC, *dD, *dA_compressed;
    int    *d_valid;
    CHECK_CUDA( cudaMalloc((void**) &dA, A_size) )
    CHECK_CUDA( cudaMalloc((void**) &dB, B_size) )
    CHECK_CUDA( cudaMalloc((void**) &dC, C_size) )
    CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) )
    dD = dC;

    CHECK_CUDA( cudaMemcpy(dA, hA, A_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, hB, B_size, cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, hC, C_size, cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    cusparseLtHandle_t             handle;
    cusparseLtMatDescriptor_t      matA, matB, matC;
    cusparseLtMatmulDescriptor_t   matmul;
    cusparseLtMatmulAlgSelection_t alg_sel;
    cusparseLtMatmulPlan_t         plan;
    cudaStream_t                   stream = nullptr;
    CHECK_CUSPARSE( cusparseLtInit(&handle) )
    // matrix descriptor initialization
    CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit(
                                            &handle, &matA, num_A_rows,
                                            num_A_cols, lda, alignment,
                                            type, order,
                                            CUSPARSELT_SPARSITY_50_PERCENT) )
    CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
                                            &handle, &matB, num_B_rows,
                                            num_B_cols, ldb, alignment,
                                            type, order) )
    CHECK_CUSPARSE( cusparseLtDenseDescriptorInit(
                                            &handle, &matC, num_C_rows,
                                            num_C_cols, ldc, alignment,
                                            type, order) )
    // matmul, algorithm selection, and plan initialization
    CHECK_CUSPARSE( cusparseLtMatmulDescriptorInit(
                                            &handle, &matmul, opA, opB,
                                            &matA, &matB, &matC, &matC,
                                            compute_type) )
    CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionInit(
                                            &handle, &alg_sel, &matmul,
                                            CUSPARSELT_MATMUL_ALG_DEFAULT) )
    CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel))

    //--------------------------------------------------------------------------
    // Prune the A matrix (in-place) and check the correctness
    CHECK_CUSPARSE( cusparseLtSpMMAPrune(&handle, &matmul, dA, dA,
                                         CUSPARSELT_PRUNE_SPMMA_TILE, stream) )
    CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck(&handle, &matmul, dA,
                                              d_valid, stream) )
    int is_valid;
    CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int),
                                cudaMemcpyDeviceToHost, stream) )
    CHECK_CUDA( cudaStreamSynchronize(stream) )
    if (is_valid != 0) {
        std::printf("!!!! The matrix has been pruned in a wrong way. "
                    "cusparseLtMatmul will not provide correct results\n");
        return EXIT_FAILURE;
    }
    //--------------------------------------------------------------------------
    // Compress the A matrix
    size_t compressed_size, compressed_buffer_size;
    void*  dA_compressedBuffer;
    CHECK_CUSPARSE( cusparseLtSpMMACompressedSize(&handle, &plan,
                                                  &compressed_size,
                                                  &compressed_buffer_size) )
    CHECK_CUDA( cudaMalloc((void**) &dA_compressed, compressed_size) )
    CHECK_CUDA( cudaMalloc((void**) &dA_compressedBuffer,
                           compressed_buffer_size) )

    CHECK_CUSPARSE( cusparseLtSpMMACompress(&handle, &plan, dA, dA_compressed,
                                            dA_compressedBuffer,stream) )
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // Search the best kernel
    int           num_streams = 0;
    cudaStream_t* streams     = nullptr;
    CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, &plan, &alpha,
                                           dA_compressed, dB, &beta,
                                           dC, dD, nullptr,
                                           streams, num_streams) )
    // otherwise, it is possible to set it directly:
    //int alg = 0;
    //CHECK_CUSPARSE( cusparseLtMatmulAlgSetAttribute(
    //                                        &handle, &alg_sel,
    //                                        CUSPARSELT_MATMUL_ALG_CONFIG_ID,
    //                                        &alg, sizeof(alg)))
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    size_t workspace_size;
    CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel))

    CHECK_CUSPARSE( cusparseLtMatmulGetWorkspace(&handle, &plan,
                                                 &workspace_size))
    void* d_workspace;
    CHECK_CUDA( cudaMalloc((void**) &d_workspace, workspace_size) )
    // Perform the matrix multiplication
    CHECK_CUSPARSE( cusparseLtMatmul(&handle, &plan, &alpha, dA_compressed, dB,
                                     &beta, dC, dD, d_workspace, streams,
                                     num_streams) )
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // destroy plan and handle
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matA) )
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matB) )
    CHECK_CUSPARSE( cusparseLtMatDescriptorDestroy(&matC) )
    CHECK_CUSPARSE( cusparseLtMatmulPlanDestroy(&plan) )
    CHECK_CUSPARSE( cusparseLtDestroy(&handle) )
    //--------------------------------------------------------------------------
    // device result check
    // matrix A has been pruned
    CHECK_CUDA( cudaMemcpy(hA, dA, A_size, cudaMemcpyDeviceToHost) )
    CHECK_CUDA( cudaMemcpy(hC, dC, C_size, cudaMemcpyDeviceToHost) )

    bool A_std_layout = (is_rowmajor != isA_transposed);
    bool B_std_layout = (is_rowmajor != isB_transposed);
    // host computation
    float hC_result[m * n];
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum  = 0.0f;
            for (int k1 = 0; k1 < k; k1++) {
                auto posA = (A_std_layout) ? i * lda + k1 : i + k1 * lda;
                auto posB = (B_std_layout) ? k1 * ldb + j : k1 + j * ldb;
                sum      += static_cast<float>(hA[posA]) *  // [i][k]
                            static_cast<float>(hB[posB]);   // [k][j]
            }
            auto posC       = (is_rowmajor) ? i * ldc + j : i + j * ldc;
            hC_result[posC] = sum;  // [i][j]
        }
    }
    // host-device comparison
    int correct = 1;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            auto pos          = (is_rowmajor) ? i * ldc + j : i + j * ldc;
            auto device_value = static_cast<float>(hC[pos]);
            auto host_value   = hC_result[pos];
            if (device_value != host_value) {
                // direct floating point comparison is not reliable
                std::printf("(%d, %d):\t%f vs. %f\n",
                            i, j, host_value, device_value);
                correct = 0;
                break;
            }
        }
    }
    if (correct)
        std::printf("matmul_example test PASSED\n");
    else
        std::printf("matmul_example test FAILED: wrong result\n");
    //--------------------------------------------------------------------------
    // device memory deallocation
    CHECK_CUDA( cudaFree(dA_compressed) )
    CHECK_CUDA( cudaFree(dA) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    CHECK_CUDA( cudaFree(d_valid) )
    CHECK_CUDA( cudaFree(d_workspace) )
    CHECK_CUDA( cudaFree(dA_compressedBuffer) )
    return EXIT_SUCCESS;
}