NCU on CUDA 12.5: what's the difference between the two lines from L2 to shared memory?

I tested cuBLAS on an H100:

#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <chrono>

#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); /* evaluate the call only once */ \
        if (err_ != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(err_) << std::endl; \
            exit(1); \
        } \
    } while (0)

#define CHECK_CUBLAS(call) \
    do { \
        if ((call) != CUBLAS_STATUS_SUCCESS) { \
            std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while (0)

void initialize_matrix(half *matrix, int rows, int cols, half value) {
    half *host_matrix = new half[rows * cols];
    for (int i = 0; i < rows * cols; ++i) {
        host_matrix[i] = value;
    }
    CHECK_CUDA(cudaMemcpy(matrix, host_matrix, rows * cols * sizeof(half), cudaMemcpyHostToDevice));
    delete[] host_matrix;
}

void print_first_element(half *matrix) {
    half host_value;
    CHECK_CUDA(cudaMemcpy(&host_value, matrix, sizeof(half), cudaMemcpyDeviceToHost));
    std::cout << "First element: " << __half2float(host_value) << std::endl;
}

void handle_cublas_status(cublasStatus_t status) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasGemmEx failed with error code: " << status << std::endl;
        switch(status) {
            case CUBLAS_STATUS_NOT_INITIALIZED:
                std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED" << std::endl;
                break;
            case CUBLAS_STATUS_ALLOC_FAILED:
                std::cerr << "CUBLAS_STATUS_ALLOC_FAILED" << std::endl;
                break;
            case CUBLAS_STATUS_INVALID_VALUE:
                std::cerr << "CUBLAS_STATUS_INVALID_VALUE" << std::endl;
                break;
            case CUBLAS_STATUS_ARCH_MISMATCH:
                std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH" << std::endl;
                break;
            case CUBLAS_STATUS_MAPPING_ERROR:
                std::cerr << "CUBLAS_STATUS_MAPPING_ERROR" << std::endl;
                break;
            case CUBLAS_STATUS_EXECUTION_FAILED:
                std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED" << std::endl;
                break;
            case CUBLAS_STATUS_INTERNAL_ERROR:
                std::cerr << "CUBLAS_STATUS_INTERNAL_ERROR" << std::endl;
                break;
            case CUBLAS_STATUS_NOT_SUPPORTED:
                std::cerr << "CUBLAS_STATUS_NOT_SUPPORTED" << std::endl;
                break;
            case CUBLAS_STATUS_LICENSE_ERROR:
                std::cerr << "CUBLAS_STATUS_LICENSE_ERROR" << std::endl;
                break;
            default:
                std::cerr << "Unknown cublas status" << std::endl;
        }
        exit(1);
    }
}

int main() {
    const int M = 4096;
    const int N = 20480;
    const int K = 5120;

    // Allocate device memory
    half *d_A, *d_B, *d_C;
    CHECK_CUDA(cudaMalloc((void**)&d_A, M * K * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_B, K * N * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_C, M * N * sizeof(half)));

    // Initialize matrices
    initialize_matrix(d_A, M, K, __float2half(1.0f));
    initialize_matrix(d_B, K, N, __float2half(1.0f));
    initialize_matrix(d_C, M, N, __float2half(0.0f));

    // Initialize cuBLAS
    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    // Set cuBLAS to use Tensor Cores
    CHECK_CUBLAS(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

    // Define scaling factors
    const half alpha = __float2half(1.0f);
    const half beta = __float2half(0.0f);

    // Perform matrix multiplication: C = alpha * A * B + beta * C
    // A is M x K
    // B is K x N
    // C is M x N

    auto start = std::chrono::high_resolution_clock::now();

    cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              N, M, K,
                              &alpha,
                              d_B, CUDA_R_16F, N,
                              d_A, CUDA_R_16F, K,
                              &beta,
                              d_C, CUDA_R_16F, N,
                              CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    // cublasGemmEx only enqueues work on the GPU, so synchronize before
    // stopping the timer; otherwise only the launch overhead is measured.
    CHECK_CUDA(cudaDeviceSynchronize());

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> duration = end - start;

    handle_cublas_status(status);

    // Print the first element of the result matrix
    print_first_element(d_C);

    // Print the execution time
    std::cout << "Execution time: " << duration.count() << " ms" << std::endl;

    // Clean up
    CHECK_CUDA(cudaFree(d_A));
    CHECK_CUDA(cudaFree(d_B));
    CHECK_CUDA(cudaFree(d_C));
    CHECK_CUBLAS(cublasDestroy(handle));

    std::cout << "Matrix multiplication completed successfully!" << std::endl;
    return 0;
}

I am curious: what is the difference between these two lines? Both go from L2 to shared memory…
output-file-full.nsight-cuprof-report.zip (40.3 MB)
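
A side note on the timing in the listing: since the GEMM call is asynchronous, CUDA events are the more idiomatic way to time the kernel on the device. Here is a minimal sketch, assuming the same handle, device pointers, and alpha/beta scalars as in the code above (not a verified drop-in replacement):

// Sketch: device-side timing with CUDA events, reusing handle, d_A/d_B/d_C,
// alpha and beta from the listing above.
cudaEvent_t ev_start, ev_stop;
CHECK_CUDA(cudaEventCreate(&ev_start));
CHECK_CUDA(cudaEventCreate(&ev_stop));

CHECK_CUDA(cudaEventRecord(ev_start));
CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                          N, M, K,
                          &alpha,
                          d_B, CUDA_R_16F, N,
                          d_A, CUDA_R_16F, K,
                          &beta,
                          d_C, CUDA_R_16F, N,
                          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
CHECK_CUDA(cudaEventRecord(ev_stop));
CHECK_CUDA(cudaEventSynchronize(ev_stop));

float gemm_ms = 0.0f;
CHECK_CUDA(cudaEventElapsedTime(&gemm_ms, ev_start, ev_stop));
std::cout << "GEMM time (events): " << gemm_ms << " ms" << std::endl;

CHECK_CUDA(cudaEventDestroy(ev_start));
CHECK_CUDA(cudaEventDestroy(ev_stop));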

The upper left arrow shows the bytes transferred from L2 to shared memory by LDGSTS (asynchronous global-to-shared memcpy) instructions. The lower right line measures the bytes transferred between the two units by TMA (Tensor Memory Accelerator) and has a symmetrical counterpart in the other direction (from shared memory to L2). You should be able to get more details on which exact metrics are used to compute each one by hovering over the respective labels.
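
To make the LDGSTS side concrete, here is a minimal, hypothetical kernel (unrelated to the cuBLAS kernel profiled above) whose global-to-shared copies typically compile to LDGSTS on sm_80 and newer, so their bytes land on that upper-left L2-to-Shared arrow; it assumes a launch with 256 threads per block:

#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
namespace cg = cooperative_groups;

// Each block stages 256 floats from global memory into shared memory with an
// asynchronous copy (LDGSTS on Ampere/Hopper), then consumes the staged data.
__global__ void async_copy_example(const float *in, float *out) {
    __shared__ float tile[256];
    cg::thread_block block = cg::this_thread_block();

    cg::memcpy_async(block, tile, in + blockIdx.x * 256, sizeof(float) * 256);
    cg::wait(block);  // block until the async copy has landed in shared memory

    unsigned i = block.thread_rank();
    out[blockIdx.x * 256 + i] = tile[i] * 2.0f;
}

The TMA line, by contrast, is fed by Hopper's bulk asynchronous copy hardware rather than per-thread LDGSTS instructions, which is why the memory chart reports the two transfer paths separately.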
