Warning: ctc__rx_bytes_data_user.sum can't be measured?

202476410arsmart · July 29, 2024, 7:00am

Hi, I am profiling a cuBLAS program with ncu (Nsight Compute) on an H100:

#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <chrono>

// Runs a CUDA runtime call and aborts the program if it did not succeed.
// The call expression is evaluated exactly once and its result is stored, so
// a failing call is never re-executed just to build the error message.
// On failure the source location and the runtime's error string are written
// to stderr and the process exits with status 1.
// The do/while(0) wrapper makes the macro a single statement, so it is safe
// to use inside an unbraced if/else.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t check_cuda_err_ = (call); \
        if (check_cuda_err_ != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(check_cuda_err_) << std::endl; \
            exit(1); \
        } \
    } while (0)

// Runs a cuBLAS call and aborts the program if it did not return
// CUBLAS_STATUS_SUCCESS. The call expression is evaluated exactly once.
// On failure the source location and the numeric cublasStatus_t value are
// written to stderr and the process exits with status 1.
// The do/while(0) wrapper makes the macro a single statement, so it is safe
// to use inside an unbraced if/else.
#define CHECK_CUBLAS(call) \
    do { \
        const cublasStatus_t check_cublas_status_ = (call); \
        if (check_cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
            std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__ << ": status " << static_cast<int>(check_cublas_status_) << std::endl; \
            exit(1); \
        } \
    } while (0)

// Fills a rows x cols matrix of half values in device memory with `value`.
//
// matrix: destination pointer; it is used as the target of a host-to-device
//         cudaMemcpy, so it must refer to at least rows * cols halves of
//         device memory.
// rows, cols: matrix dimensions; nothing is copied if either is <= 0.
// value: the constant written to every element.
//
// A temporary host buffer of the same size is filled and copied in a single
// transfer. Exits the process (via CHECK_CUDA) if the copy fails.
void initialize_matrix(half *matrix, int rows, int cols, half value) {
    if (rows <= 0 || cols <= 0) {
        return;
    }

    // Compute the element count in size_t: rows * cols in int overflows once
    // the matrix exceeds INT_MAX elements.
    const size_t count = static_cast<size_t>(rows) * static_cast<size_t>(cols);

    half *host_matrix = new half[count];
    for (size_t i = 0; i < count; ++i) {
        host_matrix[i] = value;
    }
    CHECK_CUDA(cudaMemcpy(matrix, host_matrix, count * sizeof(half), cudaMemcpyHostToDevice));
    delete[] host_matrix;
}

// Copies element 0 of `matrix` from device memory to the host and prints it
// to stdout as a float. Exits the process (via CHECK_CUDA) if the copy fails.
void print_first_element(half *matrix) {
    half first;
    CHECK_CUDA(cudaMemcpy(&first, matrix, sizeof(first), cudaMemcpyDeviceToHost));

    const float as_float = __half2float(first);
    std::cout << "First element: " << as_float << std::endl;
}

// Reports a failed cuBLAS status and terminates the program.
//
// Returns immediately when `status` is CUBLAS_STATUS_SUCCESS. Otherwise it
// writes the numeric status code followed by its symbolic name (or
// "Unknown cublas status" for unrecognised values) to stderr and exits with
// status 1.
void handle_cublas_status(cublasStatus_t status) {
    if (status == CUBLAS_STATUS_SUCCESS) {
        return;
    }

    // Map the status to its symbolic name; unrecognised values keep the fallback.
    const char *label = "Unknown cublas status";
    switch (status) {
        case CUBLAS_STATUS_NOT_INITIALIZED: label = "CUBLAS_STATUS_NOT_INITIALIZED"; break;
        case CUBLAS_STATUS_ALLOC_FAILED:    label = "CUBLAS_STATUS_ALLOC_FAILED";    break;
        case CUBLAS_STATUS_INVALID_VALUE:   label = "CUBLAS_STATUS_INVALID_VALUE";   break;
        case CUBLAS_STATUS_ARCH_MISMATCH:   label = "CUBLAS_STATUS_ARCH_MISMATCH";   break;
        case CUBLAS_STATUS_MAPPING_ERROR:   label = "CUBLAS_STATUS_MAPPING_ERROR";   break;
        case CUBLAS_STATUS_EXECUTION_FAILED: label = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
        case CUBLAS_STATUS_INTERNAL_ERROR:  label = "CUBLAS_STATUS_INTERNAL_ERROR";  break;
        case CUBLAS_STATUS_NOT_SUPPORTED:   label = "CUBLAS_STATUS_NOT_SUPPORTED";   break;
        case CUBLAS_STATUS_LICENSE_ERROR:   label = "CUBLAS_STATUS_LICENSE_ERROR";   break;
        default: break;
    }

    std::cerr << "cublasGemmEx failed with error code: " << status << std::endl;
    std::cerr << label << std::endl;
    exit(1);
}

int main() {
    const int M = 8192;
    const int N = 20480;
    const int K = 5120;

    // Allocate device memory
    half *d_A, *d_B, *d_C;
    CHECK_CUDA(cudaMalloc((void**)&d_A, M * K * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_B, K * N * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_C, M * N * sizeof(half)));

    // Initialize matrices
    initialize_matrix(d_A, M, K, __float2half(1.0f));
    initialize_matrix(d_B, K, N, __float2half(1.0f));
    initialize_matrix(d_C, M, N, __float2half(0.0f));

    // Initialize cuBLAS
    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    // Set cuBLAS to use Tensor Cores
    CHECK_CUBLAS(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

    // Define scaling factors
    const half alpha = __float2half(1.0f);
    const half beta = __float2half(0.0f);

    // Perform matrix multiplication: C = alpha * A * B + beta * C
    // A is M x K
    // B is K x N
    // C is M x N

    auto start = std::chrono::high_resolution_clock::now();

    cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              N, M, K,
                              &alpha,
                              d_B, CUDA_R_16F, N,
                              d_A, CUDA_R_16F, K,
                              &beta,
                              d_C, CUDA_R_16F, N,
                              CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> duration = end - start;

    handle_cublas_status(status);

    // Print the first element of the result matrix
    print_first_element(d_C);

    // Print the execution time
    std::cout << "Execution time: " << duration.count() << " ms" << std::endl;

    // Clean up
    CHECK_CUDA(cudaFree(d_A));
    CHECK_CUDA(cudaFree(d_B));
    CHECK_CUDA(cudaFree(d_C));
    CHECK_CUBLAS(cublasDestroy(handle));

    std::cout << "Matrix multiplication completed successfully!" << std::endl;
    return 0;
}

I run it with the command below:

ncu --set full --replay-mode application --app-replay-match grid --app-replay-buffer file -f --export output-file-full.nsight-cuprof-report ./cublas_test

But I receive the warning below:

==WARNING== Unable to access the following 6 metrics:   ctc__rx_bytes_data_user.sum, ctc__rx_bytes_data_user.sum.pct_of_peak_sustained_elapsed, ctc__rx_bytes_data_user.sum.per_second, ctc__tx_bytes_data_user.sum, ctc__tx_bytes_data_user.sum.pct_of_peak_sustained_elapsed, ctc__tx_bytes_data_user.sum.per_second.

==PROF== Profiling "sm90_xmma_gemm_f16f16_f16f16_..." - 0: Application replay pass 1

Why??

veraj · July 29, 2024, 8:39am

Hi, @202476410arsmart

The warning is expected, as your environment does not support these metrics.
You can still complete the profiling run successfully, right?

202476410arsmart · July 29, 2024, 8:40am

yes! Just curious. Thanks!

Topic		Replies	Views
NCU on cuda12.5: what's the difference of two lines from L2 to shared? Nsight Compute	2	115	July 30, 2024
Cuda application crashes works fine for small data and crashes for big data CUDA Programming and Performance	3	414	October 12, 2021
Stream capture of cublas gemm CUDA Programming and Performance	8	695	March 31, 2025
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4849	April 7, 2021
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	362	December 8, 2020
cuBLAS handle creation fails CUDA Programming and Performance	1	511	June 13, 2022
Calling cuda with cublas_v2 from fortran CUDA Programming and Performance	4	568	March 2, 2017
Solve AX=B With cuSolver CUDA Programming and Performance	10	5786	August 18, 2015
[BUG][DEVICE LAUNCHED GRAPHS] CudaErrorInvalidValue with cuBLAS combined with the cudaGraphInstantiateFlagsDeviceLaunch starting with CUDA 12.1 CUDA Programming and Performance nvbugs	3	768	February 19, 2025
problem with double precision unpredictable results Different run give differents errors or no error CUDA Programming and Performance	12	2806	September 10, 2010

Warning: ctc__rx_bytes_data_user.sum can't be measured?

Related topics