Warning: ctc__rx_bytes_data_user.sum cant be measured?

Hi, I am doing ncu on cublas using H100:

#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <chrono>

#define CHECK_CUDA(call) \
    if((call) != cudaSuccess) { \
        std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(call) << std::endl; \
        exit(1); \
    }

#define CHECK_CUBLAS(call) \
    if((call) != CUBLAS_STATUS_SUCCESS) { \
        std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__ << std::endl; \
        exit(1); \
    }

void initialize_matrix(half *matrix, int rows, int cols, half value) {
    half *host_matrix = new half[rows * cols];
    for (int i = 0; i < rows * cols; ++i) {
        host_matrix[i] = value;
    }
    CHECK_CUDA(cudaMemcpy(matrix, host_matrix, rows * cols * sizeof(half), cudaMemcpyHostToDevice));
    delete[] host_matrix;
}

void print_first_element(half *matrix) {
    half host_value;
    CHECK_CUDA(cudaMemcpy(&host_value, matrix, sizeof(half), cudaMemcpyDeviceToHost));
    std::cout << "First element: " << __half2float(host_value) << std::endl;
}

void handle_cublas_status(cublasStatus_t status) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasGemmEx failed with error code: " << status << std::endl;
        switch(status) {
            case CUBLAS_STATUS_NOT_INITIALIZED:
                std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED" << std::endl;
                break;
            case CUBLAS_STATUS_ALLOC_FAILED:
                std::cerr << "CUBLAS_STATUS_ALLOC_FAILED" << std::endl;
                break;
            case CUBLAS_STATUS_INVALID_VALUE:
                std::cerr << "CUBLAS_STATUS_INVALID_VALUE" << std::endl;
                break;
            case CUBLAS_STATUS_ARCH_MISMATCH:
                std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH" << std::endl;
                break;
            case CUBLAS_STATUS_MAPPING_ERROR:
                std::cerr << "CUBLAS_STATUS_MAPPING_ERROR" << std::endl;
                break;
            case CUBLAS_STATUS_EXECUTION_FAILED:
                std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED" << std::endl;
                break;
            case CUBLAS_STATUS_INTERNAL_ERROR:
                std::cerr << "CUBLAS_STATUS_INTERNAL_ERROR" << std::endl;
                break;
            case CUBLAS_STATUS_NOT_SUPPORTED:
                std::cerr << "CUBLAS_STATUS_NOT_SUPPORTED" << std::endl;
                break;
            case CUBLAS_STATUS_LICENSE_ERROR:
                std::cerr << "CUBLAS_STATUS_LICENSE_ERROR" << std::endl;
                break;
            default:
                std::cerr << "Unknown cublas status" << std::endl;
        }
        exit(1);
    }
}

int main() {
    const int M = 8192;
    const int N = 20480;
    const int K = 5120;

    // Allocate device memory
    half *d_A, *d_B, *d_C;
    CHECK_CUDA(cudaMalloc((void**)&d_A, M * K * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_B, K * N * sizeof(half)));
    CHECK_CUDA(cudaMalloc((void**)&d_C, M * N * sizeof(half)));

    // Initialize matrices
    initialize_matrix(d_A, M, K, __float2half(1.0f));
    initialize_matrix(d_B, K, N, __float2half(1.0f));
    initialize_matrix(d_C, M, N, __float2half(0.0f));

    // Initialize cuBLAS
    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    // Set cuBLAS to use Tensor Cores
    CHECK_CUBLAS(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

    // Define scaling factors
    const half alpha = __float2half(1.0f);
    const half beta = __float2half(0.0f);

    // Perform matrix multiplication: C = alpha * A * B + beta * C
    // A is M x K
    // B is K x N
    // C is M x N

    auto start = std::chrono::high_resolution_clock::now();

    cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              N, M, K,
                              &alpha,
                              d_B, CUDA_R_16F, N,
                              d_A, CUDA_R_16F, K,
                              &beta,
                              d_C, CUDA_R_16F, N,
                              CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> duration = end - start;

    handle_cublas_status(status);

    // Print the first element of the result matrix
    print_first_element(d_C);

    // Print the execution time
    std::cout << "Execution time: " << duration.count() << " ms" << std::endl;

    // Clean up
    CHECK_CUDA(cudaFree(d_A));
    CHECK_CUDA(cudaFree(d_B));
    CHECK_CUDA(cudaFree(d_C));
    CHECK_CUBLAS(cublasDestroy(handle));

    std::cout << "Matrix multiplication completed successfully!" << std::endl;
    return 0;
}

And use below command:

ncu --set full --replay-mode application --app-replay-match grid --app-replay-buffer file -f --export output-file-full.nsight-cuprof-report ./cublas_test

But receive below warning:

==WARNING== Unable to access the following 6 metrics:   ctc__rx_bytes_data_user.sum, ctc__rx_bytes_data_user.sum.pct_of_peak_sustained_elapsed, ctc__rx_bytes_data_user.sum.per_second, ctc__tx_bytes_data_user.sum, ctc__tx_bytes_data_user.sum.pct_of_peak_sustained_elapsed, ctc__tx_bytes_data_user.sum.per_second.

==PROF== Profiling "sm90_xmma_gemm_f16f16_f16f16_..." - 0: Application replay pass 1

Why??

Hi, @202476410arsmart

The warning is expected as your env do not support these metrics.
You can still profile to end successfully, right ?

1 Like

yes! Just curious. Thanks!