cuBLAS launches 5 times more thread blocks than expected

environment: NVCC 11.8 and NVCC 12.3

hardware: A100-PCIe-40GB

compile: nvcc -o testcublas testcublas.cu

run:

ncu testcublas -M 2048 -K 2048 -N 512 -sp 0.1
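
The grid dimensions quoted below can be read from ncu's launch statistics. A narrower invocation that profiles only the GEMM kernels would be something like this (assuming a recent Nsight Compute; the section and kernel-filter option syntax may differ between versions):

ncu --section LaunchStats --kernel-name regex:ampere_h16816gemm testcublas -M 2048 -K 2048 -N 512 -sp 0.1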

problems:

I am using cublasHgemm, and I find that for the test case M=2048, K=2048, N=512, the kernel ampere_h16816gemm_128x128_ldg8_stages_32x5_nn launches a [4, 16, 6] grid of thread blocks, while ampere_h16816gemm_128x128_ldg8_stages_32x5_tt launches only a [16, 4, 1] grid. The result C is correct in both cases, but the first kernel is about 2 times slower than the second one. Why?
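
For reference, this is how I derive the "expected" block count from the tile size in the kernel name. A minimal sketch: the one-block-per-128x128-output-tile mapping is only an assumption taken from the kernel name (cuBLAS does not document its kernel internals), and the split-K remark in the comments is just a guess from the launch shape.

#include <cstdio>

int main() {
    const int M = 2048, N = 512, TILE = 128;
    // Assume one thread block per 128x128 tile of the output matrix C.
    int tiles_m = (M + TILE - 1) / TILE;  // 16
    int tiles_n = (N + TILE - 1) / TILE;  // 4
    // 16 x 4 = 64 blocks, which matches the tt launch [16, 4, 1].
    printf("expected blocks: %d x %d = %d\n", tiles_m, tiles_n, tiles_m * tiles_n);
    // The nn kernel launches [4, 16, 6] = 384 blocks, i.e. 6x as many.
    // gridDim.z = 6 would be consistent with splitting the K loop across
    // blocks (split-K), but that is only a guess from the launch shape.
    printf("observed nn blocks: 4 x 16 x 6 = %d (%dx)\n", 4 * 16 * 6, (4 * 16 * 6) / (tiles_m * tiles_n));
    return 0;
}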

source file:

#include<bits/stdc++.h>
#include<cublas_v2.h>
#include <assert.h>


using namespace std;

#define HGEMM_LIKELY(x) __builtin_expect(!!(x), 1)
#define HGEMM_UNLIKELY(x) __builtin_expect(!!(x), 0)

// Minimal logging helper used by the check macros below.
#define HLOG(format, ...) fprintf(stderr, format "\n", ##__VA_ARGS__)

#define HGEMM_CHECK(x)                    \
    do {                                  \
        if (HGEMM_UNLIKELY(!(x))) {       \
            HLOG("Check failed: %s", #x); \
            exit(EXIT_FAILURE);           \
        }                                 \
    } while (0)

#define HGEMM_CHECK_EQ(x, y) HGEMM_CHECK((x) == (y))
#define HGEMM_CHECK_NE(x, y) HGEMM_CHECK((x) != (y))
#define HGEMM_CHECK_LE(x, y) HGEMM_CHECK((x) <= (y))
#define HGEMM_CHECK_LT(x, y) HGEMM_CHECK((x) < (y))
#define HGEMM_CHECK_GE(x, y) HGEMM_CHECK((x) >= (y))
#define HGEMM_CHECK_GT(x, y) HGEMM_CHECK((x) > (y))


#define HGEMM_CHECK_CUBLAS_ERROR(_expr_)                                                                   \
    do {                                                                                                   \
        cublasStatus_t _ret_ = _expr_;                                                                     \
        if (HGEMM_UNLIKELY(_ret_ != CUBLAS_STATUS_SUCCESS)) {                                              \
            HLOG("cuBLAS error %d (CUDA runtime %zu): %s", (int)_ret_, cublasGetCudartVersion(), #_expr_); \
            exit(EXIT_FAILURE);                                                                            \
        }                                                                                                  \
    } while (0)


cublasHandle_t getCublasTensorOpHandle() {
    cublasHandle_t handle = nullptr;
    HGEMM_CHECK_CUBLAS_ERROR(cublasCreate(&handle));
    HGEMM_CHECK_CUBLAS_ERROR(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

    return handle;
}

void cublasTensorOp(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    static cublasHandle_t handle = getCublasTensorOpHandle();
    static half alpha = 1.0;
    static half beta = 0.0;

    HGEMM_CHECK_CUBLAS_ERROR(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, B, CUDA_R_16F, K, A,
                                          CUDA_R_16F, K, &beta, C, CUDA_R_16F, N, CUBLAS_COMPUTE_16F,
                                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

#define checkCudaErrors(func)                                                      \
    {                                                                              \
        cudaError_t e = (func);                                                    \
        if (e != cudaSuccess)                                                      \
            printf("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
    }

// void init(float * ptr, size_t length, float sparsity)
// {
//     // lock the random seed for
//     srand (1);
//     for (int i = 0; i < length; i++)
//     {
//         float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
//         //printf("pro: %f\n", pro);
//         if (pro < sparsity)
//         {
//             ptr[i] = 0.0;
//         }
//         else
//         {
//             ptr[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
//         }
//     }
// }

void init(half* ptr, size_t length, float sparsity) {
    srand(1);
    for (int i = 0; i < length; i++) {
        float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        if (pro < sparsity) {
            ptr[i] = (half)0.0;
        } else {
            ptr[i] = (half)(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
        }
    }
}

int main(int argc, char** argv) {
    int M = 0;
    int K = 0;
    int N = 0;
    float sparsity = 0.0f;
    unordered_map<string, string> options;
    for (int i = 1; i < argc; i+=2) {
        string option = argv[i];
        string value = argv[i + 1];
        if (option[0] != '-') {
            cerr << "Invalid option: " << option << endl;
            return 1;
        }
        option = option.substr(1);
        options[option] = value;
    }
    if (options.find("M") != options.end()) {
        std::cout << "Option M: " << options["M"] << endl;
        M = stoi(options["M"]);
    }
    if (options.find("K") != options.end()) {
        std::cout << "Option K: " << options["K"] << endl;
        K = stoi(options["K"]);
    }
    if (options.find("N") != options.end()) {
        std::cout << "Option N: " << options["N"] << endl;
        N = stoi(options["N"]);
    }
    if (options.find("sp") != options.end()) {
        std::cout << "sp: " << options["sp"] << endl;
        sparsity = stof(options["sp"]);
    }
    // scanf("%d%d%d%f", &M, &N, &K, &sparsity);
    half* h_A = (half*)malloc(sizeof(half) * M * K);
    half* h_B = (half*)malloc(sizeof(half) * K * N);
    half* h_C_host = (half*)malloc(sizeof(half) * M * N);
    half* h_C_device = (half*)malloc(sizeof(half) * M * N);

    init(h_A, M * K, sparsity);
    init(h_B, K * N, 0);

    half* d_A;
    half* d_B;
    half* d_C;
    checkCudaErrors(cudaMalloc((void**)&d_A, sizeof(half) * M * K));
    checkCudaErrors(cudaMalloc((void**)&d_B, sizeof(half) * K * N));
    checkCudaErrors(cudaMalloc(&d_C, sizeof(half) * M * N));
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(half) * M * K, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(half) * K * N, cudaMemcpyHostToDevice));
    
    
    const half alpha = (half)1.0;
    const half beta = (half)0.0;
    cublasHandle_t handle;
    cublasCreate(&handle);
    {   // case1
        cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[i + j * M] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i + j * M]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
                // exit(-1);
            }
        }
    }
    {   // case2
        cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[i * N + j] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i * N + j]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
                // exit(-1);
            }
        }
    }
}

I tested nn, nt, tn, and tt, and also swapped which of A and B is passed as the first (lhs) matrix argument of the cuBLAS call. Here is the report:

“6x” means that cuBLAS launches 6 times more thread blocks than expected.

kernel | A = lhs, B = rhs | A = rhs, B = lhs
nn     | 1x               | 6x
nt     | 1x               | 3x
tn     | 1x               | 1x
tt     | 1x               | 1x

source:

#define OFFSET_ROW(i, j, lda) ((i) * (lda) + (j))
#define OFFSET_COL(i, j, lda) ((i) + (j) * (lda))

int main(int argc, char** argv) {
    int M = 0;
    int K = 0;
    int N = 0;
    float sparsity = 0.0f;
    unordered_map<string, string> options;
    for (int i = 1; i < argc; i+=2) {
        string option = argv[i];
        string value = argv[i + 1];
        if (option[0] != '-') {
            cerr << "Invalid option: " << option << endl;
            return 1;
        }
        option = option.substr(1);
        options[option] = value;
    }
    if (options.find("M") != options.end()) {
        std::cout << "Option M: " << options["M"] << endl;
        M = stoi(options["M"]);
    }
    if (options.find("K") != options.end()) {
        std::cout << "Option K: " << options["K"] << endl;
        K = stoi(options["K"]);
    }
    if (options.find("N") != options.end()) {
        std::cout << "Option N: " << options["N"] << endl;
        N = stoi(options["N"]);
    }
    if (options.find("sp") != options.end()) {
        std::cout << "sp: " << options["sp"] << endl;
        sparsity = stof(options["sp"]);
    }
    // scanf("%d%d%d%f", &M, &N, &K, &sparsity);
    half* h_A = (half*)malloc(sizeof(half) * M * K);
    half* h_B = (half*)malloc(sizeof(half) * K * N);
    half* h_C_host = (half*)malloc(sizeof(half) * M * N);
    half* h_C_device = (half*)malloc(sizeof(half) * M * N);

    init(h_A, M * K, sparsity);
    init(h_B, K * N, 0);

    half* d_A;
    half* d_B;
    half* d_C;
    checkCudaErrors(cudaMalloc((void**)&d_A, sizeof(half) * M * K));
    checkCudaErrors(cudaMalloc((void**)&d_B, sizeof(half) * K * N));
    checkCudaErrors(cudaMalloc(&d_C, sizeof(half) * M * N));
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(half) * M * K, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(half) * K * N, cudaMemcpyHostToDevice));
    
    
    const half alpha = (half)1.0;
    const half beta = (half)0.0;
    cublasHandle_t handle;
    cublasCreate(&handle);
    {//! case1 tt
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[i + j * M] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i + j * M]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case2 nn
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[i * N + j] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i * N + j]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case3 nt
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, M, N, K, &alpha, d_A, M, d_B, N, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_COL(i, j, M)] = (half)((float)(h_A[OFFSET_COL(i, k, M)]) * (float)(h_B[OFFSET_ROW(k, j, N)]) + (float)(h_C_host[OFFSET_COL(i, j, M)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case4 nt
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, N, M, K, &alpha, d_B, N, d_A, M, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = 
        //                 (half)((float)(h_A[OFFSET_COL(i, k, M)]) * 
        //                 (float)(h_B[OFFSET_ROW(k, j, N)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case5 tt
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, M, K, &alpha, d_B, K, d_A, M, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = (half)(
        //                 (float)(h_A[OFFSET_COL(i, k, M)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case6 tn
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, d_B, K, d_A, K, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = (half)(
        //                 (float)(h_A[OFFSET_ROW(i, k, K)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case7 tn
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, M, N, K, &alpha, d_A, K, d_B, K, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_COL(i, j, M)] = (half)(
        //                 (float)(h_A[OFFSET_ROW(i, k, K)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_COL(i, j, M)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case8 nn
        cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, d_A, M, d_B, K, &beta, d_C, M);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[OFFSET_COL(i, j, M)] = (half)(
                        (float)(h_A[OFFSET_COL(i, k, M)]) * 
                        (float)(h_B[OFFSET_COL(k, j, K)]) + 
                        (float)(h_C_host[OFFSET_COL(i, j, M)]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
                // exit(-1);
            }
        }
    }
}

Specifying a transpose vs. no-transpose can certainly have an effect on performance, and there is no reason to assume that the implementation will otherwise be the same. There could be significant differences in kernel design between the transpose and non-transpose cases, and those differences could include a different number of threads/blocks to handle the case.
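
If you want to quantify the difference, you can time both formulations with CUDA events rather than relying on a single profiled launch. A minimal sketch: it reuses handle, alpha, beta, d_A, d_B, d_C, M, N, K and the checkCudaErrors macro from your program, and the helper name timeGemm plus the iteration count are arbitrary choices of mine.

#include <cuda_runtime.h>

// Times `iters` launches of fn() and returns milliseconds per launch.
template <typename F>
float timeGemm(F fn, int iters = 100) {
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    fn();  // warm-up launch so kernel selection is not timed
    checkCudaErrors(cudaEventRecord(start));
    for (int i = 0; i < iters; i++) fn();
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;
}

// Usage inside main(), after the buffers are initialized:
// float ms_nn = timeGemm([&] {
//     cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
// });
// float ms_tt = timeGemm([&] {
//     cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
// });
// printf("nn: %.3f ms, tt: %.3f ms\n", ms_nn, ms_tt);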

OK, so the kernel's thread-block count is correct as launched. I understand now; thanks for the explanation.
