cuBLAS launches 5 times more thread blocks than expected

environment: NVCC 11.8 and NVCC 12.3

hardware: A100-PCIe-40GB

compile: nvcc -o testcublas testcublas.cu

run:

ncu testcublas -M 2048 -K 2048 -N 512 -sp 0.1
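
The grid dimensions quoted below can be read from ncu's launch statistics. A narrower invocation that profiles only the GEMM kernels would be something like this (assuming a recent Nsight Compute; the section and kernel-filter option syntax may differ between versions):

ncu --section LaunchStats --kernel-name regex:ampere_h16816gemm testcublas -M 2048 -K 2048 -N 512 -sp 0.1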

problems:

I am using cublasHgemm, and I find that for the test case M=2048, K=2048, N=512, the kernel ampere_h16816gemm_128x128_ldg8_stages_32x5_nn launches a [4, 16, 6] grid of thread blocks, while ampere_h16816gemm_128x128_ldg8_stages_32x5_tt launches only a [16, 4, 1] grid. The result C is correct in both cases, but the first kernel is about 2 times slower than the second one. Why?
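
For reference, this is how I derive the "expected" block count from the tile size in the kernel name. A minimal sketch: the one-block-per-128x128-output-tile mapping is only an assumption taken from the kernel name (cuBLAS does not document its kernel internals), and the split-K remark in the comments is just a guess from the launch shape.

#include <cstdio>

int main() {
    const int M = 2048, N = 512, TILE = 128;
    // Assume one thread block per 128x128 tile of the output matrix C.
    int tiles_m = (M + TILE - 1) / TILE;  // 16
    int tiles_n = (N + TILE - 1) / TILE;  // 4
    // 16 x 4 = 64 blocks, which matches the tt launch [16, 4, 1].
    printf("expected blocks: %d x %d = %d\n", tiles_m, tiles_n, tiles_m * tiles_n);
    // The nn kernel launches [4, 16, 6] = 384 blocks, i.e. 6x as many.
    // gridDim.z = 6 would be consistent with splitting the K loop across
    // blocks (split-K), but that is only a guess from the launch shape.
    printf("observed nn blocks: 4 x 16 x 6 = %d (%dx)\n", 4 * 16 * 6, (4 * 16 * 6) / (tiles_m * tiles_n));
    return 0;
}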

source file:

#include<bits/stdc++.h>
#include<cublas_v2.h>
#include <assert.h>


using namespace std;

#define HGEMM_LIKELY(x) __builtin_expect(!!(x), 1)
#define HGEMM_UNLIKELY(x) __builtin_expect(!!(x), 0)

// Minimal logging helper used by the check macros below.
#define HLOG(format, ...) fprintf(stderr, format "\n", ##__VA_ARGS__)

#define HGEMM_CHECK(x)                    \
    do {                                  \
        if (HGEMM_UNLIKELY(!(x))) {       \
            HLOG("Check failed: %s", #x); \
            exit(EXIT_FAILURE);           \
        }                                 \
    } while (0)

#define HGEMM_CHECK_EQ(x, y) HGEMM_CHECK((x) == (y))
#define HGEMM_CHECK_NE(x, y) HGEMM_CHECK((x) != (y))
#define HGEMM_CHECK_LE(x, y) HGEMM_CHECK((x) <= (y))
#define HGEMM_CHECK_LT(x, y) HGEMM_CHECK((x) < (y))
#define HGEMM_CHECK_GE(x, y) HGEMM_CHECK((x) >= (y))
#define HGEMM_CHECK_GT(x, y) HGEMM_CHECK((x) > (y))


#define HGEMM_CHECK_CUBLAS_ERROR(_expr_)                                                                   \
    do {                                                                                                   \
        cublasStatus_t _ret_ = _expr_;                                                                     \
        if (HGEMM_UNLIKELY(_ret_ != CUBLAS_STATUS_SUCCESS)) {                                              \
            HLOG("cuBLAS error %d (CUDA runtime %zu): %s", (int)_ret_, cublasGetCudartVersion(), #_expr_); \
            exit(EXIT_FAILURE);                                                                            \
        }                                                                                                  \
    } while (0)


cublasHandle_t getCublasTensorOpHandle() {
    cublasHandle_t handle = nullptr;
    HGEMM_CHECK_CUBLAS_ERROR(cublasCreate(&handle));
    HGEMM_CHECK_CUBLAS_ERROR(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

    return handle;
}

void cublasTensorOp(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    static cublasHandle_t handle = getCublasTensorOpHandle();
    static half alpha = 1.0;
    static half beta = 0.0;

    HGEMM_CHECK_CUBLAS_ERROR(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, B, CUDA_R_16F, K, A,
                                          CUDA_R_16F, K, &beta, C, CUDA_R_16F, N, CUBLAS_COMPUTE_16F,
                                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

#define checkCudaErrors(func)                                                      \
    {                                                                              \
        cudaError_t e = (func);                                                    \
        if (e != cudaSuccess)                                                      \
            printf("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
    }

// void init(float * ptr, size_t length, float sparsity)
// {
//     // lock the random seed for
//     srand (1);
//     for (int i = 0; i < length; i++)
//     {
//         float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
//         //printf("pro: %f\n", pro);
//         if (pro < sparsity)
//         {
//             ptr[i] = 0.0;
//         }
//         else
//         {
//             ptr[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
//         }
//     }
// }

void init(half* ptr, size_t length, float sparsity) {
    srand(1);
    for (int i = 0; i < length; i++) {
        float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        if (pro < sparsity) {
            ptr[i] = (half)0.0;
        } else {
            ptr[i] = (half)(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
        }
    }
}

int main(int argc, char** argv) {
    int M = 0;
    int K = 0;
    int N = 0;
    float sparsity = 0.0f;
    unordered_map<string, string> options;
    for (int i = 1; i < argc; i+=2) {
        string option = argv[i];
        string value = argv[i + 1];
        if (option[0] != '-') {
            cerr << "Invalid option: " << option << endl;
            return 1;
        }
        option = option.substr(1);
        options[option] = value;
    }
    if (options.find("M") != options.end()) {
        std::cout << "Option M: " << options["M"] << endl;
        M = stoi(options["M"]);
    }
    if (options.find("K") != options.end()) {
        std::cout << "Option K: " << options["K"] << endl;
        K = stoi(options["K"]);
    }
    if (options.find("N") != options.end()) {
        std::cout << "Option N: " << options["N"] << endl;
        N = stoi(options["N"]);
    }
    if (options.find("sp") != options.end()) {
        std::cout << "sp: " << options["sp"] << endl;
        sparsity = stof(options["sp"]);
    }
    // scanf("%d%d%d%f", &M, &N, &K, &sparsity);
    half* h_A = (half*)malloc(sizeof(half) * M * K);
    half* h_B = (half*)malloc(sizeof(half) * K * N);
    half* h_C_host = (half*)malloc(sizeof(half) * M * N);
    half* h_C_device = (half*)malloc(sizeof(half) * M * N);

    init(h_A, M * K, sparsity);
    init(h_B, K * N, 0);

    half* d_A;
    half* d_B;
    half* d_C;
    checkCudaErrors(cudaMalloc((void**)&d_A, sizeof(half) * M * K));
    checkCudaErrors(cudaMalloc((void**)&d_B, sizeof(half) * K * N));
    checkCudaErrors(cudaMalloc(&d_C, sizeof(half) * M * N));
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(half) * M * K, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(half) * K * N, cudaMemcpyHostToDevice));
    
    
    const half alpha = (half)1.0;
    const half beta = (half)0.0;
    cublasHandle_t handle;
    cublasCreate(&handle);
    {   // case1
        cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[i + j * M] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i + j * M]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
                // exit(-1);
            }
        }
    }
    {   // case2
        cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[i * N + j] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i * N + j]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
                // exit(-1);
            }
        }
    }
}

I tested nn, nt, tn, and tt, and also swapped which of A and B is passed as the first (lhs) matrix argument of the cuBLAS call. Here is the report:

“6x” means that cuBLAS launches 6 times more thread blocks than expected.

kernel | A = lhs, B = rhs | A = rhs, B = lhs
nn     | 1x               | 6x
nt     | 1x               | 3x
tn     | 1x               | 1x
tt     | 1x               | 1x

source:

#define OFFSET_ROW(i, j, lda) ((i) * (lda) + (j))
#define OFFSET_COL(i, j, lda) ((i) + (j) * (lda))

int main(int argc, char** argv) {
    int M = 0;
    int K = 0;
    int N = 0;
    float sparsity = 0.0f;
    unordered_map<string, string> options;
    for (int i = 1; i < argc; i+=2) {
        string option = argv[i];
        string value = argv[i + 1];
        if (option[0] != '-') {
            cerr << "Invalid option: " << option << endl;
            return 1;
        }
        option = option.substr(1);
        options[option] = value;
    }
    if (options.find("M") != options.end()) {
        std::cout << "Option M: " << options["M"] << endl;
        M = stoi(options["M"]);
    }
    if (options.find("K") != options.end()) {
        std::cout << "Option K: " << options["K"] << endl;
        K = stoi(options["K"]);
    }
    if (options.find("N") != options.end()) {
        std::cout << "Option N: " << options["N"] << endl;
        N = stoi(options["N"]);
    }
    if (options.find("sp") != options.end()) {
        std::cout << "sp: " << options["sp"] << endl;
        sparsity = stof(options["sp"]);
    }
    // scanf("%d%d%d%f", &M, &N, &K, &sparsity);
    half* h_A = (half*)malloc(sizeof(half) * M * K);
    half* h_B = (half*)malloc(sizeof(half) * K * N);
    half* h_C_host = (half*)malloc(sizeof(half) * M * N);
    half* h_C_device = (half*)malloc(sizeof(half) * M * N);

    init(h_A, M * K, sparsity);
    init(h_B, K * N, 0);

    half* d_A;
    half* d_B;
    half* d_C;
    checkCudaErrors(cudaMalloc((void**)&d_A, sizeof(half) * M * K));
    checkCudaErrors(cudaMalloc((void**)&d_B, sizeof(half) * K * N));
    checkCudaErrors(cudaMalloc(&d_C, sizeof(half) * M * N));
    checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(half) * M * K, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(half) * K * N, cudaMemcpyHostToDevice));
    
    
    const half alpha = (half)1.0;
    const half beta = (half)0.0;
    cublasHandle_t handle;
    cublasCreate(&handle);
    {//! case1 tt
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[i + j * M] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i + j * M]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case2 nn
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[i * N + j] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i * N + j]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case3 nt
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, M, N, K, &alpha, d_A, M, d_B, N, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_COL(i, j, M)] = (half)((float)(h_A[OFFSET_COL(i, k, M)]) * (float)(h_B[OFFSET_ROW(k, j, N)]) + (float)(h_C_host[OFFSET_COL(i, j, M)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case4 nt
        // cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, N, M, K, &alpha, d_B, N, d_A, M, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = 
        //                 (half)((float)(h_A[OFFSET_COL(i, k, M)]) * 
        //                 (float)(h_B[OFFSET_ROW(k, j, N)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case5 tt
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, M, K, &alpha, d_B, K, d_A, M, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = (half)(
        //                 (float)(h_A[OFFSET_COL(i, k, M)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case6 tn
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, d_B, K, d_A, K, &beta, d_C, N);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_ROW(i, j, N)] = (half)(
        //                 (float)(h_A[OFFSET_ROW(i, k, K)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_ROW(i, j, N)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case7 tn
        // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, M, N, K, &alpha, d_A, K, d_B, K, &beta, d_C, M);
        // checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        // memset(h_C_host, 0, sizeof(half) * M * N);
        // printf("start host\n");
        // for (int i = 0; i < M; i++) {
        //     for (int j = 0; j < N; j++) {
        //         for (int k = 0; k < K; k++) {
        //             h_C_host[OFFSET_COL(i, j, M)] = (half)(
        //                 (float)(h_A[OFFSET_ROW(i, k, K)]) * 
        //                 (float)(h_B[OFFSET_COL(k, j, K)]) + 
        //                 (float)(h_C_host[OFFSET_COL(i, j, M)]));
        //         }
        //     }
        // }
        // printf("end host\n");
        // for (int i = 0; i < M * N; i++) {
        //     float value_host = (float)h_C_host[i];
        //     float value_device = (float)h_C_device[i];
        //     if (abs(value_host - value_device) / value_host > 0.02) {
        //         printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
        //         // exit(-1);
        //     }
        // }
    }
    {//! case8 nn
        cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, d_A, M, d_B, K, &beta, d_C, M);
        checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
        memset(h_C_host, 0, sizeof(half) * M * N);
        printf("start host\n");
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                for (int k = 0; k < K; k++) {
                    h_C_host[OFFSET_COL(i, j, M)] = (half)(
                        (float)(h_A[OFFSET_COL(i, k, M)]) * 
                        (float)(h_B[OFFSET_COL(k, j, K)]) + 
                        (float)(h_C_host[OFFSET_COL(i, j, M)]));
                }
            }
        }
        printf("end host\n");
        for (int i = 0; i < M * N; i++) {
            float value_host = (float)h_C_host[i];
            float value_device = (float)h_C_device[i];
            if (abs(value_host - value_device) / value_host > 0.02) {
                printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
                // exit(-1);
            }
        }
    }
}

Specifying a transpose vs. no-transpose can certainly have an effect on performance, and there is no reason to assume that the implementation will otherwise be the same. There could be significant differences in kernel design between the transpose and non-transpose cases, and those differences could include a different number of threads/blocks to handle the case.
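
If you want to quantify the difference, you can time both formulations with CUDA events rather than relying on a single profiled launch. A minimal sketch: it reuses handle, alpha, beta, d_A, d_B, d_C, M, N, K and the checkCudaErrors macro from your program, and the helper name timeGemm plus the iteration count are arbitrary choices of mine.

#include <cuda_runtime.h>

// Times `iters` launches of fn() and returns milliseconds per launch.
template <typename F>
float timeGemm(F fn, int iters = 100) {
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    fn();  // warm-up launch so kernel selection is not timed
    checkCudaErrors(cudaEventRecord(start));
    for (int i = 0; i < iters; i++) fn();
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;
}

// Usage inside main(), after the buffers are initialized:
// float ms_nn = timeGemm([&] {
//     cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
// });
// float ms_tt = timeGemm([&] {
//     cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
// });
// printf("nn: %.3f ms, tt: %.3f ms\n", ms_nn, ms_tt);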

OK, so the kernel's thread-block count is correct as launched. I understand now; thanks for the explanation.
