environment: NVCC 11.8 and NVCC 12.3
hardware: A100-PCIe-40GB
compile: nvcc -o testcublas testcublas.cu
run:
ncu testcublas -M 2048 -K 2048 -N 512 -sp 0.1
problems:
I am using cublasHgemm, and I find that in the testcase m2048k2048n512, the ampere_h16816gemm_128x128_ldg8_stages_32x5_nn will launch [4, 16, 6] thread blocks, while ampere_h16816gemm_128x128_ldg8_stages_32x5_tt will only launch [16, 4, 1] thread blocks. The result C are both correct. But the first kernel is 2 times slower than the second one, why?
source file:
#include<bits/stdc++.h>
#include<cublas_v2.h>
#include <assert.h>
using namespace std;
#define HGEMM_LIKELY(x) __builtin_expect(!!(x), 1)
#define HGEMM_UNLIKELY(x) __builtin_expect(!!(x), 0)
#define HGEMM_CHECK(x) \
do { \
if (HGEMM_UNLIKELY(!(x))) { \
HLOG("Check failed: %s", #x); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define HGEMM_CHECK_EQ(x, y) HGEMM_CHECK((x) == (y))
#define HGEMM_CHECK_NE(x, y) HGEMM_CHECK((x) != (y))
#define HGEMM_CHECK_LE(x, y) HGEMM_CHECK((x) <= (y))
#define HGEMM_CHECK_LT(x, y) HGEMM_CHECK((x) < (y))
#define HGEMM_CHECK_GE(x, y) HGEMM_CHECK((x) >= (y))
#define HGEMM_CHECK_GT(x, y) HGEMM_CHECK((x) > (y))
#define HGEMM_CHECK_CUBLAS_ERROR(_expr_) \
do { \
cublasStatus_t _ret_ = _expr_; \
if (HGEMM_UNLIKELY(_ret_ != CUBLAS_STATUS_SUCCESS)) { \
size_t _rt_version_ = cublasGetCudartVersion(); \
exit(EXIT_FAILURE); \
} \
} while (0)
cublasHandle_t getCublasTensorOpHandle() {
cublasHandle_t handle = nullptr;
HGEMM_CHECK_CUBLAS_ERROR(cublasCreate(&handle));
HGEMM_CHECK_CUBLAS_ERROR(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
return handle;
}
void cublasTensorOp(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
static cublasHandle_t handle = getCublasTensorOpHandle();
static half alpha = 1.0;
static half beta = 0.0;
HGEMM_CHECK_CUBLAS_ERROR(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, B, CUDA_R_16F, K, A,
CUDA_R_16F, K, &beta, C, CUDA_R_16F, N, CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
#define checkCudaErrors(func) \
{ \
cudaError_t e = (func); \
if(e != cudaSuccess) \
printf ("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
}
// void init(float * ptr, size_t length, float sparsity)
// {
// // lock the random seed for
// srand (1);
// for (int i = 0; i < length; i++)
// {
// float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
// //printf("pro: %f\n", pro);
// if (pro < sparsity)
// {
// ptr[i] = 0.0;
// }
// else
// {
// ptr[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
// }
// }
// }
void init(half* ptr, size_t length, float sparsity) {
srand(1);
for (int i = 0; i < length; i++) {
float pro = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
if (pro < sparsity) {
ptr[i] = (half)0.0;
} else {
ptr[i] = (half)(static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
}
}
}
int main(int argc, char** argv) {
int M;
int K;
int N;
float sparsity;
unordered_map<string, string> options;
for (int i = 1; i < argc; i+=2) {
string option = argv[i];
string value = argv[i + 1];
if (option[0] != '-') {
cerr << "Invalid option: " << option << endl;
return 1;
}
option = option.substr(1);
options[option] = value;
}
if (options.find("M") != options.end()) {
std::cout << "Option M: " << options["M"] << endl;
M = stoi(options["M"]);
}
if (options.find("K") != options.end()) {
std::cout << "Option K: " << options["K"] << endl;
K = stoi(options["K"]);
}
if (options.find("N") != options.end()) {
std::cout << "Option N: " << options["N"] << endl;
N = stoi(options["N"]);
}
if (options.find("sp") != options.end()) {
std::cout << "sp: " << options["sp"] << endl;
sparsity = stof(options["sp"]);
}
// scanf("%d%d%d%f", &M, &N, &K, &sparsity);
half* h_A = (half*)malloc(sizeof(half) * M * K);
half* h_B = (half*)malloc(sizeof(half) * K * N);
half* h_C_host = (half*)malloc(sizeof(half) * M * N);
half* h_C_device = (half*)malloc(sizeof(half) * M * N);
init(h_A, M * K, sparsity);
init(h_B, K * N, 0);
half* d_A;
half* d_B;
half* d_C;
checkCudaErrors(cudaMalloc((void**)&d_A, sizeof(half) * M * K));
checkCudaErrors(cudaMalloc((void**)&d_B, sizeof(half) * K * N));
checkCudaErrors(cudaMalloc(&d_C, sizeof(half) * M * N));
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(half) * M * K, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, sizeof(half) * K * N, cudaMemcpyHostToDevice));
const half alpha = (half)1.0;
const half beta = (half)0.0;
cublasHandle_t handle;
cublasCreate(&handle);
{ // case1
cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, d_B, N, &beta, d_C, M);
checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
memset(h_C_host, 0, sizeof(half) * M * N);
printf("start host\n");
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
h_C_host[i + j * M] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i + j * M]));
}
}
}
printf("end host\n");
for (int i = 0; i < M * N; i++) {
float value_host = (float)h_C_host[i];
float value_device = (float)h_C_device[i];
if (abs(value_host - value_device) / value_host > 0.02) {
printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i % M, i / M, value_host, value_device);
// exit(-1);
}
}
}
{ // case2
cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
checkCudaErrors(cudaMemcpy(h_C_device, d_C, sizeof(half) * M * N, cudaMemcpyDeviceToHost));
memset(h_C_host, 0, sizeof(half) * M * N);
printf("start host\n");
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < K; k++) {
h_C_host[i * N + j] = (half)((float)(h_A[i * K + k]) * (float)(h_B[k * N + j]) + (float)(h_C_host[i * N + j]));
}
}
}
printf("end host\n");
for (int i = 0; i < M * N; i++) {
float value_host = (float)h_C_host[i];
float value_device = (float)h_C_device[i];
if (abs(value_host - value_device) / value_host > 0.02) {
printf("(%d, %d) value_host = %.2f, value_device = %.2f\n", i / N, i % N, value_host, value_device);
// exit(-1);
}
}
}
}
