You can try this:
# cat t340.cu
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cassert>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
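// timing helper: returns microseconds since the epoch, minus the optional start value (i.e. elapsed time)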
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
using T_ELEM_IN = half;
const int m = 8192;
const int n = 8192;
const int k = 8192;
const int rowsA = m;
const int colsA = k;
const int rowsB = k;
const int colsB = n;
const int rowsC = m;
const int colsC = n;
int main(){
half val1 = __float2half(1.0f);
half val0 = __float2half(0.f);
cublasHandle_t handle;
// First, create a cuBLAS handle:
cublasStatus_t cublasStat = cublasCreate(&handle);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
// Set the math mode to allow cuBLAS to use Tensor Cores: NO LONGER NECESSARY
//cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
// Allocate and initialize your matrices (only the A matrix is shown):
size_t matrixSizeA = (size_t)rowsA * colsA;
T_ELEM_IN *devPtrA = 0;
cudaMalloc((void**)&devPtrA, matrixSizeA * sizeof(devPtrA[0]));
T_ELEM_IN *A = (T_ELEM_IN *)malloc(matrixSizeA * sizeof(A[0]));
for (size_t i = 0; i < matrixSizeA; i++) A[i] = val1;
cublasStat = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA, rowsA);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
// ... allocate and initialize B and C matrices ...
size_t matrixSizeB = (size_t)rowsB * colsB;
T_ELEM_IN *devPtrB = 0;
cudaMalloc((void**)&devPtrB, matrixSizeB * sizeof(devPtrB[0]));
T_ELEM_IN *B = (T_ELEM_IN *)malloc(matrixSizeB * sizeof(B[0]));
for (size_t i = 0; i < matrixSizeB; i++) B[i] = val1;
cublasStat = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB, rowsB);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
size_t matrixSizeC = (size_t)rowsC * colsC;
T_ELEM_IN *devPtrC = 0;
cudaMalloc((void**)&devPtrC, matrixSizeC * sizeof(devPtrC[0]));
T_ELEM_IN *C = (T_ELEM_IN *)malloc(matrixSizeC * sizeof(C[0]));
for (size_t i = 0; i < matrixSizeC; i++) C[i] = val0;
cublasStat = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC, rowsC);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
float alpha = 1.0f;
float beta = 0.f;
int lda = m;
int ldb = k;
int ldc = m;
// Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8,
// and m is a multiple of 4:
unsigned long long dt = dtime_usec(0);
cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
devPtrA, CUDA_R_16F, lda,
devPtrB, CUDA_R_16F, ldb,
&beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaDeviceSynchronize();
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
cudaError_t err = cudaGetLastError();
assert(err == cudaSuccess);
dt = dtime_usec(dt);
cudaMemcpy(C, devPtrC, sizeof(C[0]) * matrixSizeC, cudaMemcpyDeviceToHost);
std::cout << "C[0]: " << __half2float(C[0]) << std::endl;
std::cout << "duration: " << dt << "us" << std::endl;
std::cout << "flops/s: " << ((unsigned long long)m)*n*k*2/(float)dt << "MF/s" << std::endl;
}
# nvcc -o t340 t340.cu -lcublas
# ./t340
C[0]: 8192
duration: 93216us
flops/s: 1.17953e+07MF/s
#
(CUDA 12.2, L4 GPU)
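The reported rate is just 2*m*n*k flop divided by the measured time: 2*8192^3 / 93216us is about 1.18e7 MF/s, i.e. roughly 11.8 TFLOP/s.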
More CUDA library sample codes are available here.
Note that FP16 has fairly limited range. Doing something seemingly innocuous like changing this:
half val1 = __float2half(1.0f);
to this:
half val1 = __float2half(4.0f);
will give a possibly unexpected result: each output element would be 4*4*8192 = 131072, which exceeds the largest representable FP16 value (about 65504), so the half result stored in C overflows to infinity.
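You can verify that on the host with a small standalone sketch (my own illustration, compiled with nvcc):
#include <cuda_fp16.h>
#include <iostream>
int main(){
  float acc = 4.0f * 4.0f * 8192.0f;         // 131072, fine as a float
  half h = __float2half(acc);                // largest half is ~65504, so this overflows
  std::cout << __half2float(h) << std::endl; // prints inf
}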
Also remember cuBLAS assumes column-major storage. This doesn’t matter for this square case where all the values are the same.
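If your data happened to be row-major and you wanted C = A*B in row-major order, one common approach (a sketch using the same pointers and dimensions as above) is to swap the operands: the column-major view of row-major A, B, C is A^T, B^T, C^T, so computing C^T = B^T * A^T leaves the product laid out row-major:
// note the swapped operands, swapped m/n, and leading dimensions taken from the row-major row lengths
cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                          devPtrB, CUDA_R_16F, n,
                          devPtrA, CUDA_R_16F, k,
                          &beta, devPtrC, CUDA_R_16F, n, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);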
If you want to see something closer to full tensor core throughput, you may need to make the matrix dimensions larger, up to perhaps 32768.
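Also be aware that the measurement above includes the one-time cost of the first cuBLAS GEMM call (library/kernel initialization). If you want a steady-state number, a common approach (a sketch reusing the variables from the code above) is to do a warm-up call first and then average over several iterations:
// warm-up: the first call pays the one-time initialization cost
cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
             devPtrA, CUDA_R_16F, lda, devPtrB, CUDA_R_16F, ldb,
             &beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaDeviceSynchronize();
const int iters = 10;
unsigned long long dt2 = dtime_usec(0);
for (int i = 0; i < iters; i++)
  cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
               devPtrA, CUDA_R_16F, lda, devPtrB, CUDA_R_16F, ldb,
               &beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaDeviceSynchronize();
dt2 = dtime_usec(dt2);
std::cout << "avg flops/s: " << ((unsigned long long)m)*n*k*2*iters/(float)dt2 << "MF/s" << std::endl;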