Tensor core boilerplate with cuBLAS, cannot compile

This is very old documentation, but I have tried to use the cuBLAS version of the tensor core example. Because many of the lines are missing from the example snippet, I cannot put together a complete file that compiles and runs. Can someone assist me with this?
Reading further down, it seems a git repo with code samples is available, but it appears to cover only the cuDNN version and a manual implementation in raw CUDA code:

Thx

You can try this:

# cat t340.cu
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cassert>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// simple host timer: returns microseconds since the epoch, minus an optional start value
unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

using T_ELEM_IN = half;
const int m = 8192;
const int n = 8192;
const int k = 8192;
const int rowsA = m;
const int colsA = k;
const int rowsB = k;
const int colsB = n;
const int rowsC = m;
const int colsC = n;

int main(){
  half val1 = __float2half(1.0f);
  half val0 = __float2half(0.f);
  cublasHandle_t handle;
// First, create a cuBLAS handle:
  cublasStatus_t cublasStat = cublasCreate(&handle);
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);
// Set the math mode to allow cuBLAS to use Tensor Cores: NO LONGER NECESSARY
  //cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
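// (Since cuBLAS 11, FP16 GEMMs use tensor cores automatically when the data
//  and compute types allow it, so the explicit math-mode setting is deprecated.)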

// Allocate and initialize your matrices (only the A matrix is shown):
  size_t matrixSizeA = (size_t)rowsA * colsA;
  T_ELEM_IN *devPtrA = 0;

  cudaMalloc((void**)&devPtrA, matrixSizeA * sizeof(devPtrA[0]));
  T_ELEM_IN *A  = (T_ELEM_IN *)malloc(matrixSizeA * sizeof(A[0]));

  for (int i = 0; i < matrixSizeA; i++) A[i] = val1;
  cublasStat = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA, rowsA);
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);

// ... allocate and initialize B and C matrices  ...
  size_t matrixSizeB = (size_t)rowsB * colsB;
  T_ELEM_IN *devPtrB = 0;

  cudaMalloc((void**)&devPtrB, matrixSizeB * sizeof(devPtrB[0]));
  T_ELEM_IN *B  = (T_ELEM_IN *)malloc(matrixSizeB * sizeof(B[0]));

  for (int i = 0; i < matrixSizeB; i++) B[i] = val1;
  cublasStat = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB, rowsB);
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);

  size_t matrixSizeC = (size_t)rowsC * colsC;
  T_ELEM_IN *devPtrC = 0;

  cudaMalloc((void**)&devPtrC, matrixSizeC * sizeof(devPtrC[0]));
  T_ELEM_IN *C  = (T_ELEM_IN *)malloc(matrixSizeC * sizeof(C[0]));

  for (int i = 0; i < matrixSizeC; i++) C[i] = val0;
  cublasStat = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC, rowsC);
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);
  float alpha = 1.0f;
  float beta  = 0.f;
  int lda = m;
  int ldb = k;
  int ldc = m;
// Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8,
// and m is a multiple of 4:
  unsigned long long dt = dtime_usec(0);
  cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                          devPtrA, CUDA_R_16F, lda,
                          devPtrB, CUDA_R_16F, ldb,
                          &beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
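// The GEMM launch is asynchronous; synchronize so the timing below reflects
// the actual kernel execution: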
  cudaDeviceSynchronize();
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);
  cudaError_t err = cudaGetLastError();
  assert(err == cudaSuccess);
  dt = dtime_usec(dt);
  cudaMemcpy(C, devPtrC, sizeof(C[0]) * matrixSizeC, cudaMemcpyDeviceToHost);
  std::cout << "C[0]: "  << __half2float(C[0]) << std::endl;
  std::cout << "duration: " << dt << "us" << std::endl;
  std::cout << "flops/s: "  << ((unsigned long long)m)*n*k*2/(float)dt << "MF/s" << std::endl;
}
# nvcc -o t340 t340.cu -lcublas
# ./t340
C[0]: 8192
duration: 93216us
flops/s: 1.17953e+07MF/s
#

(CUDA 12.2, L4 GPU)

More CUDA library sample codes are available here.

Note that FP16 has fairly limited range. Doing something seemingly innocuous like changing this:

half val1 = __float2half(1.0f);

to this:

half val1 = __float2half(4.0f);

will give a possibly unexpected result: each element of C would become 4.0 * 4.0 * 8192 = 131072, which is beyond the FP16 maximum of roughly 65504, so the value overflows when it is stored into the half-precision C matrix.
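
A tiny host-side check (my own sketch, not part of the original answer) makes the overflow visible without running the GEMM:

// overflow_check.cu -- sketch: show that 131072 is not representable in FP16
#include <cuda_fp16.h>
#include <iostream>

int main(){
  float expected = 4.0f * 4.0f * 8192.0f;    // value each element of C would take
  half h = __float2half(expected);           // same conversion the FP16 C store implies
  std::cout << expected << " -> " << __half2float(h) << std::endl;  // prints 131072 -> inf
}

(Compile it with nvcc, the same way as the main example.)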

Also remember that cuBLAS assumes column-major storage. That doesn't matter in this square case where all the values are the same, but it will for general matrices.
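
If your inputs were genuinely row-major, one common workaround (a sketch on my part, reusing the variables from the listing above, not something the original call does) is to compute the transposed product by swapping the operand order and leading dimensions, since a row-major matrix is its transpose when reinterpreted as column-major:

// sketch: row-major C = A*B expressed as column-major C^T = B^T * A^T
// (A is m x k, B is k x n, C is m x n, all stored row-major)
cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                          n, m, k, &alpha,
                          devPtrB, CUDA_R_16F, n,   // B^T is n x k, ld = n
                          devPtrA, CUDA_R_16F, k,   // A^T is k x m, ld = k
                          &beta,
                          devPtrC, CUDA_R_16F, n,   // C^T is n x m, ld = n
                          CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

The result then lands in devPtrC already laid out row-major as C.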

If you want to see something getting closer to full tensor core throughput, you may need to make the matrix dimensions larger, up to perhaps 32768.
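
To put rough numbers on that (my own back-of-the-envelope arithmetic, not a measurement):

// sketch: what changing the constants in the listing above to 32768 implies
const int m = 32768;
const int n = 32768;
const int k = 32768;
// memory: 3 matrices * 32768*32768 FP16 elements * 2 bytes = 6 GiB total,
//         which still fits in the 24 GB of an L4
// work:   2*m*n*k = 2 * 32768^3 ~= 7.0e13 FLOP, i.e. 64x the 8192 case, so
//         at the throughput measured above a single call would take several
//         seconds unless the larger size raises the achieved rate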

Thx, your example worked! I will digest your other comments more slowly; they look interesting. :)