Why does cublasSgemm use `f16` for `float`?

When the math mode is set to CUBLAS_TENSOR_OP_MATH, cuBLAS uses something called “automatic down-conversion”: the FP32 inputs are converted to FP16 so the multiplication can run on Tensor Cores.

I don’t have a detailed description, but I believe the expectation is that your input data must fit within the FP16 range for the calculation to produce the expected results.
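To see why the FP16 range matters, here is a minimal standalone sketch of my own (assuming nvcc and the host-callable conversion helpers in cuda_fp16.h, not anything from the cuBLAS docs): FP16’s largest finite value is 65504, so FLT_MAX overflows to infinity when down-converted.

#include <cuda_fp16.h>
#include <limits>
#include <iostream>

int main(){
  float big = std::numeric_limits<float>::max();  // 3.40282e+38
  // FP16's largest finite value is 65504, so converting FLT_MAX
  // to FP16 overflows; converting back to FP32 yields infinity
  float roundtrip = __half2float(__float2half(big));
  std::cout << "FP32 value:      " << big << std::endl;
  std::cout << "after FP16 trip: " << roundtrip << std::endl;  // prints inf
}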

Example:

$ cat t2208.cu
#include <cublas_v2.h>
#include <limits>
#include <iostream>

int main(){

  cublasHandle_t handle;
  cublasCreate(&handle);
#ifdef USE_TC
  // opt in to Tensor Core use, which permits automatic down-conversion
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#endif
  float *h_X, *d_A = 0, *d_B = 0, *d_C = 0, alpha = 1.0f;
  const int N = 1024;
  const int n2 = N*N;
  h_X = new float[n2]();  // zero-initialized host buffer
  cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0]));
  cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0]));
  cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0]));
  // A = 0.1 * identity: 0.1f on the diagonal, zeros elsewhere
  for (int i = 0; i < n2; i+=N+1)
    h_X[i] = 0.1f;
  cudaMemcpy(d_A, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
  // B: every element set to FLT_MAX, far outside the FP16 range
  for (int i = 0; i < n2; i++) h_X[i] = std::numeric_limits<float>::max();
  cudaMemcpy(d_B, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
  float beta = 0.0f;
  // C = alpha*A*B + beta*C; in FP32, result[0] should be 0.1 * FLT_MAX
  cublasStatus_t s = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
  cudaMemcpy(h_X, d_C, n2*sizeof(h_X[0]), cudaMemcpyDeviceToHost);
  std::cout << std::numeric_limits<float>::max() << std::endl;
  std::cout << "status: " << (int) s << std::endl;
  std::cout << "result[0]: " << h_X[0] << std::endl;
}
$ nvcc -o t2208 t2208.cu -lcublas
$ ./t2208
3.40282e+38
status: 0
result[0]: 3.40282e+37
$ nvcc -o t2208 t2208.cu -lcublas -DUSE_TC
$ ./t2208
3.40282e+38
status: 0
result[0]: nan
$
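The USE_TC result follows directly from the down-conversion: B’s FLT_MAX entries become infinity in FP16, and each row of A is zero except for the 0.1f on the diagonal, so every dot product contains 0 × Inf terms, which are NaN. Without down-conversion, the FP32 computation returns the expected 0.1 × FLT_MAX ≈ 3.40282e+37.

If you want FP32 results regardless of the math-mode setting, one option is to make the compute type explicit. This is just a sketch, assuming cuBLAS 11 or newer; it is a drop-in replacement for the cublasSgemm call in t2208.cu above.

// Explicit FP32 compute type: no automatic down-conversion.
// (CUBLAS_COMPUTE_32F_FAST_16F would explicitly request the FP16 behavior;
// on Ampere and newer, CUBLAS_COMPUTE_32F_FAST_TF32 uses Tensor Cores while
// keeping FP32's dynamic range, so FLT_MAX does not overflow.)
cublasStatus_t s = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                                &alpha,
                                d_A, CUDA_R_32F, N,
                                d_B, CUDA_R_32F, N,
                                &beta,
                                d_C, CUDA_R_32F, N,
                                CUBLAS_COMPUTE_32F,
                                CUBLAS_GEMM_DEFAULT);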

If you’d like to see an improvement in the documentation, please file a bug.