It uses something called “automatic down-conversion”.
I don’t have a detailed description, but the expectation appears to be that your input data (and intermediate products) must be representable in the FP16 type — whose maximum finite value is about 65504 — in order for the calculation to produce the expected results.
Example:
$ cat t2208.cu
#include <cublas_v2.h>
#include <limits>
#include <iostream>
int main(){
    // Demonstrates cuBLAS "automatic down-conversion": when tensor-op math
    // is enabled, SGEMM may convert FP32 inputs to FP16 internally. FP32
    // values outside the FP16 range (max finite value ~65504) overflow to
    // inf on conversion, so the result becomes inf/nan instead of the
    // mathematically expected value.
    cublasHandle_t handle;
    cublasCreate(&handle);
#ifdef USE_TC
    // Opt in to tensor-core execution for FP32 GEMM.
    // NOTE(review): CUBLAS_TENSOR_OP_MATH is deprecated in cuBLAS 11+ in
    // favor of compute-type selection — confirm against the target toolkit.
    cublasSetMathMode( handle, CUBLAS_TENSOR_OP_MATH );
#endif
    float *h_X, *d_A = 0, *d_B = 0, *d_C = 0, alpha = 1.0f;
    const int N = 1024;
    const int n2 = N*N;
    h_X = new float[n2]();  // () value-initializes the buffer to zero
    cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0]));
    cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0]));
    cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0]));
    // A = 0.1 * I: stride N+1 walks the diagonal of the N x N matrix.
    for (int i = 0; i < n2; i+=N+1)
        h_X[i] = 0.1f;
    cudaMemcpy(d_A, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
    // B: every element is FLT_MAX — representable in FP32, but far outside
    // the FP16 range, so it overflows under automatic down-conversion.
    for (int i = 0; i < n2; i++) h_X[i] = std::numeric_limits<float>::max();
    cudaMemcpy(d_B, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
    float beta = 0.0f;
    // C = alpha*A*B + beta*C; expected C[i][j] = 0.1f * FLT_MAX ~= 3.4e37.
    cublasStatus_t s = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
    // Blocking memcpy also synchronizes with the preceding GEMM.
    cudaMemcpy(h_X, d_C, n2*sizeof(h_X[0]), cudaMemcpyDeviceToHost);
    std::cout << std::numeric_limits<float>::max() << std::endl;
    std::cout << "status: " << (int) s << std::endl;
    std::cout << "result[0]: " << h_X[0] << std::endl;
    // Release resources (fix: the original example leaked the cuBLAS handle,
    // all three device allocations, and the host buffer).
    cublasDestroy(handle);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete [] h_X;
    return 0;
}
$ nvcc -o t2208 t2208.cu -lcublas
$ ./t2208
3.40282e+38
status: 0
result[0]: 3.40282e+37
$ nvcc -o t2208 t2208.cu -lcublas -DUSE_TC
$ ./t2208
3.40282e+38
status: 0
result[0]: nan
$
If you’d like to see an improvement in the documentation, please file a bug.