I am learning low-level kernel optimization with the Tensor Core WMMA API (nvcuda::wmma).
I ran into an accuracy issue while using the API. Below is a GEMM example in which matrices A and B are FP16, the result matrix C is FP32 (float), and all three are row-major.
The Tensor Core kernel below is adapted from the official cuda-samples (I only changed the column-major inputs to row-major), yet the results from the Tensor Cores and the CUDA cores differ by a relative error greater than 1e-3.
Is this an issue with my code, or is this expected behavior?

These are the results from the naive CUDA GEMM kernel, the tiled (shared-memory) CUDA kernel, and the Tensor Core WMMA kernel.
The first two results agree with each other as expected, but the Tensor Core result shows a significant discrepancy compared to them.
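For reference, here is a minimal sketch of a double-precision host GEMM that could serve as a ground truth independent of either GPU path. It is not part of the program below; gemm_ref_fp64 is a name I made up, and it assumes a CUDA toolkit in which __half2float is callable from host code.

// Hypothetical FP64 host reference for a row-major (M x K) * (K x N) GEMM.
// It reads the same __half inputs the GPU kernels see, but performs the
// products and the accumulation in double, so rounding comes only from
// the FP16 inputs themselves.
void gemm_ref_fp64(const __half* A, const __half* B, float* C, int M, int K, int N)
{
    for (int r = 0; r < M; ++r) {
        for (int c = 0; c < N; ++c) {
            double acc = 0.0;
            for (int s = 0; s < K; ++s) {
                acc += static_cast<double>(__half2float(A[r * K + s])) *
                       static_cast<double>(__half2float(B[s * N + c]));
            }
            C[r * N + c] = static_cast<float>(acc);
        }
    }
}

The full program: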
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <mma.h>

#include <chrono>
#include <cmath>
#include <iostream>
#include <random>
#include <utility>
#include <vector>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using namespace nvcuda;
// Fill an array with random FP16 values drawn uniformly from [-128, 127).
void fill_random_half_values(__half* arr, size_t n, std::default_random_engine& e)
{
    std::uniform_real_distribution<float> uniform_dist(-128.0f, 127.0f);
    for (size_t i{0}; i < n; ++i) {
        arr[i] = __float2half(uniform_dist(e));
    }
}
// 1. Naive GEMM: one thread computes one element of C.
template<typename T>
__global__ void __launch_bounds__(1024) gemm_naive(const T *__restrict__ dA, const T *__restrict__ dB, float *__restrict__ dC, int M, int K, int N)
{
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    int col = threadIdx.y + blockIdx.y * blockDim.y;
    float tmp = 0;
    if (row < M && col < N)
    {
        for (int s = 0; s < K; s++)
        {
            // The product is computed and rounded in half precision,
            // then converted to float and accumulated in float.
            tmp += __half2float(dA[row * K + s] * dB[s * N + col]);
        }
        dC[row * N + col] = tmp;
    }
}
// 2. Tiled GEMM using shared memory (TILE_SIZE x TILE_SIZE tiles).
template<typename T>
__global__ void __launch_bounds__(1024) gemm_CUDA(float *__restrict__ c, const T *__restrict__ a, const T *__restrict__ b, int M, int N, int K) {
    const int bx = blockIdx.x;
    const int by = blockIdx.y;
    const int TILE_SIZE = 16;
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int col = bx * TILE_SIZE + tx;
    const int row = by * TILE_SIZE + ty;
    __shared__ T SA[TILE_SIZE][TILE_SIZE];
    __shared__ T SB[TILE_SIZE][TILE_SIZE];
    float sum = 0;
    for (int k = 0; k < (K + TILE_SIZE - 1) / TILE_SIZE; ++k) {
        if (row < M && k * TILE_SIZE + tx < K) {
            SA[ty][tx] = a[row * K + k * TILE_SIZE + tx];
        } else {
            SA[ty][tx] = 0;
        }
        if (col < N && k * TILE_SIZE + ty < K) {
            SB[ty][tx] = b[col + (k * TILE_SIZE + ty) * N];
        } else {
            SB[ty][tx] = 0;
        }
        __syncthreads();
        for (int n_k = 0; n_k < TILE_SIZE; ++n_k) {
            // Same as the naive kernel: half-precision product, float accumulation.
            sum += __half2float(SA[ty][n_k] * SB[n_k][tx]);
        }
        __syncthreads();
    }
    if (row < M && col < N) {
        c[row * N + col] = sum;
    }
}
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
#define WARP_SIZE 32
__host__ __device__ int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
// 3. Naive WMMA kernel: one warp computes one 16x16 tile of C.
__global__ void wmmaNaiveKernel(const half *__restrict__ A, const half *__restrict__ B, float *__restrict__ C, size_t M, size_t N, size_t K) {
    const size_t K_tiles = div_ceil(K, WMMA_K);
    const size_t warp_row = blockIdx.y * WMMA_M;
    const size_t warp_col = blockIdx.x * WMMA_N;
    if (warp_row >= M || warp_col >= N) {
        return;
    }
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> C_frag;
    wmma::fill_fragment(C_frag, 0.0f);
#pragma unroll
    for (size_t i = 0; i < K_tiles; ++i) {
        wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> A_frag;
        wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> B_frag;
        // Row-major loads: the leading dimension is the row stride (K for A, N for B).
        wmma::load_matrix_sync(A_frag, A + warp_row * K + i * WMMA_K, K);
        wmma::load_matrix_sync(B_frag, B + warp_col + i * WMMA_K * N, N);
        wmma::mma_sync(C_frag, A_frag, B_frag, C_frag);
    }
    wmma::store_matrix_sync(C + warp_row * N + warp_col, C_frag, N, wmma::mem_row_major);
}
int main() {
    int M = 128;
    int K = 128;
    int N = 128;
    std::cout << "Matrix Sizes" << std::endl;
    std::cout << "M: " << M << std::endl;
    std::cout << "N: " << N << std::endl;
    std::cout << "K: " << K << std::endl;

    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
    std::default_random_engine random_engine(seed);
    thrust::host_vector<__half> h_a_vec(M * K);
    thrust::host_vector<__half> h_b_vec(K * N);
    fill_random_half_values(h_a_vec.data(), h_a_vec.size(), random_engine);
    fill_random_half_values(h_b_vec.data(), h_b_vec.size(), random_engine);
    thrust::device_vector<__half> d_a_vec = h_a_vec;
    thrust::device_vector<__half> d_b_vec = h_b_vec;
    thrust::device_vector<float> d_c_vec(M * N);

    // NOTE: gemm_naive maps blockIdx.x to rows while gemm_CUDA maps it to columns;
    // sharing this grid between them is only correct here because M == N.
    dim3 threadNum(16, 16);
    dim3 blockNum((M + threadNum.x - 1) / threadNum.x, (N + threadNum.y - 1) / threadNum.y);

    cudaEvent_t cuda_start, cuda_end;
    cudaEventCreate(&cuda_start);
    cudaEventCreate(&cuda_end);
    const int numIterations = 1;
    float naive_totalTime = 0.0f;
    // 1. CUDA NAIVE
    for (int i = 0; i < numIterations; ++i) {
        cudaEventRecord(cuda_start, 0);
        gemm_naive<__half><<<blockNum, threadNum>>>(d_a_vec.data().get(), d_b_vec.data().get(), d_c_vec.data().get(), M, K, N);
        cudaEventRecord(cuda_end, 0);
        cudaEventSynchronize(cuda_end);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, cuda_start, cuda_end);
        naive_totalTime += ms;
    }
    thrust::host_vector<float> h_naive_c_vec = d_c_vec;
    // 2. CUDA
    float v2_totalTime = 0.0f;
    for (int i = 0; i < numIterations; ++i) {
        cudaEventRecord(cuda_start, 0);
        gemm_CUDA<__half><<<blockNum, threadNum>>>(d_c_vec.data().get(), d_a_vec.data().get(), d_b_vec.data().get(), M, N, K);
        cudaEventRecord(cuda_end, 0);
        cudaEventSynchronize(cuda_end);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, cuda_start, cuda_end);
        v2_totalTime += ms;
    }
    thrust::host_vector<float> h_c_vec = d_c_vec;
    // 3. Tensor Core
    dim3 block(WARP_SIZE);
    dim3 grid(div_ceil(N, WMMA_N), div_ceil(M, WMMA_M));
    float v3_totalTime = 0.0f;
    for (int i = 0; i < numIterations; ++i) {
        cudaEventRecord(cuda_start, 0);
        wmmaNaiveKernel<<<grid, block>>>(d_a_vec.data().get(), d_b_vec.data().get(), d_c_vec.data().get(), M, N, K);
        cudaEventRecord(cuda_end, 0);
        cudaEventSynchronize(cuda_end);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, cuda_start, cuda_end);
        v3_totalTime += ms;
    }
    thrust::host_vector<float> h_c_vec_v3 = d_c_vec;
    // Compare the three results element by element.
    bool flg = false;
    float cuda_error = 0.0f;
    float tensor_error = 0.0f;
    for (int r = 0; r < M; ++r) {
        for (int c = 0; c < N; ++c) {
            float naive_res = h_naive_c_vec[r * N + c];
            float cuda_res = h_c_vec[r * N + c];
            float tensor_res = h_c_vec_v3[r * N + c];
            // Relative error against the naive kernel's result.
            float err_cuda = std::abs(naive_res - cuda_res) / std::abs(naive_res);
            float err_tensor = std::abs(naive_res - tensor_res) / std::abs(naive_res);
            cuda_error += err_cuda;
            tensor_error += err_tensor;
            if (err_cuda > 1e-3 || err_tensor > 1e-3) {
                printf("(%f, %f, %f)\n", naive_res, cuda_res, tensor_res);
                printf("Failed: cuda, tensor: %f, %f\n", err_cuda, err_tensor);
                flg = true;
            }
            if (flg) break;
        }
        if (flg) break;
    }
    if (!flg) {
        std::cout << "NAIVE execution time: " << naive_totalTime / numIterations << " ms" << std::endl;
        std::cout << "V2 execution time: " << v2_totalTime / numIterations << " ms, error: " << cuda_error / (M * N) << std::endl;
        std::cout << "V3 execution time: " << v3_totalTime / numIterations << " ms, error: " << tensor_error / (M * N) << std::endl;
    }
    return 0;
}