I tried to write a simple matrix-multiplication kernel for a V100 that uses the Tensor Cores, but it gives the wrong answer.
I used the mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 PTX instruction; here is my code:
#include <cuda_fp16.h>
#include <iostream>
#include <mma.h>

using namespace nvcuda;

__global__ void mma_test(half* output)
{
    int lane = threadIdx.x % 32;

    // Accumulator fragment: 4 x 32-bit registers = 8 half values per thread.
    uint out[4] = { 0 };

    for (int i = 0; i < 10000; i++) {
        // A and B fragments: 2 x 32-bit registers = 4 half values each.
        uint MultiA[2] = { 0 };
        uint MultiB[2] = { 0 };
        half* test1 = reinterpret_cast<half*>(MultiA);
        half* test2 = reinterpret_cast<half*>(MultiB);

        // Every element of A is 0.8 and every element of B is 0.7, so with
        // k = 4 each mma should add 0.8 * 0.7 * 4 = 2.24 to each accumulator.
        test1[0] = 0.8f;
        test1[1] = 0.8f;
        test1[2] = 0.8f;
        test1[3] = 0.8f;
        test2[0] = 0.7f;
        test2[1] = 0.7f;
        test2[2] = 0.7f;
        test2[3] = 0.7f;

        // D = A * B + C with all operands in fp16, accumulating into out[].
        asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 "
                     "{ %0, %1, %2, %3 },"
                     "{ %4, %5 },"
                     "{ %6, %7 },"
                     "{ %8, %9, %10, %11 };\n"
                     : "=r"(out[0]), "=r"(out[1]), "=r"(out[2]), "=r"(out[3])
                     : "r"(MultiA[0]), "r"(MultiA[1]),
                       "r"(MultiB[0]), "r"(MultiB[1]),
                       "r"(out[0]), "r"(out[1]), "r"(out[2]), "r"(out[3]));
    }

    // Store this thread's 8 accumulator halves as one 16-byte uint4.
    int store_row = lane % 4 + lane / 16 * 4;
    int store_col = (lane % 16) / 4;
    reinterpret_cast<uint4*>(output)[store_row * 4 + store_col] =
        reinterpret_cast<uint4*>(out)[0];
}

int main()
{
    half* output = (half*)malloc(sizeof(half) * 32 * 8);
    half* output_d = NULL;
    cudaMalloc(&output_d, sizeof(half) * 32 * 8);

    mma_test<<<1, 32>>>(output_d);
    cudaMemcpy(output, output_d, sizeof(half) * 32 * 8, cudaMemcpyDeviceToHost);

    for (int i = 0; i < 32 * 8; i++) {
        std::cout << (float)output[i] << " ";
    }
    std::cout << std::endl;

    cudaFree(output_d);
    free(output);
}
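For reference, I compile and run it with something like the following (assuming plain nvcc; sm_70 is the V100 architecture, and the m8n8k4 mma needs at least that target):

    nvcc -arch=sm_70 -o mma_test mma_test.cu
    ./mma_test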
Since every element of A is 0.8 and every element of B is 0.7, each mma should add 0.8 * 0.7 * 4 = 2.24 to every accumulator element, so after 10000 iterations I expect every output value to be 22400. I found that when the loop count i is small, the result is close to this, but as i grows the result drifts further and further from the correct answer.
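To see how closely an fp16 running sum of those 2.24 steps can even track the fp32 value, here is a minimal host-only sketch (my own addition, not part of the kernel above; it assumes that rounding the sum back to half after every addition, via __float2half/__half2float from cuda_fp16.h, is a fair model of a half-precision accumulator):

    #include <cuda_fp16.h>
    #include <iostream>

    int main()
    {
        const int iters = 10000;
        // One mma adds a length-4 dot product: 0.8 * 0.7 * 4 = 2.24.
        const float step = 0.8f * 0.7f * 4.0f;

        // Exact fp32 reference value: 22400.
        const float ref = step * iters;

        // Emulated fp16 accumulator: round the running sum back to half
        // after every addition (assumption: this approximates how a
        // half-precision accumulator behaves step by step).
        __half acc = __float2half(0.0f);
        for (int i = 0; i < iters; i++)
            acc = __float2half(__half2float(acc) + step);

        std::cout << "fp32 reference:    " << ref << std::endl;
        std::cout << "emulated fp16 sum: " << __half2float(acc) << std::endl;
    }

Comparing the two printed values shows how far a pure fp16 running sum of 2.24-sized steps can drift from 22400, independent of the tensor core itself.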