I use fp16 mma m16n8k16 for calculation, the kernel is like below:
map contains some integer index and -1, when map[i] = -1, it means that index don’t need to be calculate (just for padding, because mma need to align to 8)
__global__ f(int* map, ......) {
__shared__ result[...];
half tmp_result[4];
...
unsigned *C = reinterpret_cast<unsigned *> (&tmp_result);
unsigned *D = reinterpret_cast<unsigned *> (&tmp_result);
// load result
tmp_result[0] = result[max(map[0], 0)];
tmp_result[1] = result[max(map[1], 0)];
tmp_result[2] = result[max(map[2], 0)];
tmp_result[3] = result[max(map[3], 0)];
__asm__ __volatile__ (
"mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
: "=r"(D[0]), "=r"(D[1])
:
"r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
"r"(B[0]), "r"(B[1]),
"r"(C[0]), "r"(C[1])
);
if (map[0] != -1) {result[map[0]] = tmp_result[0];}
if (map[1] != -1) {result[map[1]] = tmp_result[1];}
if (map[2] != -1) {result[map[2]] = tmp_result[2];}
if (map[3] != -1) {result[map[3]] = tmp_result[3];}
}
will there be any more efficient way or elegent way to realize the code? I think use 0 as the default is not a good way to avoid bank conflict.