When using mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16, I noticed that every register's type is reinterpreted as unsigned int. So I wonder if I can just load half-type data by reinterpreting it as unsigned int.
Here is the original code clip:
// Baseline version: shared memory holds individual half elements; pairs of
// halves are explicitly packed into __half2 registers, then reinterpreted as
// the 32-bit operand registers the tensor-core MMA instruction expects.
// NOTE(review): the original was missing the `void` return type required for
// a __global__ kernel, and declared `__shared__ half* src[...]` — an array of
// *pointers* — while __halves2half2 below takes half *values*; the element
// type must be `half`.
__global__ void f(half* ptr) {
    // load data from ptr to src
    __shared__ half src[...];
    __half2 dst[4];
    for (int i = 0; i < 4; i++) {
        // Pack two halves into one 32-bit __half2. This pairing step decides
        // which matrix element lands in the low vs. high 16 bits of each MMA
        // operand register — i.e. it realizes the per-thread fragment layout.
        dst[i] = __halves2half2(src[...], src[...]);
    }
    // mma.sync operands are bound with the "r" (32-bit register) constraint,
    // so the packed half pairs are viewed as unsigned words.
    unsigned *A = reinterpret_cast<unsigned *>(&dst);
    // D, B, and C are assumed to be declared in the elided code above.
    __asm__ __volatile__ (
        "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
        "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
        : "=r"(D[0]), "=r"(D[1])
        :
        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
        "r"(B[0]), "r"(B[1]),
        "r"(C[0]), "r"(C[1])
    );
}
However, to achieve better loading efficiency, I want each warp to load 128B instead of 64B when moving data from ptr (in gmem) to src. So I want to reinterpret both as unsigned, like in the following code:
// Vectorized-load version: data is moved as 32-bit words so each warp can
// issue 128B transactions instead of 64B.
// NOTE(review): the original was missing the `void` return type required for
// a __global__ kernel.
__global__ void f(half* ptr) {
    // load data from ptr to src
    unsigned *cp_ptr = reinterpret_cast<unsigned *>(ptr);
    __shared__ unsigned src[...];
    unsigned A[4];
    for (int i = 0; i < 4; i++) {
        // Each unsigned word carries two consecutive halves exactly as they
        // were laid out in global memory.
        A[i] = src[...];
    }
    // NOTE(review): unlike the first clip, there is no __halves2half2 step
    // here to re-pair elements, so the per-thread fragment layout required by
    // mma.m16n8k16 must already hold in memory. If the gmem layout does not
    // match the fragment layout, wrong elements end up in each 16-bit lane —
    // a likely source of the reported accuracy problem; verify the indexing
    // against the PTX fragment-layout diagrams. D, B, and C are assumed to be
    // declared in the elided code above.
    __asm__ __volatile__ (
        "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
        "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
        : "=r"(D[0]), "=r"(D[1])
        :
        "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
        "r"(B[0]), "r"(B[1]),
        "r"(C[0]), "r"(C[1])
    );
}
I believe there are no alignment problems. But the second clip results in serious calculation-accuracy problems while the first clip doesn't. Why? How can I fully use the cache line and keep accuracy at the same time?