From the official guide, I understand that tensor cores can only read data from global and shared memory (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-load-instruction-wmma-load ), but my friend sent me this:
// Warp-level m16n8k16 f16 MMA: loads the A/B/C fragments from global memory
// into PTX virtual registers, runs mma.sync on those registers, and stores
// the D fragment back to global memory.
// In:  %0 = &A[A_index], %1 = &B[B_col_major_index], %2 = &C[C_index]
// Out: %3 = &D[D_index]
// NOTE(review): the byte offsets (+256, +16, +272, +128) presumably encode
// the matrices' leading dimensions in bytes — confirm against lda/ldb/ldc.
asm volatile (
    // Brace-scope the .reg declarations so this asm can be instantiated more
    // than once (e.g. after loop unrolling) without duplicate-definition errors.
    "{\n"
    // Literal '%' in PTX register names must be escaped as '%%' inside an
    // inline-asm template; bare %R would collide with operand substitution.
    "  .reg .f16x2 %%Ra<4>, %%Rb<2>, %%Rc<2>;\n"
    // A fragment: 4 x b32 (= 8 f16) per thread.
    "  ld.global.b32 %%Ra0, [%0];\n"
    "  ld.global.b32 %%Ra1, [%0 + 256];\n"
    "  ld.global.b32 %%Ra2, [%0 + 16];\n"
    "  ld.global.b32 %%Ra3, [%0 + 272];\n"
    // B fragment: 2 x b32 per thread.
    "  ld.global.b32 %%Rb0, [%1];\n"
    "  ld.global.b32 %%Rb1, [%1 + 16];\n"
    // C (accumulator) fragment: 2 x b32 per thread.
    "  ld.global.b32 %%Rc0, [%2];\n"
    "  ld.global.b32 %%Rc1, [%2 + 128];\n"
    // mma.sync consumes register operands only; PTX requires each matrix
    // operand (d, a, b, c) to be a brace-enclosed vector expression.
    "  mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
    "{%%Rc0, %%Rc1}, {%%Ra0, %%Ra1, %%Ra2, %%Ra3}, {%%Rb0, %%Rb1}, {%%Rc0, %%Rc1};\n"
    // D fragment written back to global memory.
    "  st.global.b32 [%3], %%Rc0;\n"
    "  st.global.b32 [%3 + 128], %%Rc1;\n"
    "}\n"
    : // no register outputs
    : "l"(A + A_index), "l"(B + B_col_major_index),
      "l"(C + C_index), "l"(D + D_index)
    : "memory" // asm reads and writes global memory behind the compiler's back
);
So it seems that feeding register data directly to the tensor cores is OK?? Really? Where can I find documentation for this?