I confused about LD instruction.
I have a cuda program about wmma, I just launch a block and the block size is 32. my cuda code is very simple, like this:
__global__ void ldmatrix_test(float *a, float *b, float *C)
{
// Declare the fragments
wmma::fragment<wmma::matrix_a, 16, 16, 8, wmma::precision::tf32, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, 16, 16, 8, wmma::precision::tf32, wmma::col_major> b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 8, float> acc_frag;
wmma::fill_fragment(acc_frag, 0.0f);
wmma::load_matrix_sync(a_frag, a, 8);
wmma::load_matrix_sync(b_frag, b, 8);
// Perform the matrix multiplication
wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
wmma::store_matrix_sync(C, acc_frag, 16, wmma::mem_row_major);
}
A shap is [16, 8], and B shape is [8, 16], A is row-major, B is col-major.
I’m very confused about the SASS, as following:
code for sm_86
Function : _Z13ldmatrix_testPfS_S_
.headerflags @"EF_CUDA_SM86 EF_CUDA_PTX_SM(EF_CUDA_SM86)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R2, SR_LANEID ; /* 0x0000000000027919 */
/* 0x000e220000000000 */
/*0020*/ MOV R25, 0x4 ; /* 0x0000000400197802 */
/* 0x000fe20000000f00 */
/*0030*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0040*/ SHF.R.U32.HI R5, RZ, 0x2, R2 ; /* 0x00000002ff057819 */
/* 0x001fe40000011602 */
/*0050*/ LOP3.LUT R2, R2, 0x3, RZ, 0xc0, !PT ; /* 0x0000000302027812 */
/* 0x000fc800078ec0ff */
/*0060*/ LEA R0, R5, R2, 0x3 ; /* 0x0000000205007211 */
/* 0x000fc800078e18ff */
/*0070*/ IADD3 R14, R0.reuse, 0x40, RZ ; /* 0x00000040000e7810 */
/* 0x040fe20007ffe0ff */
/*0080*/ IMAD.WIDE R12, R0, R25, c[0x0][0x168] ; /* 0x00005a00000c7625 */
/* 0x000fc800078e0219 */
/*0090*/ IMAD.WIDE R10, R14, R25.reuse, c[0x0][0x160] ; /* 0x000058000e0a7625 */
/* 0x080fe400078e0219 */
/*00a0*/ LD.E R12, [R12.64] ; /* 0x000000040c0c7980 */
/* 0x000ea4000c101900 */
/*00b0*/ IMAD.WIDE R8, R0, R25.reuse, c[0x0][0x160] ; /* 0x0000580000087625 */
/* 0x080fe400078e0219 */
/*00c0*/ LD.E R7, [R10.64] ; /* 0x000000040a077980 */
/* 0x000ea4000c101900 */
/*00d0*/ IMAD.WIDE R14, R14, R25, c[0x0][0x168] ; /* 0x00005a000e0e7625 */
/* 0x000fe400078e0219 */
/*00e0*/ LD.E R6, [R8.64] ; /* 0x0000000408067980 */
/* 0x000ea2000c101900 */
/*00f0*/ IADD3 R18, R0, 0x4, RZ ; /* 0x0000000400127810 */
/* 0x000fc60007ffe0ff */
/*0100*/ LD.E R14, [R14.64] ; /* 0x000000040e0e7980 */
/* 0x000ee2000c101900 */
/*0110*/ IADD3 R0, R0, 0x44, RZ ; /* 0x0000004400007810 */
/* 0x000fe20007ffe0ff */
/*0120*/ IMAD.WIDE R22, R18, R25, c[0x0][0x168] ; /* 0x00005a0012167625 */
/* 0x000fc800078e0219 */
/*0130*/ IMAD.WIDE R18, R18, R25.reuse, c[0x0][0x160] ; /* 0x0000580012127625 */
/* 0x080fe400078e0219 */
/*0140*/ LD.E R22, [R22.64] ; /* 0x0000000416167980 */
/* 0x000f24000c101900 */
/*0150*/ IMAD.WIDE R20, R0.reuse, R25.reuse, c[0x0][0x160] ; /* 0x0000580000147625 */
/* 0x0c0fe400078e0219 */
/*0160*/ LD.E R16, [R18.64] ; /* 0x0000000412107980 */
/* 0x000f24000c101900 */
/*0170*/ IMAD.WIDE R24, R0, R25, c[0x0][0x168] ; /* 0x00005a0000187625 */
/* 0x000fe400078e0219 */
/*0180*/ LD.E R17, [R20.64] ; /* 0x0000000414117980 */
/* 0x000f28000c101900 */
/*0190*/ LD.E R24, [R24.64] ; /* 0x0000000418187980 */
/* 0x000f22000c101900 */
/*01a0*/ MOV R3, RZ ; /* 0x000000ff00037202 */
/* 0x000fca0000000f00 */
/*01b0*/ IMAD.WIDE.U32 R2, R5, 0x8, R2 ; /* 0x0000000805027825 */
/* 0x000fca00078e0002 */
/*01c0*/ LEA R4, P0, R2, c[0x0][0x170], 0x3 ; /* 0x00005c0002047a11 */
/* 0x000fc800078018ff */
/*01d0*/ LEA.HI.X R5, R2, c[0x0][0x174], R3, 0x3, P0 ; /* 0x00005d0002057a11 */
/* 0x000fe200000f1c03 */
/*01e0*/ HMMA.1684.F32.TF32 R8, R6.reuse, R12, RZ ; /* 0x0000000c0608723c */
/* 0x044f6800000850ff */
/*01f0*/ HMMA.1684.F32.TF32 R12, R6, R14, RZ ; /* 0x0000000e060c723c */
/* 0x008f5800000850ff */
/*0200*/ HMMA.1684.F32.TF32 R8, R16.reuse, R22, R8 ; /* 0x000000161008723c */
/* 0x070f680000085008 */
/*0210*/ HMMA.1684.F32.TF32 R12, R16, R24, R12 ; /* 0x00000018100c723c */
/* 0x000f76000008500c */
/*0220*/ ST.E.64 [R4.64], R8 ; /* 0x0000000804007985 */
/* 0x020fe8000c101b04 */
/*0230*/ ST.E.64 [R4.64+0x200], R10 ; /* 0x0002000a04007985 */
/* 0x000fe8000c101b04 */
/*0240*/ ST.E.64 [R4.64+0x20], R12 ; /* 0x0000200c04007985 */
/* 0x000fe8000c101b04 */
/*0250*/ ST.E.64 [R4.64+0x220], R14 ; /* 0x0002200e04007985 */
/* 0x000fe2000c101b04 */
/*0260*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0270*/ BRA 0x270; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0280*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0290*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
I found this instruction:
/*00c0*/ LD.E R7, [R10.64] ; /* 0x000000040a077980 */
/* 0x000ea4000c101900 */
/*00d0*/ IMAD.WIDE R14, R14, R25, c[0x0][0x168] ; /* 0x00005a000e0e7625 */
/* 0x000fe400078e0219 */
/*00e0*/ LD.E R6, [R8.64] ; /* 0x0000000408067980 */
/* 0x000ea2000c101900 */
“LD.E R7, [R10.64]” is before “LD.E R6, [R8.64]”, I think “LD.E R7, [R10.64]” means:
load an 64bits word from R10, and save it into R7 and R8, because LD.E is extend 32bits into 64bits, and R10.64 means 64bits address。
but, “LD.E R6, [R8.64]” will load 64bits word into R6 and R7 again, so, it will overwrite R7,why?
there is a similar problem behind “LD.E R6, [R8.64]”:
/*0160*/ LD.E R16, [R18.64] ; /* 0x0000000412107980 */
/* 0x000f24000c101900 */
/*0170*/ IMAD.WIDE R24, R0, R25, c[0x0][0x168] ; /* 0x00005a0000187625 */
/* 0x000fe400078e0219 */
/*0180*/ LD.E R17, [R20.64] ; /* 0x0000000414117980 */
/* 0x000f28000c101900 */
why nvcc compile cuda-c like this? I think it is better as following:
......
......
LD R16, [R18]
......
......
LD R17, [R20]
......
......
I cannot sure the reason, is there anyone would like to teach me?
another problem: is there any reference documents about “HMMA.1684.F32.TF32”?