It seems the computation is lowered to IMAD,IADD
instruction, following is my SASS code:
Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_80
Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_80
Function : _Z13gemm_mma_1688P6__halfS0_Pfiiiff
.headerflags @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)"
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */
/* 0x000fc400078e00ff */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
/*0020*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x178] ; /* 0x00005e00ff027624 */
/* 0x000fc600078e00ff */
/*0030*/ S2R R3, SR_TID.Y ; /* 0x0000000000037919 */
/* 0x000e240000002200 */
/*0040*/ ISETP.GE.AND P0, PT, R2, 0x1, PT ; /* 0x000000010200780c */
/* 0x000fe20003f06270 */
/*0050*/ IMAD R0, R0, c[0x0][0x0], R3 ; /* 0x0000000000007a24 */
/* 0x001fd800078e0203 */
/*0060*/ @!P0 EXIT ; /* 0x000000000000894d */
/* 0x000fea0003800000 */
/*0070*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x17c] ; /* 0x00005f00ff027624 */
/* 0x000fe200078e00ff */
/*0080*/ ISETP.GT.U32.AND P2, PT, R0.reuse, 0x7f, PT ; /* 0x0000007f0000780c */
/* 0x040fe20003f44070 */
/*0090*/ IMAD.SHL.U32 R8, R0.reuse, 0x2, RZ ; /* 0x0000000200087824 */
/* 0x040fe200078e00ff */
/*00a0*/ SHF.R.U32.HI R7, RZ, 0x2, R0 ; /* 0x00000002ff077819 */
/* 0x000fe20000011600 */
/*00b0*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*00c0*/ ISETP.GE.U32.AND P0, PT, R0, 0x100, PT ; /* 0x000001000000780c */
/* 0x000fe20003f06070 */
/*00d0*/ IMAD.MOV.U32 R0, RZ, RZ, RZ ; /* 0x000000ffff007224 */
/* 0x000fe200078e00ff */
/*00e0*/ ISETP.GE.AND P1, PT, R2, 0x1, PT ; /* 0x000000010200780c */
/* 0x000fe40003f26270 */
/*00f0*/ SGXT.U32 R7, R7, 0x3 ; /* 0x000000030707781a */
/* 0x000fe40000000000 */
/*0100*/ LOP3.LUT R8, R8, 0x6, RZ, 0xc0, !PT ; /* 0x0000000608087812 */
/* 0x000fc400078ec0ff */
/*0110*/ SEL R6, RZ, 0xffffffff, !P2 ; /* 0xffffffffff067807 */
/* 0x000fce0005000000 */
/*0120*/ @!P1 BRA 0x4b0 ; /* 0x0000038000009947 */
/* 0x000fea0003800000 */
/*0130*/ MOV R9, RZ ; /* 0x000000ff00097202 */
/* 0x000fe20000000f00 */
/*0140*/ IMAD.IADD R17, R7, 0x1, R0 ; /* 0x0000000107117824 */
/* 0x000fc400078e0200 */
/*0150*/ S2R R2, SR_TID.Y ; /* 0x0000000000027919 */
/* 0x000e220000002200 */
/*0160*/ IMAD.MOV.U32 R18, RZ, RZ, RZ ; /* 0x000000ffff127224 */
/* 0x000fc600078e00ff */
/*0170*/ S2R R3, SR_TID.X ; /* 0x0000000000037919 */
/* 0x000e240000002100 */
/*0180*/ IMAD R2, R3, c[0x0][0x0], R2 ; /* 0x0000000003027a24 */
/* 0x001fca00078e0202 */
/*0190*/ SHF.R.U32.HI R2, RZ, 0x2, R2 ; /* 0x00000002ff027819 */
/* 0x000fc80000011602 */
/*01a0*/ LOP3.LUT R2, R2, 0x18, RZ, 0xc0, !PT ; /* 0x0000001802027812 */
/* 0x000fc800078ec0ff */
/*01b0*/ IADD3 R16, R2, R9, R8 ; /* 0x0000000902107210 */
/* 0x000fc40007ffe008 */
/*01c0*/ IADD3 R18, R18, 0x40, RZ ; /* 0x0000004012127810 */
/* 0x000fe20007ffe0ff */
/*01d0*/ IMAD.MOV.U32 R10, RZ, RZ, RZ ; /* 0x000000ffff0a7224 */
/* 0x000fc600078e00ff */
/*01e0*/ ISETP.GE.U32.AND P2, PT, R18, 0x1400, PT ; /* 0x000014001200780c */
/* 0x000fc40003f46070 */
/*01f0*/ IMAD.IADD R3, R17, 0x1, R10 ; /* 0x0000000111037824 */
/* 0x000fe200078e020a */
/*0200*/ IADD3 R12, R16, 0x400, RZ ; /* 0x00000400100c7810 */
/* 0x000fe20007ffe0ff */
/*0210*/ IMAD.MOV.U32 R13, RZ, RZ, RZ ; /* 0x000000ffff0d7224 */
/* 0x000fe400078e00ff */
/*0220*/ IMAD.MOV.U32 R15, RZ, RZ, 0x4 ; /* 0x00000004ff0f7424 */
/* 0x000fe200078e00ff */
/*0230*/ IADD3 R5, R3.reuse, 0x8, RZ ; /* 0x0000000803057810 */
/* 0x040fe20007ffe0ff */
/*0240*/ IMAD.MOV.U32 R14, RZ, RZ, c[0x0][0x4] ; /* 0x00000100ff0e7624 */
/* 0x000fe200078e00ff */
/*0250*/ ISETP.NE.AND P3, PT, R3, R6.reuse, PT ; /* 0x000000060300720c */
/* 0x080fe40003f65270 */
/*0260*/ ISETP.NE.AND P4, PT, R5, R6, PT ; /* 0x000000060500720c */
/* 0x000fe40003f85270 */
/*0270*/ MOV R11, R12 ; /* 0x0000000c000b7202 */
/* 0x000fd20000000f00 */
/*0280*/ @!P3 IADD3 R11, R16, RZ, RZ ; /* 0x000000ff100bb210 */
/* 0x000fe40007ffe0ff */
/*0290*/ @!P4 IMAD.MOV R12, RZ, RZ, R16 ; /* 0x000000ffff0cc224 */
/* 0x000fc600078e0210 */
/*02a0*/ IMAD.IADD R2, R11, 0x1, R13.reuse ; /* 0x000000010b027824 */
/* 0x101fe200078e020d */
/*02b0*/ @P0 WARPSYNC 0xffffffff ; /* 0xffffffff00000948 */
/* 0x000fe20003800000 */
/*02c0*/ IMAD.IADD R4, R12, 0x1, R13 ; /* 0x000000010c047824 */
/* 0x000fe200078e020d */
/*02d0*/ LEA R13, R14, R13, 0x1 ; /* 0x0000000d0e0d7211 */
/* 0x000fe200078e08ff */
/*02e0*/ IMAD.WIDE R2, R2, R15, c[0x0][0x170] ; /* 0x00005c0002027625 */
/* 0x000fc600078e020f */
/*02f0*/ ISETP.GE.AND P3, PT, R13, 0x40, PT ; /* 0x000000400d00780c */
/* 0x000fe20003f66270 */
/*0300*/ IMAD.WIDE R4, R4, R15, c[0x0][0x170] ; /* 0x00005c0004047625 */
/* 0x000fe200078e020f */
/*0310*/ @P0 STG.E [R2.64], RZ ; /* 0x000000ff02000986 */
/* 0x0001e8000c101904 */
/*0320*/ @P0 STG.E [R2.64+0x4], RZ ; /* 0x000004ff02000986 */
/* 0x0001e8000c101904 */
/*0330*/ @P0 STG.E [R4.64], RZ ; /* 0x000000ff04000986 */
/* 0x0001e8000c101904 */
/*0340*/ @P0 STG.E [R4.64+0x4], RZ ; /* 0x000004ff04000986 */
/* 0x0001e8000c101904 */
/*0350*/ @P0 STG.E [R2.64], RZ ; /* 0x000000ff02000986 */
/* 0x0001e8000c101904 */
/*0360*/ @P0 STG.E [R2.64+0x4], RZ ; /* 0x000004ff02000986 */
/* 0x0001e8000c101904 */
/*0370*/ @P0 STG.E [R4.64], RZ ; /* 0x000000ff04000986 */
/* 0x0001e8000c101904 */
/*0380*/ @P0 STG.E [R4.64+0x4], RZ ; /* 0x000004ff04000986 */
/* 0x0001e8000c101904 */
/*0390*/ @P0 STG.E [R2.64], RZ ; /* 0x000000ff02000986 */
/* 0x0001e8000c101904 */
/*03a0*/ @P0 STG.E [R2.64+0x4], RZ ; /* 0x000004ff02000986 */
/* 0x0001e8000c101904 */
/*03b0*/ @P0 STG.E [R4.64], RZ ; /* 0x000000ff04000986 */
/* 0x0001e8000c101904 */
/*03c0*/ @P0 STG.E [R4.64+0x4], RZ ; /* 0x000004ff04000986 */
/* 0x0001e8000c101904 */
/*03d0*/ @P0 STG.E [R2.64], RZ ; /* 0x000000ff02000986 */
/* 0x0001e8000c101904 */
/*03e0*/ @P0 STG.E [R2.64+0x4], RZ ; /* 0x000004ff02000986 */
/* 0x0001e8000c101904 */
/*03f0*/ @P0 STG.E [R4.64], RZ ; /* 0x000000ff04000986 */
/* 0x0001e8000c101904 */
/*0400*/ @P0 STG.E [R4.64+0x4], RZ ; /* 0x000004ff04000986 */
/* 0x0001e2000c101904 */
/*0410*/ @!P3 BRA 0x2a0 ; /* 0xfffffe800000b947 */
/* 0x000fea000383ffff */
/*0420*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x0] ; /* 0x00000000ff037624 */
/* 0x001fc800078e00ff */
/*0430*/ IMAD R10, R3, 0x2, R10 ; /* 0x00000002030a7824 */
/* 0x000fca00078e020a */
/*0440*/ ISETP.GE.AND P3, PT, R10, 0x40, PT ; /* 0x000000400a00780c */
/* 0x000fda0003f66270 */
/*0450*/ @!P3 BRA 0x1f0 ; /* 0xfffffd900000b947 */
/* 0x000fea000383ffff */
/*0460*/ @!P2 BRA 0x1c0 ; /* 0xfffffd500000a947 */
/* 0x000fea000383ffff */
/*0470*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x10] ; /* 0x00000400ff027624 */
/* 0x000fca00078e00ff */
/*0480*/ LEA R9, R2, R9, 0x6 ; /* 0x0000000902097211 */
/* 0x000fc800078e30ff */
/*0490*/ ISETP.GE.AND P2, PT, R9, c[0x0][0x17c], PT ; /* 0x00005f0009007a0c */
/* 0x000fda0003f46270 */
/*04a0*/ @!P2 BRA 0x150 ; /* 0xfffffca00000a947 */
/* 0x000fea000383ffff */
/*04b0*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0xc] ; /* 0x00000300ff037624 */
/* 0x000fc800078e00ff */
/*04c0*/ IMAD R0, R3, 0x40, R0 ; /* 0x0000004003007824 */
/* 0x000fca00078e0200 */
/*04d0*/ ISETP.GE.AND P2, PT, R0, c[0x0][0x178], PT ; /* 0x00005e0000007a0c */
/* 0x000fda0003f46270 */
/*04e0*/ @!P2 BRA 0x120 ; /* 0xfffffc300000a947 */
/* 0x000fea000383ffff */
/*04f0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0500*/ BRA 0x500; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0510*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0520*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0530*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0540*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0550*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0560*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0570*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0580*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0590*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Fatbin ptx code:
================
arch = sm_80
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed