Someone told me Blackwell does not support INT4 native.
Correct. Native and packed types supported can be found here.
Or do you mean the Tensor Core formats?
yes, tensor core.
From the PTX documentation at least the following mma instructions exist for blackwell
mma.sync.aligned.shape.row.col{.satfinite}.s32.atype.btype.s32 d, a, b, c;
.shape = {.m8n8k32, .m16n8k32, .m16n8k64}
.atype = {.u4, .s4};
.btype = {.u4, .s4};
.u4/.s4 integer type mma operation with .m8n8k32 shape sm_75 or higher.
.u4/.s4 integer type mma operation with .m16n8k32 and .m16n8k64 shapes requires sm_80 or higher.
Compiling the following device function with -dc
for different architectures, we can see that on Hopper and Blackwell an emulation sequence is used.
__device__
void func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){
asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(D[0]), "=r"(D[1])
: "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}
code for sm_75
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
....
/*0030*/ IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ; /* 0x0000000706067237 */
/* 0x020e7400003c540a */
....
code for sm_80
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
....
/*0040*/ IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ; /* 0x0000000706067237 */
....
code for sm_86
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
....
/*0040*/ IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ; /* 0x0000000706067237 */
/* 0x020fe200003c540a */
....
code for sm_89
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
....
/* 0x000fe40003800000 */
/*0040*/ IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ; /* 0x0000000706067237 */
....
code for sm_90
Function : __cuda_sm_9x_mma_sub_byte_internal_m8n8k32_s4_s4_satfinite
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
/*0000*/ LOP3.LUT R3, R4.reuse, 0xf00, RZ, 0xc0, !PT ; /* 0x00000f0004037812 */
/* 0x040fe200078ec0ff */
/*0010*/ IMAD.SHL.U32 R0, R4.reuse, 0x10000000, RZ ; /* 0x1000000004007824 */
/* 0x040fe200078e00ff */
/*0020*/ LOP3.LUT R11, R5.reuse, 0xf00, RZ, 0xc0, !PT ; /* 0x00000f00050b7812 */
/* 0x040fe200078ec0ff */
/*0030*/ IMAD.SHL.U32 R10, R5, 0x10000000, RZ ; /* 0x10000000050a7824 */
/* 0x000fe200078e00ff */
/*0040*/ LOP3.LUT R8, R4, 0xf000, RZ, 0xc0, !PT ; /* 0x0000f00004087812 */
/* 0x000fe200078ec0ff */
/*0050*/ IMAD.SHL.U32 R3, R3, 0x100000, RZ ; /* 0x0010000003037824 */
/* 0x000fe200078e00ff */
/*0060*/ SHF.R.S32.HI R0, RZ, 0x1c, R0 ; /* 0x0000001cff007819 */
/* 0x000fe20000011400 */
/*0070*/ IMAD.SHL.U32 R11, R11, 0x100000, RZ ; /* 0x001000000b0b7824 */
/* 0x000fe200078e00ff */
/*0080*/ SHF.R.S32.HI R10, RZ, 0x1c, R10 ; /* 0x0000001cff0a7819 */
/* 0x000fe2000001140a */
/*0090*/ IMAD.U32 R8, R8, 0x10000, RZ ; /* 0x0001000008087824 */
/* 0x000fe200078e00ff */
/*00a0*/ LOP3.LUT R0, R0, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff00007812 */
/* 0x000fe200078ec0ff */
/*00b0*/ WARPSYNC.ALL ; /* 0x0000000000007948 */
/* 0x000fea0003800000 */
/*00c0*/ NOP ; /* 0x0000000000007918 */
/* 0x000fe20000000000 */
/*00d0*/ SHF.R.S32.HI R9, RZ, 0x14, R3 ; /* 0x00000014ff097819 */
/* 0x000fc40000011403 */
/*00e0*/ LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff0a0a7812 */
/* 0x000fe400078ec0ff */
/*00f0*/ LOP3.LUT R0, R0, 0xff00, R9, 0xf8, !PT ; /* 0x0000ff0000007812 */
/* 0x000fe400078ef809 */
/*0100*/ LOP3.LUT R9, R4.reuse, 0xf0000, RZ, 0xc0, !PT ; /* 0x000f000004097812 */
/* 0x040fe400078ec0ff */
/*0110*/ SHF.R.S32.HI R11, RZ, 0x14, R11 ; /* 0x00000014ff0b7819 */
/* 0x000fe4000001140b */
/*0120*/ LOP3.LUT R3, R4, 0xf0, RZ, 0xc0, !PT ; /* 0x000000f004037812 */
/* 0x000fe200078ec0ff */
/*0130*/ IMAD.SHL.U32 R9, R9, 0x1000, RZ ; /* 0x0000100009097824 */
/* 0x000fe200078e00ff */
/*0140*/ LOP3.LUT R11, R10, 0xff00, R11, 0xf8, !PT ; /* 0x0000ff000a0b7812 */
/* 0x000fc400078ef80b */
/*0150*/ LOP3.LUT R10, R5, 0xf0, RZ, 0xc0, !PT ; /* 0x000000f0050a7812 */
/* 0x000fe200078ec0ff */
/*0160*/ IMAD.SHL.U32 R3, R3, 0x1000000, RZ ; /* 0x0100000003037824 */
/* 0x000fe200078e00ff */
/*0170*/ SHF.R.S32.HI R9, RZ, 0xc, R9 ; /* 0x0000000cff097819 */
/* 0x000fe40000011409 */
/*0180*/ LOP3.LUT R12, R5.reuse, 0xf0000, RZ, 0xc0, !PT ; /* 0x000f0000050c7812 */
/* 0x040fe200078ec0ff */
/*0190*/ IMAD.SHL.U32 R10, R10, 0x1000000, RZ ; /* 0x010000000a0a7824 */
/* 0x000fe200078e00ff */
/*01a0*/ LOP3.LUT R0, R0, 0xff0000, R9, 0xf8, !PT ; /* 0x00ff000000007812 */
/* 0x000fe400078ef809 */
/*01b0*/ LOP3.LUT R9, R5, 0xf000, RZ, 0xc0, !PT ; /* 0x0000f00005097812 */
/* 0x000fe200078ec0ff */
/*01c0*/ IMAD.SHL.U32 R12, R12, 0x1000, RZ ; /* 0x000010000c0c7824 */
/* 0x000fe200078e00ff */
/*01d0*/ SHF.R.S32.HI R3, RZ, 0x1c, R3 ; /* 0x0000001cff037819 */
/* 0x000fc40000011403 */
/*01e0*/ SHF.R.S32.HI R8, RZ, 0x14, R8 ; /* 0x00000014ff087819 */
/* 0x000fe20000011408 */
/*01f0*/ IMAD.U32 R9, R9, 0x10000, RZ ; /* 0x0001000009097824 */
/* 0x000fe200078e00ff */
/*0200*/ LOP3.LUT R3, R3, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff03037812 */
/* 0x000fe400078ec0ff */
/*0210*/ SHF.R.S32.HI R10, RZ, 0x1c, R10 ; /* 0x0000001cff0a7819 */
/* 0x000fe4000001140a */
/*0220*/ LOP3.LUT R3, R3, 0xff00, R8, 0xf8, !PT ; /* 0x0000ff0003037812 */
/* 0x000fe400078ef808 */
/*0230*/ SHF.R.S32.HI R9, RZ, 0x14, R9 ; /* 0x00000014ff097819 */
/* 0x000fe40000011409 */
/*0240*/ LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff0a0a7812 */
/* 0x000fc400078ec0ff */
/*0250*/ SHF.R.S32.HI R12, RZ, 0xc, R12 ; /* 0x0000000cff0c7819 */
/* 0x000fe4000001140c */
/*0260*/ LOP3.LUT R8, R4, 0xf00000, RZ, 0xc0, !PT ; /* 0x00f0000004087812 */
/* 0x000fe400078ec0ff */
/*0270*/ LOP3.LUT R13, R5, 0xf00000, RZ, 0xc0, !PT ; /* 0x00f00000050d7812 */
/* 0x000fe400078ec0ff */
/*0280*/ LOP3.LUT R10, R10, 0xff00, R9, 0xf8, !PT ; /* 0x0000ff000a0a7812 */
/* 0x000fe200078ef809 */
/*0290*/ IMAD.SHL.U32 R8, R8, 0x100, RZ ; /* 0x0000010008087824 */
/* 0x000fe200078e00ff */
/*02a0*/ LOP3.LUT R11, R11, 0xff0000, R12, 0xf8, !PT ; /* 0x00ff00000b0b7812 */
/* 0x000fe200078ef80c */
/*02b0*/ IMAD.SHL.U32 R13, R13, 0x100, RZ ; /* 0x000001000d0d7824 */
/* 0x000fe200078e00ff */
/*02c0*/ LOP3.LUT R9, R4, 0xf000000, RZ, 0xc0, !PT ; /* 0x0f00000004097812 */
/* 0x000fc400078ec0ff */
/*02d0*/ LOP3.LUT R12, R5, 0xf000000, RZ, 0xc0, !PT ; /* 0x0f000000050c7812 */
/* 0x000fe400078ec0ff */
/*02e0*/ SHF.R.S32.HI R8, RZ, 0xc, R8 ; /* 0x0000000cff087819 */
/* 0x000fe20000011408 */
/*02f0*/ IMAD.SHL.U32 R9, R9, 0x10, RZ ; /* 0x0000001009097824 */
/* 0x000fe200078e00ff */
/*0300*/ SHF.R.S32.HI R13, RZ, 0xc, R13 ; /* 0x0000000cff0d7819 */
/* 0x000fe2000001140d */
/*0310*/ IMAD.SHL.U32 R12, R12, 0x10, RZ ; /* 0x000000100c0c7824 */
/* 0x000fe200078e00ff */
/*0320*/ LOP3.LUT R4, R4, 0xf0000000, RZ, 0xc0, !PT ; /* 0xf000000004047812 */
/* 0x000fe400078ec0ff */
/*0330*/ LOP3.LUT R5, R5, 0xf0000000, RZ, 0xc0, !PT ; /* 0xf000000005057812 */
/* 0x000fe400078ec0ff */
/*0340*/ LOP3.LUT R3, R3, 0xff0000, R8, 0xf8, !PT ; /* 0x00ff000003037812 */
/* 0x000fc400078ef808 */
/*0350*/ LOP3.LUT R10, R10, 0xff0000, R13, 0xf8, !PT ; /* 0x00ff00000a0a7812 */
/* 0x000fe400078ef80d */
/*0360*/ SHF.R.S32.HI R9, RZ, 0x4, R9 ; /* 0x00000004ff097819 */
/* 0x000fe40000011409 */
/*0370*/ SHF.R.S32.HI R12, RZ, 0x4, R12 ; /* 0x00000004ff0c7819 */
/* 0x000fe4000001140c */
/*0380*/ SHF.R.S32.HI R4, RZ, 0x4, R4 ; /* 0x00000004ff047819 */
/* 0x000fe40000011404 */
/*0390*/ SHF.R.S32.HI R5, RZ, 0x4, R5 ; /* 0x00000004ff057819 */
/* 0x000fe40000011405 */
/*03a0*/ LOP3.LUT R0, R0, 0xff000000, R9, 0xf8, !PT ; /* 0xff00000000007812 */
/* 0x000fc400078ef809 */
/*03b0*/ LOP3.LUT R11, R11, 0xff000000, R12, 0xf8, !PT ; /* 0xff0000000b0b7812 */
/* 0x000fe400078ef80c */
/*03c0*/ LOP3.LUT R3, R3, 0xff000000, R4, 0xf8, !PT ; /* 0xff00000003037812 */
/* 0x000fe400078ef804 */
/*03d0*/ LOP3.LUT R10, R10, 0xff000000, R5, 0xf8, !PT ; /* 0xff0000000a0a7812 */
/* 0x000fc600078ef805 */
/*03e0*/ IMMA.8816.S8.S8 R4, R0.ROW, R11.COL, RZ ; /* 0x0000000b00047237 */
/* 0x000fe800000054ff */
/*03f0*/ IMMA.8816.S8.S8 R10, R3.ROW, R10.COL, RZ ; /* 0x0000000a030a7237 */
/* 0x000fda00000054ff */
/*0400*/ IMAD.IADD R3, R4, 0x1, R10 ; /* 0x0000000104037824 */
/* 0x000fe400078e020a */
/*0410*/ IMAD.IADD R0, R5, 0x1, R11 ; /* 0x0000000105007824 */
/* 0x000fe400078e020b */
/*0420*/ IMAD.IADD R4, R3, 0x1, R6 ; /* 0x0000000103047824 */
/* 0x000fe400078e0206 */
/*0430*/ IMAD.IADD R5, R0, 0x1, R7 ; /* 0x0000000100057824 */
/* 0x000fc600078e0207 */
/*0440*/ PLOP3.LUT P0, PT, R3.reuse.SIGN, R6.reuse.SIGN, R4.reuse.SIGN, 0x2, 0x0 ; /* 0x000000060300721f */
/* 0x1c0fe40000700204 */
/*0450*/ PLOP3.LUT P2, PT, R0.reuse.SIGN, R7.reuse.SIGN, R5.reuse.SIGN, 0x2, 0x0 ; /* 0x000000070000721f */
/* 0x1c0fe40000740205 */
/*0460*/ PLOP3.LUT P1, PT, R3.SIGN, R6.SIGN, R4.SIGN, 0x40, 0x0 ; /* 0x000000060300721f */
/* 0x000fe40000724004 */
/*0470*/ PLOP3.LUT P3, PT, R0.SIGN, R7.SIGN, R5.SIGN, 0x40, 0x0 ; /* 0x000000070000721f */
/* 0x000fe40000764005 */
/*0480*/ SEL R4, R4, 0x7fffffff, !P0 ; /* 0x7fffffff04047807 */
/* 0x000fe40004000000 */
/*0490*/ SEL R5, R5, 0x7fffffff, !P2 ; /* 0x7fffffff05057807 */
/* 0x000fc40005000000 */
/*04a0*/ SEL R4, R4, 0x80000000, !P1 ; /* 0x8000000004047807 */
/* 0x000fe40004800000 */
/*04b0*/ SEL R5, R5, 0x80000000, !P3 ; /* 0x8000000005057807 */
/* 0x000fe20005800000 */
/*04c0*/ RET.ABS.NODEC R20 0x0 ; /* 0x0000000014007950 */
/* 0x000fec0003e00000 */
/*04d0*/ BRA 0x4d0; /* 0xfffffffc00fc7947 */
/* 0x000fc0000383ffff */
/*04e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*04f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0500*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0510*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0520*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0530*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0540*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0550*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0560*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0570*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
/*0000*/ IADD3 R1, R1, -0x8, RZ ; /* 0xfffffff801017810 */
/* 0x000fe20007ffe0ff */
/*0010*/ IMAD.MOV.U32 R0, RZ, RZ, R6 ; /* 0x000000ffff007224 */
/* 0x000fe200078e0006 */
/*0020*/ MOV R3, R7 ; /* 0x0000000700037202 */
/* 0x000fe20000000f00 */
/*0030*/ IMAD.MOV.U32 R32, RZ, RZ, R4 ; /* 0x000000ffff207224 */
/* 0x000fe200078e0004 */
/*0040*/ MOV R33, R5 ; /* 0x0000000500217202 */
/* 0x000fe20000000f00 */
/*0050*/ STL [R1+0x4], R21 ; /* 0x0000041501007387 */
/* 0x000fe80000100800 */
/*0060*/ STL [R1], R20 ; /* 0x0000001401007387 */
/* 0x0001e20000100800 */
/*0070*/ LDC.64 R34, c[0x0][0x208] ; /* 0x00008200ff227b82 */
/* 0x000e640000000a00 */
/*0080*/ R2UR UR4, R34 ; /* 0x00000000220472ca */
/* 0x002fc400000e0000 */
/*0090*/ R2UR UR5, R35.reuse ; /* 0x00000000230572ca */
/* 0x040fe400000e0000 */
/*00a0*/ R2UR UR6, R34 ; /* 0x00000000220672ca */
/* 0x000fe400000e0000 */
/*00b0*/ R2UR UR7, R35 ; /* 0x00000000230772ca */
/* 0x000fd200000e0000 */
/*00c0*/ LD.E R6, desc[UR4][R8.64] ; /* 0x0000000408067980 */
/* 0x000368000c101900 */
/*00d0*/ LD.E R7, desc[UR6][R8.64+0x4] ; /* 0x0000040608077980 */
/* 0x000362000c101900 */
/*00e0*/ IMAD.MOV.U32 R4, RZ, RZ, R0 ; /* 0x000000ffff047224 */
/* 0x000fe400078e0000 */
/*00f0*/ IMAD.MOV.U32 R5, RZ, RZ, R3 ; /* 0x000000ffff057224 */
/* 0x000fe200078e0003 */
/*0100*/ MOV R20, 0x0 ; /* 0x0000000000147802 */
/* 0x001fe40000000f00 */
/*0110*/ MOV R21, 0x0 ; /* 0x0000000000157802 */
/* 0x000fce0000000f00 */
/*0120*/ CALL.ABS.NOINC 0x0 ; /* 0x0000000000007943 */
/* 0x022fea0003c00000 */
/*0130*/ R2UR UR4, R34.reuse ; /* 0x00000000220472ca */
/* 0x040fe400000e0000 */
/*0140*/ R2UR UR5, R35.reuse ; /* 0x00000000230572ca */
/* 0x040fe400000e0000 */
/*0150*/ R2UR UR6, R34 ; /* 0x00000000220672ca */
/* 0x000fe400000e0000 */
/*0160*/ R2UR UR7, R35 ; /* 0x00000000230772ca */
/* 0x000fd200000e0000 */
/*0170*/ ST.E desc[UR4][R32.64], R4 ; /* 0x0000000420007985 */
/* 0x000fe8000c101904 */
/*0180*/ ST.E desc[UR6][R32.64+0x4], R5 ; /* 0x0000040520007985 */
/* 0x0001e8000c101906 */
/*0190*/ LDL R20, [R1] ; /* 0x0000000001147983 */
/* 0x000e280000100800 */
/*01a0*/ LDL R21, [R1+0x4] ; /* 0x0000040001157983 */
/* 0x0002240000100800 */
/*01b0*/ IADD3 R1, R1, 0x8, RZ ; /* 0x0000000801017810 */
/* 0x002fe20007ffe0ff */
/*01c0*/ RET.ABS.NODEC R20 0x0 ; /* 0x0000000014007950 */
/* 0x001fec0003e00000 */
/*01d0*/ BRA 0x1d0; /* 0xfffffffc00fc7947 */
code for sm_100
.target sm_100
Function : __cuda_sm_9x_mma_sub_byte_internal_m8n8k32_s4_s4_satfinite
.headerflags @"EF_CUDA_SM100 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM100)"
/*0000*/ LOP3.LUT R3, R4.reuse, 0xf00, RZ, 0xc0, !PT ; /* 0x00000f0004037812 */
/* 0x040fe200078ec0ff */
/*0010*/ IMAD.SHL.U32 R0, R4.reuse, 0x10000000, RZ ; /* 0x1000000004007824 */
/* 0x040fe200078e00ff */
/*0020*/ LOP3.LUT R8, R4, 0xf000, RZ, 0xc0, !PT ; /* 0x0000f00004087812 */
/* 0x000fe200078ec0ff */
/*0030*/ IMAD.SHL.U32 R10, R5.reuse, 0x10000000, RZ ; /* 0x10000000050a7824 */
/* 0x040fe200078e00ff */
/*0040*/ LOP3.LUT R11, R5, 0xf00, RZ, 0xc0, !PT ; /* 0x00000f00050b7812 */
/* 0x000fe200078ec0ff */
/*0050*/ IMAD.SHL.U32 R3, R3, 0x100000, RZ ; /* 0x0010000003037824 */
/* 0x000fe200078e00ff */
/*0060*/ SHF.R.S32.HI R0, RZ, 0x1c, R0 ; /* 0x0000001cff007819 */
/* 0x000fe20000011400 */
/*0070*/ IMAD.U32 R8, R8, 0x10000, RZ ; /* 0x0001000008087824 */
/* 0x000fe200078e00ff */
/*0080*/ SHF.R.S32.HI R10, RZ, 0x1c, R10 ; /* 0x0000001cff0a7819 */
/* 0x000fe2000001140a */
/*0090*/ IMAD.SHL.U32 R11, R11, 0x100000, RZ ; /* 0x001000000b0b7824 */
/* 0x000fe200078e00ff */
/*00a0*/ SHF.R.S32.HI R9, RZ, 0x14, R3 ; /* 0x00000014ff097819 */
/* 0x000fe20000011403 */
/*00b0*/ WARPSYNC.ALL ; /* 0x0000000000007948 */
/* 0x000fea0003800000 */
/*00c0*/ NOP ; /* 0x0000000000007918 */
/* 0x000fe20000000000 */
/*00d0*/ LOP3.LUT R3, R4, 0xf0, RZ, 0xc0, !PT ; /* 0x000000f004037812 */
/* 0x000fc400078ec0ff */
/*00e0*/ LOP3.LUT R0, R0, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff00007812 */
/* 0x000fe400078ec0ff */
/*00f0*/ LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff0a0a7812 */
/* 0x000fe200078ec0ff */
/*0100*/ IMAD.SHL.U32 R3, R3, 0x1000000, RZ ; /* 0x0100000003037824 */
/* 0x000fe200078e00ff */
/*0110*/ LOP3.LUT R0, R0, 0xff00, R9, 0xf8, !PT ; /* 0x0000ff0000007812 */
/* 0x000fe400078ef809 */
/*0120*/ LOP3.LUT R9, R4, 0xf0000, RZ, 0xc0, !PT ; /* 0x000f000004097812 */
/* 0x000fe400078ec0ff */
/*0130*/ SHF.R.S32.HI R3, RZ, 0x1c, R3 ; /* 0x0000001cff037819 */
/* 0x000fe40000011403 */
/*0140*/ SHF.R.S32.HI R11, RZ, 0x14, R11 ; /* 0x00000014ff0b7819 */
/* 0x000fe2000001140b */
/*0150*/ IMAD.SHL.U32 R9, R9, 0x1000, RZ ; /* 0x0000100009097824 */
/* 0x000fe200078e00ff */
/*0160*/ SHF.R.S32.HI R8, RZ, 0x14, R8 ; /* 0x00000014ff087819 */
/* 0x000fc40000011408 */
/*0170*/ LOP3.LUT R3, R3, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff03037812 */
/* 0x000fe400078ec0ff */
/*0180*/ SHF.R.S32.HI R9, RZ, 0xc, R9 ; /* 0x0000000cff097819 */
/* 0x000fe40000011409 */
/*0190*/ LOP3.LUT R11, R10, 0xff00, R11, 0xf8, !PT ; /* 0x0000ff000a0b7812 */
/* 0x000fe400078ef80b */
/*01a0*/ LOP3.LUT R3, R3, 0xff00, R8, 0xf8, !PT ; /* 0x0000ff0003037812 */
/* 0x000fe400078ef808 */
/*01b0*/ LOP3.LUT R12, R5.reuse, 0xf0000, RZ, 0xc0, !PT ; /* 0x000f0000050c7812 */
/* 0x040fe400078ec0ff */
/*01c0*/ LOP3.LUT R10, R5, 0xf0, RZ, 0xc0, !PT ; /* 0x000000f0050a7812 */
/* 0x000fc400078ec0ff */
/*01d0*/ LOP3.LUT R8, R4, 0xf00000, RZ, 0xc0, !PT ; /* 0x00f0000004087812 */
/* 0x000fe200078ec0ff */
/*01e0*/ IMAD.SHL.U32 R12, R12, 0x1000, RZ ; /* 0x000010000c0c7824 */
/* 0x000fe200078e00ff */
/*01f0*/ LOP3.LUT R0, R0, 0xff0000, R9, 0xf8, !PT ; /* 0x00ff000000007812 */
/* 0x000fe200078ef809 */
/*0200*/ IMAD.SHL.U32 R10, R10, 0x1000000, RZ ; /* 0x010000000a0a7824 */
/* 0x000fe200078e00ff */
/*0210*/ LOP3.LUT R9, R5.reuse, 0xf000, RZ, 0xc0, !PT ; /* 0x0000f00005097812 */
/* 0x040fe200078ec0ff */
/*0220*/ IMAD.SHL.U32 R8, R8, 0x100, RZ ; /* 0x0000010008087824 */
/* 0x000fe200078e00ff */
/*0230*/ LOP3.LUT R13, R5, 0xf00000, RZ, 0xc0, !PT ; /* 0x00f00000050d7812 */
/* 0x000fe400078ec0ff */
/*0240*/ SHF.R.S32.HI R12, RZ, 0xc, R12 ; /* 0x0000000cff0c7819 */
/* 0x000fe2000001140c */
/*0250*/ IMAD.U32 R9, R9, 0x10000, RZ ; /* 0x0001000009097824 */
/* 0x000fe200078e00ff */
/*0260*/ SHF.R.S32.HI R10, RZ, 0x1c, R10 ; /* 0x0000001cff0a7819 */
/* 0x000fe2000001140a */
/*0270*/ IMAD.SHL.U32 R13, R13, 0x100, RZ ; /* 0x000001000d0d7824 */
/* 0x000fe200078e00ff */
/*0280*/ SHF.R.S32.HI R8, RZ, 0xc, R8 ; /* 0x0000000cff087819 */
/* 0x000fc40000011408 */
/*0290*/ LOP3.LUT R11, R11, 0xff0000, R12, 0xf8, !PT ; /* 0x00ff00000b0b7812 */
/* 0x000fe400078ef80c */
/*02a0*/ SHF.R.S32.HI R9, RZ, 0x14, R9 ; /* 0x00000014ff097819 */
/* 0x000fe40000011409 */
/*02b0*/ LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ; /* 0x000000ff0a0a7812 */
/* 0x000fe400078ec0ff */
/*02c0*/ LOP3.LUT R12, R5, 0xf000000, RZ, 0xc0, !PT ; /* 0x0f000000050c7812 */
/* 0x000fe400078ec0ff */
/*02d0*/ LOP3.LUT R3, R3, 0xff0000, R8, 0xf8, !PT ; /* 0x00ff000003037812 */
/* 0x000fe400078ef808 */
/*02e0*/ LOP3.LUT R8, R4, 0xf000000, RZ, 0xc0, !PT ; /* 0x0f00000004087812 */
/* 0x000fe200078ec0ff */
/*02f0*/ IMAD.SHL.U32 R12, R12, 0x10, RZ ; /* 0x000000100c0c7824 */
/* 0x000fe200078e00ff */
/*0300*/ LOP3.LUT R9, R10, 0xff00, R9, 0xf8, !PT ; /* 0x0000ff000a097812 */
/* 0x000fc400078ef809 */
/*0310*/ SHF.R.S32.HI R10, RZ, 0xc, R13 ; /* 0x0000000cff0a7819 */
/* 0x000fe2000001140d */
/*0320*/ IMAD.SHL.U32 R8, R8, 0x10, RZ ; /* 0x0000001008087824 */
/* 0x000fe200078e00ff */
/*0330*/ LOP3.LUT R5, R5, 0xf0000000, RZ, 0xc0, !PT ; /* 0xf000000005057812 */
/* 0x000fe400078ec0ff */
/*0340*/ LOP3.LUT R4, R4, 0xf0000000, RZ, 0xc0, !PT ; /* 0xf000000004047812 */
/* 0x000fe400078ec0ff */
/*0350*/ LOP3.LUT R9, R9, 0xff0000, R10, 0xf8, !PT ; /* 0x00ff000009097812 */
/* 0x000fe400078ef80a */
/*0360*/ SHF.R.S32.HI R12, RZ, 0x4, R12 ; /* 0x00000004ff0c7819 */
/* 0x000fe4000001140c */
/*0370*/ SHF.R.S32.HI R10, RZ, 0x4, R5 ; /* 0x00000004ff0a7819 */
/* 0x000fc40000011405 */
/*0380*/ SHF.R.S32.HI R5, RZ, 0x4, R8 ; /* 0x00000004ff057819 */
/* 0x000fe40000011408 */
/*0390*/ SHF.R.S32.HI R4, RZ, 0x4, R4 ; /* 0x00000004ff047819 */
/* 0x000fe40000011404 */
/*03a0*/ LOP3.LUT R11, R11, 0xff000000, R12, 0xf8, !PT ; /* 0xff0000000b0b7812 */
/* 0x000fe400078ef80c */
/*03b0*/ LOP3.LUT R12, R9, 0xff000000, R10, 0xf8, !PT ; /* 0xff000000090c7812 */
/* 0x000fe400078ef80a */
/*03c0*/ LOP3.LUT R8, R0, 0xff000000, R5, 0xf8, !PT ; /* 0xff00000000087812 */
/* 0x000fe400078ef805 */
/*03d0*/ LOP3.LUT R4, R3, 0xff000000, R4, 0xf8, !PT ; /* 0xff00000003047812 */
/* 0x000fe200078ef804 */
/*03e0*/ IMAD.MOV.U32 R9, RZ, RZ, RZ ; /* 0x000000ffff097224 */
/* 0x000fc400078e00ff */
/*03f0*/ IMAD.MOV.U32 R5, RZ, RZ, RZ ; /* 0x000000ffff057224 */
/* 0x000fca00078e00ff */
/*0400*/ IMMA.16816.S8.S8 R8, R8.ROW, R11.COL, RZ ; /* 0x0000000b08087237 */
/* 0x000ff000004054ff */
/*0410*/ IMMA.16816.S8.S8 R12, R4.ROW, R12.COL, RZ ; /* 0x0000000c040c7237 */
/* 0x000fde00004054ff */
/*0420*/ NOP ; /* 0x0000000000007918 */
/* 0x000fc80000000000 */
/*0430*/ IMAD.IADD R3, R8, 0x1, R12 ; /* 0x0000000108037824 */
/* 0x000fe400078e020c */
/*0440*/ IMAD.IADD R0, R9, 0x1, R13 ; /* 0x0000000109007824 */
/* 0x000fe400078e020d */
/*0450*/ IMAD.IADD R4, R3, 0x1, R6 ; /* 0x0000000103047824 */
/* 0x000fe400078e0206 */
/*0460*/ IMAD.IADD R5, R0, 0x1, R7 ; /* 0x0000000100057824 */
/* 0x000fc600078e0207 */
/*0470*/ PLOP3.LUT P0, PT, R3.reuse.SIGN, R6.reuse.SIGN, R4.reuse.SIGN, 0x2, 0x0 ; /* 0x000000060300721f */
/* 0x1c0fe40000700204 */
/*0480*/ PLOP3.LUT P2, PT, R0.reuse.SIGN, R7.reuse.SIGN, R5.reuse.SIGN, 0x2, 0x0 ; /* 0x000000070000721f */
/* 0x1c0fe40000740205 */
/*0490*/ PLOP3.LUT P1, PT, R3.SIGN, R6.SIGN, R4.SIGN, 0x40, 0x0 ; /* 0x000000060300721f */
/* 0x000fe40000724004 */
/*04a0*/ PLOP3.LUT P3, PT, R0.SIGN, R7.SIGN, R5.SIGN, 0x40, 0x0 ; /* 0x000000070000721f */
/* 0x000fe40000764005 */
/*04b0*/ SEL R4, R4, 0x7fffffff, !P0 ; /* 0x7fffffff04047807 */
/* 0x000fe40004000000 */
/*04c0*/ SEL R5, R5, 0x7fffffff, !P2 ; /* 0x7fffffff05057807 */
/* 0x000fc40005000000 */
/*04d0*/ SEL R4, R4, 0x80000000, !P1 ; /* 0x8000000004047807 */
/* 0x000fe40004800000 */
/*04e0*/ SEL R5, R5, 0x80000000, !P3 ; /* 0x8000000005057807 */
/* 0x000fe20005800000 */
/*04f0*/ RET.ABS.NODEC R20 0x0 ; /* 0x0000000014007950 */
/* 0x000fec0003e00000 */
/*0500*/ BRA 0x500; /* 0xfffffffc00fc7947 */
/* 0x000fc0000383ffff */
/*0510*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0520*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0530*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0540*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0550*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0560*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0570*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0580*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0590*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*05f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Function : _Z4funcRA2_jjjS0_
.headerflags @"EF_CUDA_SM100 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM100)"
/*0000*/ IADD3 R1, PT, PT, R1, -0x8, RZ ; /* 0xfffffff801017810 */
/* 0x000fe20007ffe0ff */
/*0010*/ IMAD.MOV.U32 R0, RZ, RZ, R6 ; /* 0x000000ffff007224 */
/* 0x000fe200078e0006 */
/*0020*/ MOV R3, R7 ; /* 0x0000000700037202 */
/* 0x000fe20000000f00 */
/*0030*/ IMAD.MOV.U32 R32, RZ, RZ, R4 ; /* 0x000000ffff207224 */
/* 0x000fe200078e0004 */
/*0040*/ MOV R33, R5 ; /* 0x0000000500217202 */
/* 0x000fe20000000f00 */
/*0050*/ STL [R1+0x4], R21 ; /* 0x0000041501007387 */
/* 0x000fe80000100800 */
/*0060*/ STL [R1], R20 ; /* 0x0000001401007387 */
/* 0x0001e20000100800 */
/*0070*/ LDC.64 R34, c[0x0][0x358] ; /* 0x0000d600ff227b82 */
/* 0x000e640000000a00 */
/*0080*/ R2UR UR4, R34 ; /* 0x00000000220472ca */
/* 0x002fc400000e0000 */
/*0090*/ R2UR UR5, R35.reuse ; /* 0x00000000230572ca */
/* 0x040fe400000e0000 */
/*00a0*/ R2UR UR6, R34 ; /* 0x00000000220672ca */
/* 0x000fe400000e0000 */
/*00b0*/ R2UR UR7, R35 ; /* 0x00000000230772ca */
/* 0x000fd200000e0000 */
/*00c0*/ LD.E R6, desc[UR4][R8.64] ; /* 0x0000000408067980 */
/* 0x000368000c101900 */
/*00d0*/ LD.E R7, desc[UR6][R8.64+0x4] ; /* 0x0000040608077980 */
/* 0x000362000c101900 */
/*00e0*/ IMAD.MOV.U32 R4, RZ, RZ, R0 ; /* 0x000000ffff047224 */
/* 0x000fe400078e0000 */
/*00f0*/ IMAD.MOV.U32 R5, RZ, RZ, R3 ; /* 0x000000ffff057224 */
/* 0x000fe200078e0003 */
/*0100*/ MOV R20, 0x0 ; /* 0x0000000000147802 */
/* 0x001fe40000000f00 */
/*0110*/ MOV R21, 0x0 ; /* 0x0000000000157802 */
/* 0x000fce0000000f00 */
/*0120*/ CALL.ABS.NOINC 0x0 ; /* 0x0000000000007943 */
/* 0x022fea0003c00000 */
/*0130*/ R2UR UR4, R34.reuse ; /* 0x00000000220472ca */
/* 0x040fe400000e0000 */
/*0140*/ R2UR UR5, R35.reuse ; /* 0x00000000230572ca */
/* 0x040fe400000e0000 */
/*0150*/ R2UR UR6, R34 ; /* 0x00000000220672ca */
/* 0x000fe400000e0000 */
/*0160*/ R2UR UR7, R35 ; /* 0x00000000230772ca */
/* 0x000fd200000e0000 */
/*0170*/ ST.E desc[UR4][R32.64], R4 ; /* 0x0000000420007985 */
/* 0x000fe8000c101904 */
/*0180*/ ST.E desc[UR6][R32.64+0x4], R5 ; /* 0x0000040520007985 */
/* 0x0001e8000c101906 */
/*0190*/ LDL R20, [R1] ; /* 0x0000000001147983 */
/* 0x000e280000100800 */
/*01a0*/ LDL R21, [R1+0x4] ; /* 0x0000040001157983 */
/* 0x0002240000100800 */
/*01b0*/ IADD3 R1, PT, PT, R1, 0x8, RZ ; /* 0x0000000801017810 */
/* 0x002fe20007ffe0ff */
/*01c0*/ RET.ABS.NODEC R20 0x0 ; /* 0x0000000014007950 */
/* 0x001fec0003e00000 */
/*01d0*/ BRA 0x1d0; /* 0xfffffffc00fc7947 */
While you are at it, could you test 1 bit XOR and 1 bit AND, please? XOR was deprecated with 9.0, too.
__device__
void wmma_xor_popc_func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){
asm volatile("wmma.mma.xor.popc.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(D[0]), "=r"(D[1])
: "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}
__device__
void wmma_and_popc_func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){
asm volatile("wmma.mma.and.popc.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(D[0]), "=r"(D[1])
: "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}
Ada: uses BMMA.88128.XOR.POPC
and BMMA.88128.AND.POPC
Hopper: AND uses BMMA.88128.AND.POPC
, XOR short emulation with two of those instructions.
Blackwell: AND long emulation with IMMA.16832.U8.U8
. XOR emulation based on AND
Thank you.
Have I understood correctly? XOR can be emulated with two ANDs?
The SASS code for xor looks like this on Hopper. (or godbold here: Compiler Explorer)
code for sm_90
Function : __cuda_sm_9x_mma_bit_internal_xor_m8n8k128
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
/*0000*/ WARPSYNC.ALL ; /* 0x0000000000007948 */
/* 0x000fea0003800000 */
/*0010*/ NOP ; /* 0x0000000000007918 */
/* 0x000fe20000000000 */
/*0020*/ LOP3.LUT R0, RZ, R4, RZ, 0x33, !PT ; /* 0x00000004ff007212 */
/* 0x000fe400078e33ff */
/*0030*/ LOP3.LUT R3, RZ, R5, RZ, 0x33, !PT ; /* 0x00000005ff037212 */
/* 0x000fce00078e33ff */
/*0040*/ BMMA.88128.AND.POPC R8, R4.ROW, R3.COL, RZ ; /* 0x000000030408723d */
/* 0x000fe800000144ff */
/*0050*/ BMMA.88128.AND.POPC R4, R0.ROW, R5.COL, RZ ; /* 0x000000050004723d */
/* 0x000fda00000144ff */
/*0060*/ IADD3 R4, R6, R8, R4 ; /* 0x0000000806047210 */
/* 0x000fe40007ffe004 */
/*0070*/ IADD3 R5, R7, R9, R5 ; /* 0x0000000907057210 */
/* 0x000fe20007ffe005 */
/*0080*/ RET.ABS.NODEC R20 0x0 ; /* 0x0000000014007950 */
/* 0x000fec0003e00000 */
/*0090*/ BRA 0x90; /* 0xfffffffc00fc7947 */