Does Blackwell support INT4 native?

Someone told me Blackwell does not support INT4 native.

Correct. Native and packed types supported can be found here.

Or do you mean the Tensor Core formats?

yes, tensor core.

From the PTX documentation at least the following mma instructions exist for blackwell

mma.sync.aligned.shape.row.col{.satfinite}.s32.atype.btype.s32 d, a, b, c;

.shape   = {.m8n8k32, .m16n8k32, .m16n8k64}
.atype   = {.u4, .s4};
.btype   = {.u4, .s4};

.u4/.s4 integer type mma operation with .m8n8k32 shape sm_75 or higher.
.u4/.s4 integer type mma operation with .m16n8k32 and .m16n8k64 shapes requires sm_80 or higher.

Compiling the following device function with -dc for different architectures, we can see that on Hopper and Blackwell an emulation sequence is used.

__device__
void func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){
    asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}
	code for sm_75
		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
       ....
        /*0030*/                   IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ;  /* 0x0000000706067237 */
                                                                                  /* 0x020e7400003c540a */
       ....



code for sm_80
		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
       ....
        /*0040*/                   IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ;  /* 0x0000000706067237 */
       ....


	code for sm_86
		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
       ....
        /*0040*/                   IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ;  /* 0x0000000706067237 */
                                                                                  /* 0x020fe200003c540a */
        ....


	code for sm_89
		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
       ....
                                                                                  /* 0x000fe40003800000 */
        /*0040*/                   IMMA.8832.S4.S4.SAT R6, R6.ROW, R7.COL, R10 ;  /* 0x0000000706067237 */
       ....


code for sm_90
		Function : __cuda_sm_9x_mma_sub_byte_internal_m8n8k32_s4_s4_satfinite
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   LOP3.LUT R3, R4.reuse, 0xf00, RZ, 0xc0, !PT ;                              /* 0x00000f0004037812 */
                                                                                                              /* 0x040fe200078ec0ff */
        /*0010*/                   IMAD.SHL.U32 R0, R4.reuse, 0x10000000, RZ ;                                /* 0x1000000004007824 */
                                                                                                              /* 0x040fe200078e00ff */
        /*0020*/                   LOP3.LUT R11, R5.reuse, 0xf00, RZ, 0xc0, !PT ;                             /* 0x00000f00050b7812 */
                                                                                                              /* 0x040fe200078ec0ff */
        /*0030*/                   IMAD.SHL.U32 R10, R5, 0x10000000, RZ ;                                     /* 0x10000000050a7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0040*/                   LOP3.LUT R8, R4, 0xf000, RZ, 0xc0, !PT ;                                   /* 0x0000f00004087812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0050*/                   IMAD.SHL.U32 R3, R3, 0x100000, RZ ;                                        /* 0x0010000003037824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0060*/                   SHF.R.S32.HI R0, RZ, 0x1c, R0 ;                                            /* 0x0000001cff007819 */
                                                                                                              /* 0x000fe20000011400 */
        /*0070*/                   IMAD.SHL.U32 R11, R11, 0x100000, RZ ;                                      /* 0x001000000b0b7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0080*/                   SHF.R.S32.HI R10, RZ, 0x1c, R10 ;                                          /* 0x0000001cff0a7819 */
                                                                                                              /* 0x000fe2000001140a */
        /*0090*/                   IMAD.U32 R8, R8, 0x10000, RZ ;                                             /* 0x0001000008087824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*00a0*/                   LOP3.LUT R0, R0, 0xff, RZ, 0xc0, !PT ;                                     /* 0x000000ff00007812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*00b0*/                   WARPSYNC.ALL ;                                                             /* 0x0000000000007948 */
                                                                                                              /* 0x000fea0003800000 */
        /*00c0*/                   NOP ;                                                                      /* 0x0000000000007918 */
                                                                                                              /* 0x000fe20000000000 */
        /*00d0*/                   SHF.R.S32.HI R9, RZ, 0x14, R3 ;                                            /* 0x00000014ff097819 */
                                                                                                              /* 0x000fc40000011403 */
        /*00e0*/                   LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ;                                   /* 0x000000ff0a0a7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*00f0*/                   LOP3.LUT R0, R0, 0xff00, R9, 0xf8, !PT ;                                   /* 0x0000ff0000007812 */
                                                                                                              /* 0x000fe400078ef809 */
        /*0100*/                   LOP3.LUT R9, R4.reuse, 0xf0000, RZ, 0xc0, !PT ;                            /* 0x000f000004097812 */
                                                                                                              /* 0x040fe400078ec0ff */
        /*0110*/                   SHF.R.S32.HI R11, RZ, 0x14, R11 ;                                          /* 0x00000014ff0b7819 */
                                                                                                              /* 0x000fe4000001140b */
        /*0120*/                   LOP3.LUT R3, R4, 0xf0, RZ, 0xc0, !PT ;                                     /* 0x000000f004037812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0130*/                   IMAD.SHL.U32 R9, R9, 0x1000, RZ ;                                          /* 0x0000100009097824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0140*/                   LOP3.LUT R11, R10, 0xff00, R11, 0xf8, !PT ;                                /* 0x0000ff000a0b7812 */
                                                                                                              /* 0x000fc400078ef80b */
        /*0150*/                   LOP3.LUT R10, R5, 0xf0, RZ, 0xc0, !PT ;                                    /* 0x000000f0050a7812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0160*/                   IMAD.SHL.U32 R3, R3, 0x1000000, RZ ;                                       /* 0x0100000003037824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0170*/                   SHF.R.S32.HI R9, RZ, 0xc, R9 ;                                             /* 0x0000000cff097819 */
                                                                                                              /* 0x000fe40000011409 */
        /*0180*/                   LOP3.LUT R12, R5.reuse, 0xf0000, RZ, 0xc0, !PT ;                           /* 0x000f0000050c7812 */
                                                                                                              /* 0x040fe200078ec0ff */
        /*0190*/                   IMAD.SHL.U32 R10, R10, 0x1000000, RZ ;                                     /* 0x010000000a0a7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*01a0*/                   LOP3.LUT R0, R0, 0xff0000, R9, 0xf8, !PT ;                                 /* 0x00ff000000007812 */
                                                                                                              /* 0x000fe400078ef809 */
        /*01b0*/                   LOP3.LUT R9, R5, 0xf000, RZ, 0xc0, !PT ;                                   /* 0x0000f00005097812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*01c0*/                   IMAD.SHL.U32 R12, R12, 0x1000, RZ ;                                        /* 0x000010000c0c7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*01d0*/                   SHF.R.S32.HI R3, RZ, 0x1c, R3 ;                                            /* 0x0000001cff037819 */
                                                                                                              /* 0x000fc40000011403 */
        /*01e0*/                   SHF.R.S32.HI R8, RZ, 0x14, R8 ;                                            /* 0x00000014ff087819 */
                                                                                                              /* 0x000fe20000011408 */
        /*01f0*/                   IMAD.U32 R9, R9, 0x10000, RZ ;                                             /* 0x0001000009097824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0200*/                   LOP3.LUT R3, R3, 0xff, RZ, 0xc0, !PT ;                                     /* 0x000000ff03037812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0210*/                   SHF.R.S32.HI R10, RZ, 0x1c, R10 ;                                          /* 0x0000001cff0a7819 */
                                                                                                              /* 0x000fe4000001140a */
        /*0220*/                   LOP3.LUT R3, R3, 0xff00, R8, 0xf8, !PT ;                                   /* 0x0000ff0003037812 */
                                                                                                              /* 0x000fe400078ef808 */
        /*0230*/                   SHF.R.S32.HI R9, RZ, 0x14, R9 ;                                            /* 0x00000014ff097819 */
                                                                                                              /* 0x000fe40000011409 */
        /*0240*/                   LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ;                                   /* 0x000000ff0a0a7812 */
                                                                                                              /* 0x000fc400078ec0ff */
        /*0250*/                   SHF.R.S32.HI R12, RZ, 0xc, R12 ;                                           /* 0x0000000cff0c7819 */
                                                                                                              /* 0x000fe4000001140c */
        /*0260*/                   LOP3.LUT R8, R4, 0xf00000, RZ, 0xc0, !PT ;                                 /* 0x00f0000004087812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0270*/                   LOP3.LUT R13, R5, 0xf00000, RZ, 0xc0, !PT ;                                /* 0x00f00000050d7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0280*/                   LOP3.LUT R10, R10, 0xff00, R9, 0xf8, !PT ;                                 /* 0x0000ff000a0a7812 */
                                                                                                              /* 0x000fe200078ef809 */
        /*0290*/                   IMAD.SHL.U32 R8, R8, 0x100, RZ ;                                           /* 0x0000010008087824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*02a0*/                   LOP3.LUT R11, R11, 0xff0000, R12, 0xf8, !PT ;                              /* 0x00ff00000b0b7812 */
                                                                                                              /* 0x000fe200078ef80c */
        /*02b0*/                   IMAD.SHL.U32 R13, R13, 0x100, RZ ;                                         /* 0x000001000d0d7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*02c0*/                   LOP3.LUT R9, R4, 0xf000000, RZ, 0xc0, !PT ;                                /* 0x0f00000004097812 */
                                                                                                              /* 0x000fc400078ec0ff */
        /*02d0*/                   LOP3.LUT R12, R5, 0xf000000, RZ, 0xc0, !PT ;                               /* 0x0f000000050c7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*02e0*/                   SHF.R.S32.HI R8, RZ, 0xc, R8 ;                                             /* 0x0000000cff087819 */
                                                                                                              /* 0x000fe20000011408 */
        /*02f0*/                   IMAD.SHL.U32 R9, R9, 0x10, RZ ;                                            /* 0x0000001009097824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0300*/                   SHF.R.S32.HI R13, RZ, 0xc, R13 ;                                           /* 0x0000000cff0d7819 */
                                                                                                              /* 0x000fe2000001140d */
        /*0310*/                   IMAD.SHL.U32 R12, R12, 0x10, RZ ;                                          /* 0x000000100c0c7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0320*/                   LOP3.LUT R4, R4, 0xf0000000, RZ, 0xc0, !PT ;                               /* 0xf000000004047812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0330*/                   LOP3.LUT R5, R5, 0xf0000000, RZ, 0xc0, !PT ;                               /* 0xf000000005057812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0340*/                   LOP3.LUT R3, R3, 0xff0000, R8, 0xf8, !PT ;                                 /* 0x00ff000003037812 */
                                                                                                              /* 0x000fc400078ef808 */
        /*0350*/                   LOP3.LUT R10, R10, 0xff0000, R13, 0xf8, !PT ;                              /* 0x00ff00000a0a7812 */
                                                                                                              /* 0x000fe400078ef80d */
        /*0360*/                   SHF.R.S32.HI R9, RZ, 0x4, R9 ;                                             /* 0x00000004ff097819 */
                                                                                                              /* 0x000fe40000011409 */
        /*0370*/                   SHF.R.S32.HI R12, RZ, 0x4, R12 ;                                           /* 0x00000004ff0c7819 */
                                                                                                              /* 0x000fe4000001140c */
        /*0380*/                   SHF.R.S32.HI R4, RZ, 0x4, R4 ;                                             /* 0x00000004ff047819 */
                                                                                                              /* 0x000fe40000011404 */
        /*0390*/                   SHF.R.S32.HI R5, RZ, 0x4, R5 ;                                             /* 0x00000004ff057819 */
                                                                                                              /* 0x000fe40000011405 */
        /*03a0*/                   LOP3.LUT R0, R0, 0xff000000, R9, 0xf8, !PT ;                               /* 0xff00000000007812 */
                                                                                                              /* 0x000fc400078ef809 */
        /*03b0*/                   LOP3.LUT R11, R11, 0xff000000, R12, 0xf8, !PT ;                            /* 0xff0000000b0b7812 */
                                                                                                              /* 0x000fe400078ef80c */
        /*03c0*/                   LOP3.LUT R3, R3, 0xff000000, R4, 0xf8, !PT ;                               /* 0xff00000003037812 */
                                                                                                              /* 0x000fe400078ef804 */
        /*03d0*/                   LOP3.LUT R10, R10, 0xff000000, R5, 0xf8, !PT ;                             /* 0xff0000000a0a7812 */
                                                                                                              /* 0x000fc600078ef805 */
        /*03e0*/                   IMMA.8816.S8.S8 R4, R0.ROW, R11.COL, RZ ;                                  /* 0x0000000b00047237 */
                                                                                                              /* 0x000fe800000054ff */
        /*03f0*/                   IMMA.8816.S8.S8 R10, R3.ROW, R10.COL, RZ ;                                 /* 0x0000000a030a7237 */
                                                                                                              /* 0x000fda00000054ff */
        /*0400*/                   IMAD.IADD R3, R4, 0x1, R10 ;                                               /* 0x0000000104037824 */
                                                                                                              /* 0x000fe400078e020a */
        /*0410*/                   IMAD.IADD R0, R5, 0x1, R11 ;                                               /* 0x0000000105007824 */
                                                                                                              /* 0x000fe400078e020b */
        /*0420*/                   IMAD.IADD R4, R3, 0x1, R6 ;                                                /* 0x0000000103047824 */
                                                                                                              /* 0x000fe400078e0206 */
        /*0430*/                   IMAD.IADD R5, R0, 0x1, R7 ;                                                /* 0x0000000100057824 */
                                                                                                              /* 0x000fc600078e0207 */
        /*0440*/                   PLOP3.LUT P0, PT, R3.reuse.SIGN, R6.reuse.SIGN, R4.reuse.SIGN, 0x2, 0x0 ;  /* 0x000000060300721f */
                                                                                                              /* 0x1c0fe40000700204 */
        /*0450*/                   PLOP3.LUT P2, PT, R0.reuse.SIGN, R7.reuse.SIGN, R5.reuse.SIGN, 0x2, 0x0 ;  /* 0x000000070000721f */
                                                                                                              /* 0x1c0fe40000740205 */
        /*0460*/                   PLOP3.LUT P1, PT, R3.SIGN, R6.SIGN, R4.SIGN, 0x40, 0x0 ;                   /* 0x000000060300721f */
                                                                                                              /* 0x000fe40000724004 */
        /*0470*/                   PLOP3.LUT P3, PT, R0.SIGN, R7.SIGN, R5.SIGN, 0x40, 0x0 ;                   /* 0x000000070000721f */
                                                                                                              /* 0x000fe40000764005 */
        /*0480*/                   SEL R4, R4, 0x7fffffff, !P0 ;                                              /* 0x7fffffff04047807 */
                                                                                                              /* 0x000fe40004000000 */
        /*0490*/                   SEL R5, R5, 0x7fffffff, !P2 ;                                              /* 0x7fffffff05057807 */
                                                                                                              /* 0x000fc40005000000 */
        /*04a0*/                   SEL R4, R4, 0x80000000, !P1 ;                                              /* 0x8000000004047807 */
                                                                                                              /* 0x000fe40004800000 */
        /*04b0*/                   SEL R5, R5, 0x80000000, !P3 ;                                              /* 0x8000000005057807 */
                                                                                                              /* 0x000fe20005800000 */
        /*04c0*/                   RET.ABS.NODEC R20 0x0 ;                                                    /* 0x0000000014007950 */
                                                                                                              /* 0x000fec0003e00000 */
        /*04d0*/                   BRA 0x4d0;                                                                 /* 0xfffffffc00fc7947 */
                                                                                                              /* 0x000fc0000383ffff */
        /*04e0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*04f0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0500*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0510*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0520*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0530*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0540*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0550*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0560*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0570*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   IADD3 R1, R1, -0x8, RZ ;                                                   /* 0xfffffff801017810 */
                                                                                                              /* 0x000fe20007ffe0ff */
        /*0010*/                   IMAD.MOV.U32 R0, RZ, RZ, R6 ;                                              /* 0x000000ffff007224 */
                                                                                                              /* 0x000fe200078e0006 */
        /*0020*/                   MOV R3, R7 ;                                                               /* 0x0000000700037202 */
                                                                                                              /* 0x000fe20000000f00 */
        /*0030*/                   IMAD.MOV.U32 R32, RZ, RZ, R4 ;                                             /* 0x000000ffff207224 */
                                                                                                              /* 0x000fe200078e0004 */
        /*0040*/                   MOV R33, R5 ;                                                              /* 0x0000000500217202 */
                                                                                                              /* 0x000fe20000000f00 */
        /*0050*/                   STL [R1+0x4], R21 ;                                                        /* 0x0000041501007387 */
                                                                                                              /* 0x000fe80000100800 */
        /*0060*/                   STL [R1], R20 ;                                                            /* 0x0000001401007387 */
                                                                                                              /* 0x0001e20000100800 */
        /*0070*/                   LDC.64 R34, c[0x0][0x208] ;                                                /* 0x00008200ff227b82 */
                                                                                                              /* 0x000e640000000a00 */
        /*0080*/                   R2UR UR4, R34 ;                                                            /* 0x00000000220472ca */
                                                                                                              /* 0x002fc400000e0000 */
        /*0090*/                   R2UR UR5, R35.reuse ;                                                      /* 0x00000000230572ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*00a0*/                   R2UR UR6, R34 ;                                                            /* 0x00000000220672ca */
                                                                                                              /* 0x000fe400000e0000 */
        /*00b0*/                   R2UR UR7, R35 ;                                                            /* 0x00000000230772ca */
                                                                                                              /* 0x000fd200000e0000 */
        /*00c0*/                   LD.E R6, desc[UR4][R8.64] ;                                                /* 0x0000000408067980 */
                                                                                                              /* 0x000368000c101900 */
        /*00d0*/                   LD.E R7, desc[UR6][R8.64+0x4] ;                                            /* 0x0000040608077980 */
                                                                                                              /* 0x000362000c101900 */
        /*00e0*/                   IMAD.MOV.U32 R4, RZ, RZ, R0 ;                                              /* 0x000000ffff047224 */
                                                                                                              /* 0x000fe400078e0000 */
        /*00f0*/                   IMAD.MOV.U32 R5, RZ, RZ, R3 ;                                              /* 0x000000ffff057224 */
                                                                                                              /* 0x000fe200078e0003 */
        /*0100*/                   MOV R20, 0x0 ;                                                             /* 0x0000000000147802 */
                                                                                                              /* 0x001fe40000000f00 */
        /*0110*/                   MOV R21, 0x0 ;                                                             /* 0x0000000000157802 */
                                                                                                              /* 0x000fce0000000f00 */
        /*0120*/                   CALL.ABS.NOINC 0x0 ;                                                       /* 0x0000000000007943 */
                                                                                                              /* 0x022fea0003c00000 */
        /*0130*/                   R2UR UR4, R34.reuse ;                                                      /* 0x00000000220472ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*0140*/                   R2UR UR5, R35.reuse ;                                                      /* 0x00000000230572ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*0150*/                   R2UR UR6, R34 ;                                                            /* 0x00000000220672ca */
                                                                                                              /* 0x000fe400000e0000 */
        /*0160*/                   R2UR UR7, R35 ;                                                            /* 0x00000000230772ca */
                                                                                                              /* 0x000fd200000e0000 */
        /*0170*/                   ST.E desc[UR4][R32.64], R4 ;                                               /* 0x0000000420007985 */
                                                                                                              /* 0x000fe8000c101904 */
        /*0180*/                   ST.E desc[UR6][R32.64+0x4], R5 ;                                           /* 0x0000040520007985 */
                                                                                                              /* 0x0001e8000c101906 */
        /*0190*/                   LDL R20, [R1] ;                                                            /* 0x0000000001147983 */
                                                                                                              /* 0x000e280000100800 */
        /*01a0*/                   LDL R21, [R1+0x4] ;                                                        /* 0x0000040001157983 */
                                                                                                              /* 0x0002240000100800 */
        /*01b0*/                   IADD3 R1, R1, 0x8, RZ ;                                                    /* 0x0000000801017810 */
                                                                                                              /* 0x002fe20007ffe0ff */
        /*01c0*/                   RET.ABS.NODEC R20 0x0 ;                                                    /* 0x0000000014007950 */
                                                                                                              /* 0x001fec0003e00000 */
        /*01d0*/                   BRA 0x1d0;                                                                 /* 0xfffffffc00fc7947 */




code for sm_100
	.target	sm_100

		Function : __cuda_sm_9x_mma_sub_byte_internal_m8n8k32_s4_s4_satfinite
	.headerflags	@"EF_CUDA_SM100 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM100)"
        /*0000*/                   LOP3.LUT R3, R4.reuse, 0xf00, RZ, 0xc0, !PT ;                              /* 0x00000f0004037812 */
                                                                                                              /* 0x040fe200078ec0ff */
        /*0010*/                   IMAD.SHL.U32 R0, R4.reuse, 0x10000000, RZ ;                                /* 0x1000000004007824 */
                                                                                                              /* 0x040fe200078e00ff */
        /*0020*/                   LOP3.LUT R8, R4, 0xf000, RZ, 0xc0, !PT ;                                   /* 0x0000f00004087812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0030*/                   IMAD.SHL.U32 R10, R5.reuse, 0x10000000, RZ ;                               /* 0x10000000050a7824 */
                                                                                                              /* 0x040fe200078e00ff */
        /*0040*/                   LOP3.LUT R11, R5, 0xf00, RZ, 0xc0, !PT ;                                   /* 0x00000f00050b7812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0050*/                   IMAD.SHL.U32 R3, R3, 0x100000, RZ ;                                        /* 0x0010000003037824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0060*/                   SHF.R.S32.HI R0, RZ, 0x1c, R0 ;                                            /* 0x0000001cff007819 */
                                                                                                              /* 0x000fe20000011400 */
        /*0070*/                   IMAD.U32 R8, R8, 0x10000, RZ ;                                             /* 0x0001000008087824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0080*/                   SHF.R.S32.HI R10, RZ, 0x1c, R10 ;                                          /* 0x0000001cff0a7819 */
                                                                                                              /* 0x000fe2000001140a */
        /*0090*/                   IMAD.SHL.U32 R11, R11, 0x100000, RZ ;                                      /* 0x001000000b0b7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*00a0*/                   SHF.R.S32.HI R9, RZ, 0x14, R3 ;                                            /* 0x00000014ff097819 */
                                                                                                              /* 0x000fe20000011403 */
        /*00b0*/                   WARPSYNC.ALL ;                                                             /* 0x0000000000007948 */
                                                                                                              /* 0x000fea0003800000 */
        /*00c0*/                   NOP ;                                                                      /* 0x0000000000007918 */
                                                                                                              /* 0x000fe20000000000 */
        /*00d0*/                   LOP3.LUT R3, R4, 0xf0, RZ, 0xc0, !PT ;                                     /* 0x000000f004037812 */
                                                                                                              /* 0x000fc400078ec0ff */
        /*00e0*/                   LOP3.LUT R0, R0, 0xff, RZ, 0xc0, !PT ;                                     /* 0x000000ff00007812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*00f0*/                   LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ;                                   /* 0x000000ff0a0a7812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*0100*/                   IMAD.SHL.U32 R3, R3, 0x1000000, RZ ;                                       /* 0x0100000003037824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0110*/                   LOP3.LUT R0, R0, 0xff00, R9, 0xf8, !PT ;                                   /* 0x0000ff0000007812 */
                                                                                                              /* 0x000fe400078ef809 */
        /*0120*/                   LOP3.LUT R9, R4, 0xf0000, RZ, 0xc0, !PT ;                                  /* 0x000f000004097812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0130*/                   SHF.R.S32.HI R3, RZ, 0x1c, R3 ;                                            /* 0x0000001cff037819 */
                                                                                                              /* 0x000fe40000011403 */
        /*0140*/                   SHF.R.S32.HI R11, RZ, 0x14, R11 ;                                          /* 0x00000014ff0b7819 */
                                                                                                              /* 0x000fe2000001140b */
        /*0150*/                   IMAD.SHL.U32 R9, R9, 0x1000, RZ ;                                          /* 0x0000100009097824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0160*/                   SHF.R.S32.HI R8, RZ, 0x14, R8 ;                                            /* 0x00000014ff087819 */
                                                                                                              /* 0x000fc40000011408 */
        /*0170*/                   LOP3.LUT R3, R3, 0xff, RZ, 0xc0, !PT ;                                     /* 0x000000ff03037812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0180*/                   SHF.R.S32.HI R9, RZ, 0xc, R9 ;                                             /* 0x0000000cff097819 */
                                                                                                              /* 0x000fe40000011409 */
        /*0190*/                   LOP3.LUT R11, R10, 0xff00, R11, 0xf8, !PT ;                                /* 0x0000ff000a0b7812 */
                                                                                                              /* 0x000fe400078ef80b */
        /*01a0*/                   LOP3.LUT R3, R3, 0xff00, R8, 0xf8, !PT ;                                   /* 0x0000ff0003037812 */
                                                                                                              /* 0x000fe400078ef808 */
        /*01b0*/                   LOP3.LUT R12, R5.reuse, 0xf0000, RZ, 0xc0, !PT ;                           /* 0x000f0000050c7812 */
                                                                                                              /* 0x040fe400078ec0ff */
        /*01c0*/                   LOP3.LUT R10, R5, 0xf0, RZ, 0xc0, !PT ;                                    /* 0x000000f0050a7812 */
                                                                                                              /* 0x000fc400078ec0ff */
        /*01d0*/                   LOP3.LUT R8, R4, 0xf00000, RZ, 0xc0, !PT ;                                 /* 0x00f0000004087812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*01e0*/                   IMAD.SHL.U32 R12, R12, 0x1000, RZ ;                                        /* 0x000010000c0c7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*01f0*/                   LOP3.LUT R0, R0, 0xff0000, R9, 0xf8, !PT ;                                 /* 0x00ff000000007812 */
                                                                                                              /* 0x000fe200078ef809 */
        /*0200*/                   IMAD.SHL.U32 R10, R10, 0x1000000, RZ ;                                     /* 0x010000000a0a7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0210*/                   LOP3.LUT R9, R5.reuse, 0xf000, RZ, 0xc0, !PT ;                             /* 0x0000f00005097812 */
                                                                                                              /* 0x040fe200078ec0ff */
        /*0220*/                   IMAD.SHL.U32 R8, R8, 0x100, RZ ;                                           /* 0x0000010008087824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0230*/                   LOP3.LUT R13, R5, 0xf00000, RZ, 0xc0, !PT ;                                /* 0x00f00000050d7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0240*/                   SHF.R.S32.HI R12, RZ, 0xc, R12 ;                                           /* 0x0000000cff0c7819 */
                                                                                                              /* 0x000fe2000001140c */
        /*0250*/                   IMAD.U32 R9, R9, 0x10000, RZ ;                                             /* 0x0001000009097824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0260*/                   SHF.R.S32.HI R10, RZ, 0x1c, R10 ;                                          /* 0x0000001cff0a7819 */
                                                                                                              /* 0x000fe2000001140a */
        /*0270*/                   IMAD.SHL.U32 R13, R13, 0x100, RZ ;                                         /* 0x000001000d0d7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0280*/                   SHF.R.S32.HI R8, RZ, 0xc, R8 ;                                             /* 0x0000000cff087819 */
                                                                                                              /* 0x000fc40000011408 */
        /*0290*/                   LOP3.LUT R11, R11, 0xff0000, R12, 0xf8, !PT ;                              /* 0x00ff00000b0b7812 */
                                                                                                              /* 0x000fe400078ef80c */
        /*02a0*/                   SHF.R.S32.HI R9, RZ, 0x14, R9 ;                                            /* 0x00000014ff097819 */
                                                                                                              /* 0x000fe40000011409 */
        /*02b0*/                   LOP3.LUT R10, R10, 0xff, RZ, 0xc0, !PT ;                                   /* 0x000000ff0a0a7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*02c0*/                   LOP3.LUT R12, R5, 0xf000000, RZ, 0xc0, !PT ;                               /* 0x0f000000050c7812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*02d0*/                   LOP3.LUT R3, R3, 0xff0000, R8, 0xf8, !PT ;                                 /* 0x00ff000003037812 */
                                                                                                              /* 0x000fe400078ef808 */
        /*02e0*/                   LOP3.LUT R8, R4, 0xf000000, RZ, 0xc0, !PT ;                                /* 0x0f00000004087812 */
                                                                                                              /* 0x000fe200078ec0ff */
        /*02f0*/                   IMAD.SHL.U32 R12, R12, 0x10, RZ ;                                          /* 0x000000100c0c7824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0300*/                   LOP3.LUT R9, R10, 0xff00, R9, 0xf8, !PT ;                                  /* 0x0000ff000a097812 */
                                                                                                              /* 0x000fc400078ef809 */
        /*0310*/                   SHF.R.S32.HI R10, RZ, 0xc, R13 ;                                           /* 0x0000000cff0a7819 */
                                                                                                              /* 0x000fe2000001140d */
        /*0320*/                   IMAD.SHL.U32 R8, R8, 0x10, RZ ;                                            /* 0x0000001008087824 */
                                                                                                              /* 0x000fe200078e00ff */
        /*0330*/                   LOP3.LUT R5, R5, 0xf0000000, RZ, 0xc0, !PT ;                               /* 0xf000000005057812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0340*/                   LOP3.LUT R4, R4, 0xf0000000, RZ, 0xc0, !PT ;                               /* 0xf000000004047812 */
                                                                                                              /* 0x000fe400078ec0ff */
        /*0350*/                   LOP3.LUT R9, R9, 0xff0000, R10, 0xf8, !PT ;                                /* 0x00ff000009097812 */
                                                                                                              /* 0x000fe400078ef80a */
        /*0360*/                   SHF.R.S32.HI R12, RZ, 0x4, R12 ;                                           /* 0x00000004ff0c7819 */
                                                                                                              /* 0x000fe4000001140c */
        /*0370*/                   SHF.R.S32.HI R10, RZ, 0x4, R5 ;                                            /* 0x00000004ff0a7819 */
                                                                                                              /* 0x000fc40000011405 */
        /*0380*/                   SHF.R.S32.HI R5, RZ, 0x4, R8 ;                                             /* 0x00000004ff057819 */
                                                                                                              /* 0x000fe40000011408 */
        /*0390*/                   SHF.R.S32.HI R4, RZ, 0x4, R4 ;                                             /* 0x00000004ff047819 */
                                                                                                              /* 0x000fe40000011404 */
        /*03a0*/                   LOP3.LUT R11, R11, 0xff000000, R12, 0xf8, !PT ;                            /* 0xff0000000b0b7812 */
                                                                                                              /* 0x000fe400078ef80c */
        /*03b0*/                   LOP3.LUT R12, R9, 0xff000000, R10, 0xf8, !PT ;                             /* 0xff000000090c7812 */
                                                                                                              /* 0x000fe400078ef80a */
        /*03c0*/                   LOP3.LUT R8, R0, 0xff000000, R5, 0xf8, !PT ;                               /* 0xff00000000087812 */
                                                                                                              /* 0x000fe400078ef805 */
        /*03d0*/                   LOP3.LUT R4, R3, 0xff000000, R4, 0xf8, !PT ;                               /* 0xff00000003047812 */
                                                                                                              /* 0x000fe200078ef804 */
        /*03e0*/                   IMAD.MOV.U32 R9, RZ, RZ, RZ ;                                              /* 0x000000ffff097224 */
                                                                                                              /* 0x000fc400078e00ff */
        /*03f0*/                   IMAD.MOV.U32 R5, RZ, RZ, RZ ;                                              /* 0x000000ffff057224 */
                                                                                                              /* 0x000fca00078e00ff */
        /*0400*/                   IMMA.16816.S8.S8 R8, R8.ROW, R11.COL, RZ ;                                 /* 0x0000000b08087237 */
                                                                                                              /* 0x000ff000004054ff */
        /*0410*/                   IMMA.16816.S8.S8 R12, R4.ROW, R12.COL, RZ ;                                /* 0x0000000c040c7237 */
                                                                                                              /* 0x000fde00004054ff */
        /*0420*/                   NOP ;                                                                      /* 0x0000000000007918 */
                                                                                                              /* 0x000fc80000000000 */
        /*0430*/                   IMAD.IADD R3, R8, 0x1, R12 ;                                               /* 0x0000000108037824 */
                                                                                                              /* 0x000fe400078e020c */
        /*0440*/                   IMAD.IADD R0, R9, 0x1, R13 ;                                               /* 0x0000000109007824 */
                                                                                                              /* 0x000fe400078e020d */
        /*0450*/                   IMAD.IADD R4, R3, 0x1, R6 ;                                                /* 0x0000000103047824 */
                                                                                                              /* 0x000fe400078e0206 */
        /*0460*/                   IMAD.IADD R5, R0, 0x1, R7 ;                                                /* 0x0000000100057824 */
                                                                                                              /* 0x000fc600078e0207 */
        /*0470*/                   PLOP3.LUT P0, PT, R3.reuse.SIGN, R6.reuse.SIGN, R4.reuse.SIGN, 0x2, 0x0 ;  /* 0x000000060300721f */
                                                                                                              /* 0x1c0fe40000700204 */
        /*0480*/                   PLOP3.LUT P2, PT, R0.reuse.SIGN, R7.reuse.SIGN, R5.reuse.SIGN, 0x2, 0x0 ;  /* 0x000000070000721f */
                                                                                                              /* 0x1c0fe40000740205 */
        /*0490*/                   PLOP3.LUT P1, PT, R3.SIGN, R6.SIGN, R4.SIGN, 0x40, 0x0 ;                   /* 0x000000060300721f */
                                                                                                              /* 0x000fe40000724004 */
        /*04a0*/                   PLOP3.LUT P3, PT, R0.SIGN, R7.SIGN, R5.SIGN, 0x40, 0x0 ;                   /* 0x000000070000721f */
                                                                                                              /* 0x000fe40000764005 */
        /*04b0*/                   SEL R4, R4, 0x7fffffff, !P0 ;                                              /* 0x7fffffff04047807 */
                                                                                                              /* 0x000fe40004000000 */
        /*04c0*/                   SEL R5, R5, 0x7fffffff, !P2 ;                                              /* 0x7fffffff05057807 */
                                                                                                              /* 0x000fc40005000000 */
        /*04d0*/                   SEL R4, R4, 0x80000000, !P1 ;                                              /* 0x8000000004047807 */
                                                                                                              /* 0x000fe40004800000 */
        /*04e0*/                   SEL R5, R5, 0x80000000, !P3 ;                                              /* 0x8000000005057807 */
                                                                                                              /* 0x000fe20005800000 */
        /*04f0*/                   RET.ABS.NODEC R20 0x0 ;                                                    /* 0x0000000014007950 */
                                                                                                              /* 0x000fec0003e00000 */
        /*0500*/                   BRA 0x500;                                                                 /* 0xfffffffc00fc7947 */
                                                                                                              /* 0x000fc0000383ffff */
        /*0510*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0520*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0530*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0540*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0550*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0560*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0570*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0580*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*0590*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05a0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05b0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05c0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05d0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05e0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
        /*05f0*/                   NOP;                                                                       /* 0x0000000000007918 */
                                                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z4funcRA2_jjjS0_
	.headerflags	@"EF_CUDA_SM100 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM100)"
        /*0000*/                   IADD3 R1, PT, PT, R1, -0x8, RZ ;                                           /* 0xfffffff801017810 */
                                                                                                              /* 0x000fe20007ffe0ff */
        /*0010*/                   IMAD.MOV.U32 R0, RZ, RZ, R6 ;                                              /* 0x000000ffff007224 */
                                                                                                              /* 0x000fe200078e0006 */
        /*0020*/                   MOV R3, R7 ;                                                               /* 0x0000000700037202 */
                                                                                                              /* 0x000fe20000000f00 */
        /*0030*/                   IMAD.MOV.U32 R32, RZ, RZ, R4 ;                                             /* 0x000000ffff207224 */
                                                                                                              /* 0x000fe200078e0004 */
        /*0040*/                   MOV R33, R5 ;                                                              /* 0x0000000500217202 */
                                                                                                              /* 0x000fe20000000f00 */
        /*0050*/                   STL [R1+0x4], R21 ;                                                        /* 0x0000041501007387 */
                                                                                                              /* 0x000fe80000100800 */
        /*0060*/                   STL [R1], R20 ;                                                            /* 0x0000001401007387 */
                                                                                                              /* 0x0001e20000100800 */
        /*0070*/                   LDC.64 R34, c[0x0][0x358] ;                                                /* 0x0000d600ff227b82 */
                                                                                                              /* 0x000e640000000a00 */
        /*0080*/                   R2UR UR4, R34 ;                                                            /* 0x00000000220472ca */
                                                                                                              /* 0x002fc400000e0000 */
        /*0090*/                   R2UR UR5, R35.reuse ;                                                      /* 0x00000000230572ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*00a0*/                   R2UR UR6, R34 ;                                                            /* 0x00000000220672ca */
                                                                                                              /* 0x000fe400000e0000 */
        /*00b0*/                   R2UR UR7, R35 ;                                                            /* 0x00000000230772ca */
                                                                                                              /* 0x000fd200000e0000 */
        /*00c0*/                   LD.E R6, desc[UR4][R8.64] ;                                                /* 0x0000000408067980 */
                                                                                                              /* 0x000368000c101900 */
        /*00d0*/                   LD.E R7, desc[UR6][R8.64+0x4] ;                                            /* 0x0000040608077980 */
                                                                                                              /* 0x000362000c101900 */
        /*00e0*/                   IMAD.MOV.U32 R4, RZ, RZ, R0 ;                                              /* 0x000000ffff047224 */
                                                                                                              /* 0x000fe400078e0000 */
        /*00f0*/                   IMAD.MOV.U32 R5, RZ, RZ, R3 ;                                              /* 0x000000ffff057224 */
                                                                                                              /* 0x000fe200078e0003 */
        /*0100*/                   MOV R20, 0x0 ;                                                             /* 0x0000000000147802 */
                                                                                                              /* 0x001fe40000000f00 */
        /*0110*/                   MOV R21, 0x0 ;                                                             /* 0x0000000000157802 */
                                                                                                              /* 0x000fce0000000f00 */
        /*0120*/                   CALL.ABS.NOINC 0x0 ;                                                       /* 0x0000000000007943 */
                                                                                                              /* 0x022fea0003c00000 */
        /*0130*/                   R2UR UR4, R34.reuse ;                                                      /* 0x00000000220472ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*0140*/                   R2UR UR5, R35.reuse ;                                                      /* 0x00000000230572ca */
                                                                                                              /* 0x040fe400000e0000 */
        /*0150*/                   R2UR UR6, R34 ;                                                            /* 0x00000000220672ca */
                                                                                                              /* 0x000fe400000e0000 */
        /*0160*/                   R2UR UR7, R35 ;                                                            /* 0x00000000230772ca */
                                                                                                              /* 0x000fd200000e0000 */
        /*0170*/                   ST.E desc[UR4][R32.64], R4 ;                                               /* 0x0000000420007985 */
                                                                                                              /* 0x000fe8000c101904 */
        /*0180*/                   ST.E desc[UR6][R32.64+0x4], R5 ;                                           /* 0x0000040520007985 */
                                                                                                              /* 0x0001e8000c101906 */
        /*0190*/                   LDL R20, [R1] ;                                                            /* 0x0000000001147983 */
                                                                                                              /* 0x000e280000100800 */
        /*01a0*/                   LDL R21, [R1+0x4] ;                                                        /* 0x0000040001157983 */
                                                                                                              /* 0x0002240000100800 */
        /*01b0*/                   IADD3 R1, PT, PT, R1, 0x8, RZ ;                                            /* 0x0000000801017810 */
                                                                                                              /* 0x002fe20007ffe0ff */
        /*01c0*/                   RET.ABS.NODEC R20 0x0 ;                                                    /* 0x0000000014007950 */
                                                                                                              /* 0x001fec0003e00000 */
        /*01d0*/                   BRA 0x1d0;                                                                 /* 0xfffffffc00fc7947 */
1 Like

While you are at it, could you test 1 bit XOR and 1 bit AND, please? XOR was deprecated with 9.0, too.

__device__
void wmma_xor_popc_func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){ 
    asm volatile("wmma.mma.xor.popc.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}

__device__
void wmma_and_popc_func(unsigned int (&D)[2], unsigned int A, unsigned int B, unsigned int (&C)[2]){ 
    asm volatile("wmma.mma.and.popc.sync.aligned.m8n8k128.row.col.s32.b1.b1.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
}

Ada: uses BMMA.88128.XOR.POPC and BMMA.88128.AND.POPC
Hopper: AND uses BMMA.88128.AND.POPC , XOR short emulation with two of those instructions.
Blackwell: AND long emulation with IMMA.16832.U8.U8. XOR emulation based on AND

1 Like

Thank you.

Have I understood correctly? XOR can be emulated with two ANDs?

The SASS code for xor looks like this on Hopper. (or godbold here: Compiler Explorer)

code for sm_90
		Function : __cuda_sm_9x_mma_bit_internal_xor_m8n8k128
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   WARPSYNC.ALL ;                                /* 0x0000000000007948 */
                                                                                 /* 0x000fea0003800000 */
        /*0010*/                   NOP ;                                         /* 0x0000000000007918 */
                                                                                 /* 0x000fe20000000000 */
        /*0020*/                   LOP3.LUT R0, RZ, R4, RZ, 0x33, !PT ;          /* 0x00000004ff007212 */
                                                                                 /* 0x000fe400078e33ff */
        /*0030*/                   LOP3.LUT R3, RZ, R5, RZ, 0x33, !PT ;          /* 0x00000005ff037212 */
                                                                                 /* 0x000fce00078e33ff */
        /*0040*/                   BMMA.88128.AND.POPC R8, R4.ROW, R3.COL, RZ ;  /* 0x000000030408723d */
                                                                                 /* 0x000fe800000144ff */
        /*0050*/                   BMMA.88128.AND.POPC R4, R0.ROW, R5.COL, RZ ;  /* 0x000000050004723d */
                                                                                 /* 0x000fda00000144ff */
        /*0060*/                   IADD3 R4, R6, R8, R4 ;                        /* 0x0000000806047210 */
                                                                                 /* 0x000fe40007ffe004 */
        /*0070*/                   IADD3 R5, R7, R9, R5 ;                        /* 0x0000000907057210 */
                                                                                 /* 0x000fe20007ffe005 */
        /*0080*/                   RET.ABS.NODEC R20 0x0 ;                       /* 0x0000000014007950 */
                                                                                 /* 0x000fec0003e00000 */
        /*0090*/                   BRA 0x90;                                     /* 0xfffffffc00fc7947 */
1 Like