PTX instruction `mma` not lowered to tensor core related SASS instruction

Description

mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 in PTX is not lowered to tensor core related instruction (like HMMA)

Environment

GPU Type: Tesla A100
CUDA Version: 11.4
CUDNN Version: N/A
Operating System + Version: Ubuntu 18.04

Detailed Description

I try to implement a GeMM using Tensor Core as follow:

// Every warp use mma.16816 to conduct 16-8-16 sub matrix multiplication
// According to PTX ISA, every threads in a warp holds:
//                 4 float32 accumulator
//                 8 float16 elements from sub matrix A
//                 4 float16 elements from sub matrix B
float Accum[4];
uint Multi_A[4];
uint Multi_B[2];

// Load element into reg from global
Multi_A[...] = ...;
Multi_B[...] = ...;
{
    unsigned const* A = reinterpret_cast<unsigned const*>(Multi_A);
    unsigned const* B = reinterpret_cast<unsigned const*>(Multi_B);
    float const* C = reinterpret_cast<float const*>(Accum);
    float* D = reinterpret_cast<float*>(Accum);

    // Inline asm, I want to use tensor core operations
    __asm__ __volatile__(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13}" : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]), "+r"(A[0]), "+r"(A[1]), "+r"(A[2]), "+r"(A[3]), "+r"(B[0]), "+r"(B[1]), "+f"(C[0]), "+f"(C[1]), "+f"(C[2]), "+f"(C[3])
    );
}
// Store element into global from reg
C[...] = Accum[...];

However, the kernel did not use any tensor core operation:

  • I use ncu to observe the execution and there is no tensor op at all (I check the computation pipeline and instruction stat).
  • I use cuobjdump and check the SASS, I don’t find any HMMA instructions

While in .ptx file, I found my inline asm code. My compile command is:

nvcc -o gemm -arch=sm_80 gemm.cu

I wonder how can I use tensor core operation correctly?

It seems the computation is lowered to IMAD,IADD instruction, following is my SASS code:


Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

	code for sm_80

Fatbin elf code:
================
arch = sm_80
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

	code for sm_80
		Function : _Z13gemm_mma_1688P6__halfS0_Pfiiiff
	.headerflags    @"EF_CUDA_SM80 EF_CUDA_PTX_SM(EF_CUDA_SM80)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;        /* 0x00000a00ff017624 */
                                                                                  /* 0x000fc400078e00ff */
        /*0010*/                   S2R R0, SR_TID.X ;                             /* 0x0000000000007919 */
                                                                                  /* 0x000e220000002100 */
        /*0020*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x178] ;       /* 0x00005e00ff027624 */
                                                                                  /* 0x000fc600078e00ff */
        /*0030*/                   S2R R3, SR_TID.Y ;                             /* 0x0000000000037919 */
                                                                                  /* 0x000e240000002200 */
        /*0040*/                   ISETP.GE.AND P0, PT, R2, 0x1, PT ;             /* 0x000000010200780c */
                                                                                  /* 0x000fe20003f06270 */
        /*0050*/                   IMAD R0, R0, c[0x0][0x0], R3 ;                 /* 0x0000000000007a24 */
                                                                                  /* 0x001fd800078e0203 */
        /*0060*/              @!P0 EXIT ;                                         /* 0x000000000000894d */
                                                                                  /* 0x000fea0003800000 */
        /*0070*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x17c] ;       /* 0x00005f00ff027624 */
                                                                                  /* 0x000fe200078e00ff */
        /*0080*/                   ISETP.GT.U32.AND P2, PT, R0.reuse, 0x7f, PT ;  /* 0x0000007f0000780c */
                                                                                  /* 0x040fe20003f44070 */
        /*0090*/                   IMAD.SHL.U32 R8, R0.reuse, 0x2, RZ ;           /* 0x0000000200087824 */
                                                                                  /* 0x040fe200078e00ff */
        /*00a0*/                   SHF.R.U32.HI R7, RZ, 0x2, R0 ;                 /* 0x00000002ff077819 */
                                                                                  /* 0x000fe20000011600 */
        /*00b0*/                   ULDC.64 UR4, c[0x0][0x118] ;                   /* 0x0000460000047ab9 */
                                                                                  /* 0x000fe20000000a00 */
        /*00c0*/                   ISETP.GE.U32.AND P0, PT, R0, 0x100, PT ;       /* 0x000001000000780c */
                                                                                  /* 0x000fe20003f06070 */
        /*00d0*/                   IMAD.MOV.U32 R0, RZ, RZ, RZ ;                  /* 0x000000ffff007224 */
                                                                                  /* 0x000fe200078e00ff */
        /*00e0*/                   ISETP.GE.AND P1, PT, R2, 0x1, PT ;             /* 0x000000010200780c */
                                                                                  /* 0x000fe40003f26270 */
        /*00f0*/                   SGXT.U32 R7, R7, 0x3 ;                         /* 0x000000030707781a */
                                                                                  /* 0x000fe40000000000 */
        /*0100*/                   LOP3.LUT R8, R8, 0x6, RZ, 0xc0, !PT ;          /* 0x0000000608087812 */
                                                                                  /* 0x000fc400078ec0ff */
        /*0110*/                   SEL R6, RZ, 0xffffffff, !P2 ;                  /* 0xffffffffff067807 */
                                                                                  /* 0x000fce0005000000 */
        /*0120*/              @!P1 BRA 0x4b0 ;                                    /* 0x0000038000009947 */
                                                                                  /* 0x000fea0003800000 */
        /*0130*/                   MOV R9, RZ ;                                   /* 0x000000ff00097202 */
                                                                                  /* 0x000fe20000000f00 */
        /*0140*/                   IMAD.IADD R17, R7, 0x1, R0 ;                   /* 0x0000000107117824 */
                                                                                  /* 0x000fc400078e0200 */
        /*0150*/                   S2R R2, SR_TID.Y ;                             /* 0x0000000000027919 */
                                                                                  /* 0x000e220000002200 */
        /*0160*/                   IMAD.MOV.U32 R18, RZ, RZ, RZ ;                 /* 0x000000ffff127224 */
                                                                                  /* 0x000fc600078e00ff */
        /*0170*/                   S2R R3, SR_TID.X ;                             /* 0x0000000000037919 */
                                                                                  /* 0x000e240000002100 */
        /*0180*/                   IMAD R2, R3, c[0x0][0x0], R2 ;                 /* 0x0000000003027a24 */
                                                                                  /* 0x001fca00078e0202 */
        /*0190*/                   SHF.R.U32.HI R2, RZ, 0x2, R2 ;                 /* 0x00000002ff027819 */
                                                                                  /* 0x000fc80000011602 */
        /*01a0*/                   LOP3.LUT R2, R2, 0x18, RZ, 0xc0, !PT ;         /* 0x0000001802027812 */
                                                                                  /* 0x000fc800078ec0ff */
        /*01b0*/                   IADD3 R16, R2, R9, R8 ;                        /* 0x0000000902107210 */
                                                                                  /* 0x000fc40007ffe008 */
        /*01c0*/                   IADD3 R18, R18, 0x40, RZ ;                     /* 0x0000004012127810 */
                                                                                  /* 0x000fe20007ffe0ff */
        /*01d0*/                   IMAD.MOV.U32 R10, RZ, RZ, RZ ;                 /* 0x000000ffff0a7224 */
                                                                                  /* 0x000fc600078e00ff */
        /*01e0*/                   ISETP.GE.U32.AND P2, PT, R18, 0x1400, PT ;     /* 0x000014001200780c */
                                                                                  /* 0x000fc40003f46070 */
        /*01f0*/                   IMAD.IADD R3, R17, 0x1, R10 ;                  /* 0x0000000111037824 */
                                                                                  /* 0x000fe200078e020a */
        /*0200*/                   IADD3 R12, R16, 0x400, RZ ;                    /* 0x00000400100c7810 */
                                                                                  /* 0x000fe20007ffe0ff */
        /*0210*/                   IMAD.MOV.U32 R13, RZ, RZ, RZ ;                 /* 0x000000ffff0d7224 */
                                                                                  /* 0x000fe400078e00ff */
        /*0220*/                   IMAD.MOV.U32 R15, RZ, RZ, 0x4 ;                /* 0x00000004ff0f7424 */
                                                                                  /* 0x000fe200078e00ff */
        /*0230*/                   IADD3 R5, R3.reuse, 0x8, RZ ;                  /* 0x0000000803057810 */
                                                                                  /* 0x040fe20007ffe0ff */
        /*0240*/                   IMAD.MOV.U32 R14, RZ, RZ, c[0x0][0x4] ;        /* 0x00000100ff0e7624 */
                                                                                  /* 0x000fe200078e00ff */
        /*0250*/                   ISETP.NE.AND P3, PT, R3, R6.reuse, PT ;        /* 0x000000060300720c */
                                                                                  /* 0x080fe40003f65270 */
        /*0260*/                   ISETP.NE.AND P4, PT, R5, R6, PT ;              /* 0x000000060500720c */
                                                                                  /* 0x000fe40003f85270 */
        /*0270*/                   MOV R11, R12 ;                                 /* 0x0000000c000b7202 */
                                                                                  /* 0x000fd20000000f00 */
        /*0280*/              @!P3 IADD3 R11, R16, RZ, RZ ;                       /* 0x000000ff100bb210 */
                                                                                  /* 0x000fe40007ffe0ff */
        /*0290*/              @!P4 IMAD.MOV R12, RZ, RZ, R16 ;                    /* 0x000000ffff0cc224 */
                                                                                  /* 0x000fc600078e0210 */
        /*02a0*/                   IMAD.IADD R2, R11, 0x1, R13.reuse ;            /* 0x000000010b027824 */
                                                                                  /* 0x101fe200078e020d */
        /*02b0*/               @P0 WARPSYNC 0xffffffff ;                          /* 0xffffffff00000948 */
                                                                                  /* 0x000fe20003800000 */
        /*02c0*/                   IMAD.IADD R4, R12, 0x1, R13 ;                  /* 0x000000010c047824 */
                                                                                  /* 0x000fe200078e020d */
        /*02d0*/                   LEA R13, R14, R13, 0x1 ;                       /* 0x0000000d0e0d7211 */
                                                                                  /* 0x000fe200078e08ff */
        /*02e0*/                   IMAD.WIDE R2, R2, R15, c[0x0][0x170] ;         /* 0x00005c0002027625 */
                                                                                  /* 0x000fc600078e020f */
        /*02f0*/                   ISETP.GE.AND P3, PT, R13, 0x40, PT ;           /* 0x000000400d00780c */
                                                                                  /* 0x000fe20003f66270 */
        /*0300*/                   IMAD.WIDE R4, R4, R15, c[0x0][0x170] ;         /* 0x00005c0004047625 */
                                                                                  /* 0x000fe200078e020f */
        /*0310*/               @P0 STG.E [R2.64], RZ ;                            /* 0x000000ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0320*/               @P0 STG.E [R2.64+0x4], RZ ;                        /* 0x000004ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0330*/               @P0 STG.E [R4.64], RZ ;                            /* 0x000000ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0340*/               @P0 STG.E [R4.64+0x4], RZ ;                        /* 0x000004ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0350*/               @P0 STG.E [R2.64], RZ ;                            /* 0x000000ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0360*/               @P0 STG.E [R2.64+0x4], RZ ;                        /* 0x000004ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0370*/               @P0 STG.E [R4.64], RZ ;                            /* 0x000000ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0380*/               @P0 STG.E [R4.64+0x4], RZ ;                        /* 0x000004ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0390*/               @P0 STG.E [R2.64], RZ ;                            /* 0x000000ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03a0*/               @P0 STG.E [R2.64+0x4], RZ ;                        /* 0x000004ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03b0*/               @P0 STG.E [R4.64], RZ ;                            /* 0x000000ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03c0*/               @P0 STG.E [R4.64+0x4], RZ ;                        /* 0x000004ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03d0*/               @P0 STG.E [R2.64], RZ ;                            /* 0x000000ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03e0*/               @P0 STG.E [R2.64+0x4], RZ ;                        /* 0x000004ff02000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*03f0*/               @P0 STG.E [R4.64], RZ ;                            /* 0x000000ff04000986 */
                                                                                  /* 0x0001e8000c101904 */
        /*0400*/               @P0 STG.E [R4.64+0x4], RZ ;                        /* 0x000004ff04000986 */
                                                                                  /* 0x0001e2000c101904 */
        /*0410*/              @!P3 BRA 0x2a0 ;                                    /* 0xfffffe800000b947 */
                                                                                  /* 0x000fea000383ffff */
        /*0420*/                   IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x0] ;         /* 0x00000000ff037624 */
                                                                                  /* 0x001fc800078e00ff */
        /*0430*/                   IMAD R10, R3, 0x2, R10 ;                       /* 0x00000002030a7824 */
                                                                                  /* 0x000fca00078e020a */
        /*0440*/                   ISETP.GE.AND P3, PT, R10, 0x40, PT ;           /* 0x000000400a00780c */
                                                                                  /* 0x000fda0003f66270 */
        /*0450*/              @!P3 BRA 0x1f0 ;                                    /* 0xfffffd900000b947 */
                                                                                  /* 0x000fea000383ffff */
        /*0460*/              @!P2 BRA 0x1c0 ;                                    /* 0xfffffd500000a947 */
                                                                                  /* 0x000fea000383ffff */
        /*0470*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x10] ;        /* 0x00000400ff027624 */
                                                                                  /* 0x000fca00078e00ff */
        /*0480*/                   LEA R9, R2, R9, 0x6 ;                          /* 0x0000000902097211 */
                                                                                  /* 0x000fc800078e30ff */
        /*0490*/                   ISETP.GE.AND P2, PT, R9, c[0x0][0x17c], PT ;   /* 0x00005f0009007a0c */
                                                                                  /* 0x000fda0003f46270 */
        /*04a0*/              @!P2 BRA 0x150 ;                                    /* 0xfffffca00000a947 */
                                                                                  /* 0x000fea000383ffff */
        /*04b0*/                   IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0xc] ;         /* 0x00000300ff037624 */
                                                                                  /* 0x000fc800078e00ff */
        /*04c0*/                   IMAD R0, R3, 0x40, R0 ;                        /* 0x0000004003007824 */
                                                                                  /* 0x000fca00078e0200 */
        /*04d0*/                   ISETP.GE.AND P2, PT, R0, c[0x0][0x178], PT ;   /* 0x00005e0000007a0c */
                                                                                  /* 0x000fda0003f46270 */
        /*04e0*/              @!P2 BRA 0x120 ;                                    /* 0xfffffc300000a947 */
                                                                                  /* 0x000fea000383ffff */
        /*04f0*/                   EXIT ;                                         /* 0x000000000000794d */
                                                                                  /* 0x000fea0003800000 */
        /*0500*/                   BRA 0x500;                                     /* 0xfffffff000007947 */
                                                                                  /* 0x000fc0000383ffff */
        /*0510*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0520*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0530*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0540*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0550*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0560*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0570*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0580*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*0590*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05a0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05b0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05c0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05d0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05e0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
        /*05f0*/                   NOP;                                           /* 0x0000000000007918 */
                                                                                  /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_80
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed

Hi,

This doesn’t look like TensorRT related. We recommend you to please post your concern on a related platform.

Thank you.