I look at the sass and find that for no flag version, 16 HMMA instructions are grouped. But for flag verison, 4 HMMA are grouped. They all have 16 instructions in all.
here is a clip of flaged version
/*19b0*/ @!P0 BRA `(.L_x_11) ;
/*19c0*/ LDS R84, [R97.X4+0x7180] ;
/*19d0*/ WARPSYNC 0xffffffff ;
/*19e0*/ ISETP.NE.AND P0, PT, R89, RZ, PT ;
/*19f0*/ LEA.HI R85, R84, R84, RZ, 0x1 ;
/*1a00*/ SHF.R.S32.HI R86, RZ, 0x1, R85.reuse ;
/*1a10*/ SHF.R.S32.HI R87, RZ, 0x1f, R85 ;
/*1a20*/ LOP3.LUT R85, R85, 0xffffffe, RZ, 0xc0, !PT ;
/*1a30*/ LEA.HI R87, R87, R86, RZ, 0x2 ;
/*1a40*/ IMAD.IADD R85, R84, 0x1, -R85 ;
/*1a50*/ LOP3.LUT R87, R87, 0x3ffffffc, RZ, 0xc0, !PT ;
/*1a60*/ IMAD.SHL.U32 R85, R85, 0x10, RZ ;
/*1a70*/ IMAD.IADD R87, R86.reuse, 0x1, -R87 ;
/*1a80*/ IMAD.SHL.U32 R86, R86, 0x20, RZ ;
/*1a90*/ LOP3.LUT R87, R87, R98, RZ, 0x3c, !PT ;
/*1aa0*/ IMAD.SHL.U32 R87, R87, 0x4, RZ ;
/*1ab0*/ IADD3 R84, R87, R86, R85 ;
/*1ac0*/ LEA R84, R84, c[0x0][0x18], 0x2 ;
/*1ad0*/ IADD3 R84, R84, -c[0x0][0x18], RZ ;
/*1ae0*/ LDSM.16.M88.4 R84, [R84] ;
/*1af0*/ HMMA.16816.F32 R44, R84.reuse, R80, R44 ;
/*1b00*/ HMMA.16816.F32 R48, R84.reuse, R82, R48 ;
/*1b10*/ HMMA.16816.F32 R52, R84.reuse, R76, R52 ;
/*1b20*/ HMMA.16816.F32 R56, R84, R78, R56 ;
and this is no flag version, all HMMA are consecutive
/*1420*/ BAR.SYNC 0x0 ;
/*1430*/ LDS R78, [R106.X4+0x7100] ;
/*1440*/ IADD3 R115, R115, 0x1, RZ ;
/*1450*/ LDS R83, [R106.X4+0x7140] ;
/*1460*/ ISETP.NE.AND P1, PT, R115, 0x1b, PT ;
/*1470*/ LDS R77, [R106.X4+0x7180] ;
/*1480*/ LDS R76, [R106.X4+0x71c0] ;
/*1490*/ LEA.HI R79, R78, R78, RZ, 0x1 ;
/*14a0*/ SHF.R.S32.HI R80, RZ, 0x1, R79.reuse ;
/*14b0*/ SHF.R.S32.HI R81, RZ, 0x1f, R79 ;
/*14c0*/ LEA.HI R84, R83, R83, RZ, 0x1 ;
/*14d0*/ LEA.HI R81, R81, R80, RZ, 0x2 ;
/*14e0*/ LOP3.LUT R79, R79, 0xffffffe, RZ, 0xc0, !PT ;
/*14f0*/ LOP3.LUT R81, R81, 0x3ffffffc, RZ, 0xc0, !PT ;
/*1500*/ SHF.R.S32.HI R86, RZ, 0x1f, R84.reuse ;
/*1510*/ IMAD.IADD R79, R78, 0x1, -R79 ;
/*1520*/ LEA.HI R85, R76, R76, RZ, 0x1 ;
/*1530*/ IMAD.IADD R82, R80.reuse, 0x1, -R81 ;
/*1540*/ SHF.R.S32.HI R81, RZ, 0x1, R84 ;
/*1550*/ IMAD.SHL.U32 R78, R80, 0x20, RZ ;
/*1560*/ LOP3.LUT R84, R84, 0xffffffe, RZ, 0xc0, !PT ;
/*1570*/ IMAD.SHL.U32 R79, R79, 0x10, RZ ;
/*1580*/ LOP3.LUT R82, R82, R107, RZ, 0x3c, !PT ;
/*1590*/ LEA.HI R86, R86, R81, RZ, 0x2 ;
/*15a0*/ IMAD.IADD R84, R83, 0x1, -R84 ;
/*15b0*/ SHF.R.S32.HI R87, RZ, 0x1, R85.reuse ;
/*15c0*/ IMAD.SHL.U32 R82, R82, 0x4, RZ ;
/*15d0*/ LOP3.LUT R86, R86, 0x3ffffffc, RZ, 0xc0, !PT ;
/*15e0*/ IMAD.SHL.U32 R84, R84, 0x10, RZ ;
/*15f0*/ SHF.R.S32.HI R88, RZ, 0x1f, R85 ;
/*1600*/ IADD3 R78, R82, R78, R79 ;
/*1610*/ IMAD.IADD R86, R81, 0x1, -R86 ;
/*1620*/ LEA.HI R79, R77, R77, RZ, 0x1 ;
/*1630*/ IMAD.SHL.U32 R81, R81, 0x20, RZ ;
/*1640*/ LEA.HI R88, R88, R87, RZ, 0x2 ;
/*1650*/ SHF.R.S32.HI R80, RZ, 0x1, R79.reuse ;
/*1660*/ SHF.R.S32.HI R83, RZ, 0x1f, R79 ;
/*1670*/ LOP3.LUT R86, R86, R107, RZ, 0x3c, !PT ;
/*1680*/ LEA.HI R83, R83, R80, RZ, 0x2 ;
/*1690*/ LOP3.LUT R88, R88, 0x3ffffffc, RZ, 0xc0, !PT ;
/*16a0*/ IMAD.SHL.U32 R86, R86, 0x4, RZ ;
/*16b0*/ LOP3.LUT R83, R83, 0x3ffffffc, RZ, 0xc0, !PT ;
/*16c0*/ LOP3.LUT R82, R79, 0xffffffe, RZ, 0xc0, !PT ;
/*16d0*/ IMAD.IADD R88, R87, 0x1, -R88 ;
/*16e0*/ IADD3 R81, R86, R81, R84 ;
/*16f0*/ IMAD.IADD R84, R80, 0x1, -R83 ;
/*1700*/ LOP3.LUT R85, R85, 0xffffffe, RZ, 0xc0, !PT ;
/*1710*/ IMAD.IADD R82, R77, 0x1, -R82 ;
/*1720*/ LOP3.LUT R88, R88, R107.reuse, RZ, 0x3c, !PT ;
/*1730*/ IMAD.SHL.U32 R80, R80, 0x20, RZ ;
/*1740*/ LOP3.LUT R84, R84, R107, RZ, 0x3c, !PT ;
/*1750*/ IMAD.IADD R85, R76, 0x1, -R85 ;
/*1760*/ LEA R78, R78, c[0x0][0x18], 0x2 ;
/*1770*/ IMAD.SHL.U32 R87, R87, 0x20, RZ ;
/*1780*/ LEA R89, R81, c[0x0][0x18], 0x2 ;
/*1790*/ IMAD.SHL.U32 R77, R82, 0x10, RZ ;
/*17a0*/ IADD3 R81, R78, -c[0x0][0x18], RZ ;
/*17b0*/ IMAD.SHL.U32 R84, R84, 0x4, RZ ;
/*17c0*/ IADD3 R89, R89, -c[0x0][0x18], RZ ;
/*17d0*/ IMAD.SHL.U32 R76, R85, 0x10, RZ ;
/*17e0*/ IMAD.SHL.U32 R88, R88, 0x4, RZ ;
/*17f0*/ IADD3 R80, R84, R80, R77 ;
/*1800*/ IADD3 R88, R88, R87, R76 ;
/*1810*/ LEA R92, R80, c[0x0][0x18], 0x2 ;
/*1820*/ LDSM.16.M88.4 R76, [R100] ;
/*1830*/ LEA R96, R88, c[0x0][0x18], 0x2 ;
/*1840*/ IADD3 R92, R92, -c[0x0][0x18], RZ ;
/*1850*/ LDSM.16.M88.4 R80, [R81] ;
/*1860*/ IADD3 R96, R96, -c[0x0][0x18], RZ ;
/*1870*/ LDSM.16.M88.4 R84, [R101] ;
/*1880*/ LDSM.16.M88.4 R88, [R89] ;
/*1890*/ LDSM.16.M88.4 R92, [R92] ;
/*18a0*/ LDSM.16.M88.4 R96, [R96] ;
/*18b0*/ HMMA.16816.F32 R12, R80.reuse, R76, R12 ;
/*18c0*/ HMMA.16816.F32 R16, R80.reuse, R78, R16 ;
/*18d0*/ HMMA.16816.F32 R20, R80.reuse, R84, R20 ;
/*18e0*/ HMMA.16816.F32 R24, R80, R86, R24 ;
/*18f0*/ HMMA.16816.F32 R28, R88.reuse, R76, R28 ;
/*1900*/ HMMA.16816.F32 R32, R88.reuse, R78, R32 ;
/*1910*/ HMMA.16816.F32 R36, R88.reuse, R84, R36 ;
/*1920*/ HMMA.16816.F32 R40, R88, R86, R40 ;
/*1930*/ HMMA.16816.F32 R44, R92.reuse, R76, R44 ;
/*1940*/ HMMA.16816.F32 R48, R92.reuse, R78, R48 ;
/*1950*/ HMMA.16816.F32 R52, R92.reuse, R84, R52 ;
/*1960*/ HMMA.16816.F32 R56, R92, R86, R56 ;
/*1970*/ HMMA.16816.F32 R60, R96.reuse, R76, R60 ;
/*1980*/ HMMA.16816.F32 R64, R96.reuse, R78, R64 ;
/*1990*/ HMMA.16816.F32 R68, R96.reuse, R84, R68 ;
/*19a0*/ HMMA.16816.F32 R72, R96, R86, R72 ;
/*19b0*/ BAR.SYNC 0x0 ;