// Disassembling __globfunc__Z9matrixMulPfS_S_ii (0)
//
// Disassembly listing of one GPU kernel entry point.  The mangled name
// _Z9matrixMulPfS_S_ii demangles to matrixMul(float*, float*, float*, int, int).
//
// Shorthand used in the comments below.  The left-hand names are reviewer
// shorthand for the operands shown; the suggested meanings are presumed from
// the kernel signature and are NOT established by this listing - TODO confirm.
//   tx = zero-extended $r0.lo                  (presumably thread index x)
//   ty = $r0.hi & c1[0x0000] (= 0x3ff)         (presumably thread index y)
//   bx = 16-bit value at s[0x000c]             (presumably block index x)
//   by = 16-bit value at s[0x000e]             (presumably block index y)
//   C  = s[0x0010], A = s[0x0014], B = s[0x0018]   (presumably the 3 pointers)
//   wA = s[0x001c], wB = s[0x0020]                 (presumably the 2 ints)
//
// Shape of the code as written:
//   1. Prologue: compute r3 = (by * wA) << 4 and skip the loop if
//      wA + r3 - 1 < r3 (signed).
//   2. Setup: form two global addresses, r5 (from A) and r6 (from B), and
//      load the accumulator $r1 from $r124.
//   3. label0 loop, two stages per trip.  Each stage loads one word through
//      each address, stores both into s[], executes bar.sync, then runs a
//      chain of 16 mad.rn.f32 over s[].  Per trip r5 advances by 0x80 and r6
//      by 2 * (0x40 * wB).
//   4. label2 epilogue: bar.sync, compute C + 4*(wB*(16*by + ty) + 16*bx + tx)
//      and store $r1 there.
//
// Multiplications of two 32-bit values are assembled from 16-bit halves as
//   ((a.lo*b.hi + a.hi*b.lo) << 16) + a.lo*b.lo
// using a mul24 / mad24 / shl / mad24 sequence; this pattern recurs throughout.
//
// Notation notes (assumptions about the disassembler's conventions - confirm):
//   - "$p0|$o127": $o127 is presumably a discard destination.
//   - "movsh $ofsN, $r, imm" appears to load an offset register with $r << imm.
//   - ".half" presumably marks a short instruction encoding, not a data type.
.entry __globfunc__Z9matrixMulPfS_S_ii
{
.lmem 0
.smem 4260
.reg 11
.bar 1

// ---- 1. Prologue: r3 = (by * wA) << 4; early exit test -------------------
// r3 = by, r2 = wA, then r2 = by * wA via the 16-bit-halves product.
cvt.u32.u16 $r3, s[0x000e]
mov.b32 $r2, s[0x001c]
mul24.lo.u32.u16.u16 $r4, $r3.lo, $r2.hi
mad24.lo.u32.u16.u16.u32 $r4, $r3.hi, $r2.lo, $r4
shl.u32 $r4, $r4, 0x00000010
mad24.lo.u32.u16.u16.u32 $r2, $r3.lo, $r2.lo, $r4
// r3 = (by * wA) << 4;  r2 = wA + r3 - 1.
shl.u32 $r3, $r2, 0x00000004
add.u32 $r2, s[0x001c], $r3
add.b32 $r2, $r2, 0xffffffff
// Signed compare r2 < r3; branch to label2 (skipping the loop) when it holds.
// NOTE(review): on the branch-taken path no instruction in this listing has
// written $r1 before the final store at label2 reads it - confirm intended.
set.lt.s32 $p0|$o127, $r2, $r3
@$p0.ne bra.label label2

// ---- 2. Setup: initial global addresses r5 (A side) and r6 (B side) ------
// r4 = ty; r1 = wB; r2 = wA.
and.b16 $r2.lo, $r0.hi, c1[0x0000]
mov.b32 $r1, s[0x0020]
cvt.u32.u16 $r4, $r2.lo
mov.b32 $r2, s[0x001c]
// Two interleaved 16-bit-halves products: r5 -> ty * wB, r6/r4 -> wA * ty.
mul24.half.lo.u32.u16.u16 $r5, $r4.lo, $r1.hi
mul24.half.lo.u32.u16.u16 $r6, $r2.hi, $r4.lo
mad24.lo.u32.u16.u16.u32 $r5, $r4.hi, $r1.lo, $r5
mad24.lo.u32.u16.u16.u32 $r6, $r2.lo, $r4.hi, $r6
shl.u32 $r5, $r5, 0x00000010
// Upper half of $r1 is overwritten with bx (wB's upper half was consumed above).
mov.b16 $r1.hi, s[0x000c]
shl.u32 $r6, $r6, 0x00000010
mad24.lo.u32.u16.u16.u32 $r5, $r4.lo, $r1.lo, $r5
// bx * 0x10.
// NOTE(review): the destination prints as $r65 although .reg is 11, and the
// add two instructions below reads $r1 - possibly a disassembler decoding
// artefact for $r1; confirm against the binary before relying on it.
mul24.lo.s32.s16.s16 $r65, $r1.hi, 0x0010
mad24.lo.u32.u16.u16.u32 $r4, $r2.lo, $r4.lo, $r6
// r2 = tx.
cvt.u32.u16 $r2, $r0.lo
// r1 = ty*wB + r1 + tx;  r2 = wA*ty + tx + r3.
add.half.b32 $r1, $r5, $r1
add.half.b32 $r4, $r4, $r2
add.half.b32 $r1, $r2, $r1
add.half.b32 $r2, $r4, $r3
// Scale both indices by 4 and add the bases: r6 = B + 4*r1, r5 = A + 4*r2.
shl.u32 $r1, $r1, 0x00000002
shl.u32 $r2, $r2, 0x00000002
add.u32 $r6, s[0x0018], $r1
// Presumably registers label1 as the reconvergence point for the loop - confirm.
join.label label1
// Accumulator init: r1 = $r124 (presumably a zero source; $r124 is outside
// the .reg 11 range - confirm).
mov.b32 $r1, $r124
add.u32 $r5, s[0x0014], $r2
// Stores r6 to the fixed location s[0x0864]; no instruction in this listing
// reads that location through a fixed address - purpose not evident here.
mov.b32 s[0x0864], $r6

// ---- 3. Main loop --------------------------------------------------------
// Stage 1 load: r4 = g[r5], r7 = g[r6].
label0: mov.u32 $r4, g[$r5]
mov.u32 $r7, g[$r6]
// r2 = ty, r8 = tx.
and.b16 $r2.lo, $r0.hi, c1[0x0000]
cvt.u32.u16 $r2, $r2.lo
cvt.u32.u16 $r8, $r0.lo
// r3 = tx*4 + (ty << 6)      (c1[0x0008] = 4)
// r2 = tx*0x44 + (ty << 2)   (c1[0x0004] = 0x44)
shl.u32 $r3, $r2, 0x00000006
shl.u32 $r2, $r2, 0x00000002
mad24.lo.u32 $r3, $r8, c1[0x0008], $r3
mad24.lo.u32 $r2, $r8, c1[0x0004], $r2
movsh.b32 $ofs2, $r3, 0x00000000
movsh.b32 $ofs1, $r2, 0x00000000
// Store the two loaded words into s[] at bases 0x0024 and 0x0824.
mov.b32 s[$ofs2+0x0024], $r4
mov.b32 s[$ofs1+0x0824], $r7
bar.sync.u32 0x00000000
// Read offsets for stage 1: $ofs2 = tx*0x44 + 0x824, $ofs1 = ty << 6
// (r2 = tx and r7 = ty stay live for the stage-2 store offsets further down).
cvt.u32.u16 $r2, $r0.lo
mul24.lo.u32 $r4, $r2, 0x00000044
and.b16 $r3.lo, $r0.hi, c1[0x0000]
movsh.b32 $ofs1, $r4, 0x00000000
cvt.u32.u16 $r7, $r3.lo
add.b32 $ofs2, $ofs1, 0x00000824
movsh.b32 $ofs1, $r7, 0x00000006
// Stage 1 mad chain: 16 steps, acc += s[$ofs1+0x24+4k] * s[$ofs2+4k], k = 0..15.
// The chain starts from $r1 and ends in $r8.
mov.b32 $r3, s[$ofs2+0x0000]
mad.rn.f32 $r3, s[$ofs1+0x0024], $r3, $r1
mov.b32 $r1, s[$ofs2+0x0004]
mad.rn.f32 $r3, s[$ofs1+0x0028], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0008]
mad.rn.f32 $r3, s[$ofs1+0x002c], $r1, $r3
mov.b32 $r1, s[$ofs2+0x000c]
mad.rn.f32 $r3, s[$ofs1+0x0030], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0010]
mad.rn.f32 $r3, s[$ofs1+0x0034], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0014]
mad.rn.f32 $r3, s[$ofs1+0x0038], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0018]
mad.rn.f32 $r3, s[$ofs1+0x003c], $r1, $r3
mov.b32 $r1, s[$ofs2+0x001c]
mad.rn.f32 $r3, s[$ofs1+0x0040], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0020]
mad.rn.f32 $r3, s[$ofs1+0x0044], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0024]
mad.rn.f32 $r3, s[$ofs1+0x0048], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0028]
mad.rn.f32 $r3, s[$ofs1+0x004c], $r1, $r3
mov.b32 $r1, s[$ofs2+0x002c]
mad.rn.f32 $r3, s[$ofs1+0x0050], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0030]
mad.rn.f32 $r8, s[$ofs1+0x0054], $r1, $r3
// The last three mads of stage 1 are interleaved with address arithmetic:
//   r1 = wB * 0x40 (16-bit-halves product with r4 = 0x40): step added to r6
//   r7 = ty*0x10 + tx          (c1[0x000c] = 0x10), later shifted left by 2
//   r3 = tx*0x44 + (ty << 2)   (c1[0x0004] = 0x44)
mov.b32 $r1, s[0x0020]
mov.b32 $r4, 0x00000040
mov.b32 $r3, s[$ofs2+0x0034]
mad.rn.f32 $r8, s[$ofs1+0x0058], $r3, $r8
mov.half.b32 $r3, s[$ofs2+0x0038]
mul24.half.lo.u32.u16.u16 $r9, $r1.lo, $r4.hi
mad.rn.f32 $r10, s[$ofs1+0x005c], $r3, $r8
mad24.lo.u32.u16.u16.u32 $r9, $r1.hi, $r4.lo, $r9
mov.b32 $r8, s[$ofs2+0x003c]
shl.u32 $r3, $r7, 0x00000002
shl.u32 $r9, $r9, 0x00000010
mad.rn.f32 $r8, s[$ofs1+0x0060], $r8, $r10
mad24.lo.u32 $r7, $r7, c1[0x000c], $r2
mad24.lo.u32 $r3, $r2, c1[0x0004], $r3
mad24.lo.u32.u16.u16.u32 $r1, $r1.lo, $r4.lo, $r9
// Stage 2 load: r4 = g[r5 + 0x40], r2 = g[r6 + r1]; stored into s[] at bases
// 0x0424 and 0x0c64.  Pointers then advance: r6 += 2*r1, r5 += 0x80.
add.b32 $r2, $r5, 0x00000040
mov.u32 $r4, g[$r2]
movsh.b32 $ofs1, $r7, 0x00000002
movsh.b32 $ofs2, $r3, 0x00000000
add.u32 $r3, $r1, $r6
mov.u32 $r2, g[$r3]
mov.b32 s[$ofs1+0x0424], $r4
add.u32 $r6, $r3, $r1
mov.b32 s[$ofs2+0x0c64], $r2
add.b32 $r5, $r5, 0x00000080
bar.sync.u32 0x00000000
// Read offsets for stage 2: $ofs2 = tx*0x44 + 0xc64, $ofs1 = (ty << 6) + 0x424.
// r7 = tx and r2 = ty are reused by the loop-bound computation below.
cvt.u32.u16 $r7, $r0.lo
and.b16 $r1.lo, $r0.hi, c1[0x0000]
mul24.lo.u32 $r3, $r7, 0x00000044
cvt.u32.u16 $r2, $r1.lo
movsh.b32 $ofs2, $r3, 0x00000000
movsh.b32 $ofs1, $r2, 0x00000006
add.b32 $ofs2, $ofs2, 0x00000c64
add.b32 $ofs1, $ofs1, 0x00000424
// Stage 2 mad chain: 16 steps, acc += s[$ofs1+4k] * s[$ofs2+4k], k = 0..15.
// The chain starts from $r8 (stage 1 result) and ends in $r1.
mov.b32 $r1, s[$ofs2+0x0000]
mad.rn.f32 $r3, s[$ofs1+0x0000], $r1, $r8
mov.b32 $r1, s[$ofs2+0x0004]
mad.rn.f32 $r3, s[$ofs1+0x0004], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0008]
mad.rn.f32 $r3, s[$ofs1+0x0008], $r1, $r3
mov.b32 $r1, s[$ofs2+0x000c]
mad.rn.f32 $r3, s[$ofs1+0x000c], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0010]
mad.rn.f32 $r3, s[$ofs1+0x0010], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0014]
mad.rn.f32 $r3, s[$ofs1+0x0014], $r1, $r3
mov.b32 $r1, s[$ofs2+0x0018]
mad.rn.f32 $r8, s[$ofs1+0x0018], $r1, $r3
// The remaining mads are interleaved with the loop-bound computation:
//   r3 = by * wA, r8 = r3 << 4, r1 = ty * wA   (16-bit-halves products)
//   r7 = wA + r8 - 1
//   r3 = A + 4 * (tx + ty*wA + r7)
// The loop repeats while r5 <= r3 (signed compare).
cvt.u32.u16 $r3, s[0x000e]
mov.half.b32 $r1, s[0x001c]
mov.half.b32 $r4, s[$ofs2+0x001c]
mad.rn.f32 $r8, s[$ofs1+0x001c], $r4, $r8
mov.half.b32 $r4, s[$ofs2+0x0020]
mul24.half.lo.u32.u16.u16 $r9, $r3.lo, $r1.hi
mad.rn.f32 $r8, s[$ofs1+0x0020], $r4, $r8
mad24.lo.u32.u16.u16.u32 $r9, $r3.hi, $r1.lo, $r9
mov.half.b32 $r4, s[$ofs2+0x0024]
mul24.half.lo.u32.u16.u16 $r10, $r2.lo, $r1.hi
shl.u32 $r9, $r9, 0x00000010
mad.rn.f32 $r8, s[$ofs1+0x0024], $r4, $r8
mad24.lo.u32.u16.u16.u32 $r10, $r2.hi, $r1.lo, $r10
mov.b32 $r4, s[$ofs2+0x0028]
mad24.lo.u32.u16.u16.u32 $r3, $r3.lo, $r1.lo, $r9
shl.u32 $r9, $r10, 0x00000010
mad.rn.f32 $r4, s[$ofs1+0x0028], $r4, $r8
shl.u32 $r8, $r3, 0x00000004
mov.b32 $r3, s[$ofs2+0x002c]
mad24.lo.u32.u16.u16.u32 $r1, $r2.lo, $r1.lo, $r9
add.u32 $r2, s[0x001c], $r8
mad.rn.f32 $r3, s[$ofs1+0x002c], $r3, $r4
add.half.b32 $r4, $r7, $r1
mov.half.b32 $r1, s[$ofs2+0x0030]
add.b32 $r7, $r2, 0xffffffff
mad.rn.f32 $r2, s[$ofs1+0x0030], $r1, $r3
add.half.b32 $r3, $r4, $r7
mov.half.b32 $r1, s[$ofs2+0x0034]
shl.u32 $r3, $r3, 0x00000002
mad.rn.f32 $r2, s[$ofs1+0x0034], $r1, $r2
mov.half.b32 $r1, s[$ofs2+0x0038]
add.half.b32 $r3, s[0x0014], $r3
mad.rn.f32 $r2, s[$ofs1+0x0038], $r1, $r2
mov.b32 $r1, s[$ofs2+0x003c]
set.le.s32 $p0|$o127, $r5, $r3
// Final mad of the trip leaves the running sum in $r1 for the next trip / store.
mad.rn.f32 $r1, s[$ofs1+0x003c], $r1, $r2
@$p0.ne bra.label label0
// Target named by the join.label above (presumably the reconvergence point).
label1: nop.join

// ---- 4. Epilogue: store $r1 to C + 4*(wB*(16*by + ty) + 16*bx + tx) -------
label2: bar.sync.u32 0x00000000
// r4 = by * 0x10 via the 16-bit-halves product (r3 = 0x10).
cvt.u32.u16 $r2, s[0x000e]
mov.b32 $r3, 0x00000010
mul24.lo.u32.u16.u16 $r4, $r2.lo, $r3.hi
mad24.lo.u32.u16.u16.u32 $r4, $r2.hi, $r3.lo, $r4
shl.u32 $r4, $r4, 0x00000010
// $r0.hi is masked in place here (ty); $r0 is fully overwritten further down.
and.b16 $r0.hi, $r0.hi, c1[0x0000]
mad24.lo.u32.u16.u16.u32 $r4, $r2.lo, $r3.lo, $r4
cvt.u32.u16 $r2, $r0.hi
cvt.u32.u16 $r5, s[0x000c]
// r4 = by*16 + ty;  r2 = wB.
add.half.b32 $r4, $r4, $r2
mov.half.b32 $r2, s[0x0020]
// Interleaved products: r3 = 0x10 * bx, r2 = wB * r4.
mul24.lo.u32.u16.u16 $r6, $r3.hi, $r5.lo
mad24.lo.u32.u16.u16.u32 $r6, $r3.lo, $r5.hi, $r6
mul24.lo.u32.u16.u16 $r7, $r2.lo, $r4.hi
shl.u32 $r6, $r6, 0x00000010
mad24.lo.u32.u16.u16.u32 $r7, $r2.hi, $r4.lo, $r7
mad24.lo.u32.u16.u16.u32 $r3, $r3.lo, $r5.lo, $r6
cvt.u32.u16 $r0, $r0.lo
shl.u32 $r5, $r7, 0x00000010
// r0 = 16*bx + tx, then + wB*(16*by + ty); scaled by 4 and offset from C.
add.u32 $r0, $r3, $r0
mad24.lo.u32.u16.u16.u32 $r2, $r2.lo, $r4.lo, $r5
add.u32 $r0, $r2, $r0
shl.u32 $r0, $r0, 0x00000002
add.u32 $r0, s[0x0010], $r0
// Store the accumulated value to g[r0]; ".end" marks the final instruction.
mov.end.u32 g[$r0], $r1

// Commented-out dump of constant segment 1, referenced above as c1[...]:
//   c1[0x0000] = 0x3ff, c1[0x0004] = 0x44, c1[0x0008] = 0x4, c1[0x000c] = 0x10
#.constseg 1:0x0000 const
#{
#d.32 0x000003ff, 0x00000044, 0x00000004, 0x00000010 // 0000
#}
}