Live registers at the start of the kernel?

Profiling my kernel I’ve noticed that I have 28 live registers at the first SASS call. Are those some requirements for system calls? The number seems to differ between kernels but 28 seems pretty high, how can I check why are those registers required?

Without further context, it’s hard to say why there is this exact number in your case. The Live Registers metric is based on static analysis provided by the compiler, and the number can include registers that need to be retained for ABI calls, i.e. when your kernel calls a different function by creating a separate stack frame. This will normally be at CALL instructions, though.

We fixed an issue regarding live registers in version 2024.1 where in some cases values would be reported too high. If you are using an older version of ncu to open this report, it may be worthwhile trying with 2024.1, or the latest available.

1 Like

Hmm, I’ve looked further into the code and it seems like there is a function call added when going from PTX to sass
this is the sass snippet


Am i understanding correctly that it’s calling a NOP?
It looks like it’s generated from this PTX part

        add.s32         %ridx_s32_1, %ridx_s32_1, 1;
        setp.lt.u32     %pred_pred_1, %ridx_s32_1, 28;
        @%pred_pred_1   bra $loop_1;
        add.s32         %ridx_s32_0, %ridx_s32_0, 1;
        setp.lt.u32     %pred_pred_2, %ridx_s32_0, 8;
        @%pred_pred_2   bra $loop_0;

It doesn’t seem like a very efficient way to do a branch, here is the full PTX kernel that I’m compiling - how can I avoid that extra registers?

 .version 7.5
.target sm_86
.address_size 64
.visible .entry r_16_4_32_2_2_2_2_2_8_28_2_16_4_4_4n1(
        .param .u64 data0,
        .param .u64 data1,
        .param .u64 data2
)
{
        .reg            .u64 %dat_u64_<3>;
        .reg            .f16 %const_f16_<1>;
        .reg            .s32 %alu_s32_<46>;
        .reg            .s64 %cast_s64_<3>;
        .reg            .s64 %alu_s64_<6>;
        .reg            .f32 %acc_f32_<64>;
        .reg            .s32 %ridx_s32_<3>;
        .reg            .f16 %val_f16_<48>;
        .reg            .pred %alu_pred_<2>;
        .reg            .b32 %wmma_b32_<96>;
        .reg            .f32 %wmma_f32_<64>;
        .reg            .pred %pred_pred_<3>;
        .reg            .u32 %lidx5;
        .reg            .u32 %gidx0;
        .reg            .u32 %lidx4;
        .reg            .u32 %gidx1;
        .reg            .u32 %lidx3;
        .reg            .u32 %gidx2;
        .reg            .b64 %fw<6>;
        .reg            .b32 %fd<7>;
        .reg            .u32 %rem_32_<3>;
        .reg            .u64 %rem_64_<3>;
        ld.param.u64    %dat_u64_0, [data0+0];
        ld.param.u64    %dat_u64_1, [data1+0];
        ld.param.u64    %dat_u64_2, [data2+0];
        mov.b16         %const_f16_0, 0x0000;
        mov.u32         %gidx2, %ctaid.x;
        mov.u32         %lidx3, %tid.x;
        mov.u32         %gidx1, %ctaid.y;
        mov.u32         %lidx4, %tid.y;
        mov.u32         %gidx0, %ctaid.z;
        mov.u32         %lidx5, %tid.z;
        shr.s32         %alu_s32_0, %lidx3, 1;
        shr.s32         %alu_s32_1, %lidx3, 2;
        shr.b32          %rem_32_0, %lidx3, 31;
        add.s32          %rem_32_1, %lidx3, %rem_32_0;
        and.b32           %rem_32_2, %rem_32_1, -2;
        sub.s32          %alu_s32_2, %lidx3, %rem_32_2;
        shl.b32         %alu_s32_3, %gidx1, 11;
        mul.lo.s32      %alu_s32_4, %gidx1, 200704;
        shl.b32         %alu_s32_6, %gidx0, 18;
        mul.lo.s32      %alu_s32_7, %gidx0, 25088;
        shl.b32         %alu_s32_8, %lidx5, 7;
        mad.lo.s32      %alu_s32_9, %lidx5, 12544, %alu_s32_4;
        shr.b32          %rem_32_0, %alu_s32_0, 31;
        add.s32          %rem_32_1, %alu_s32_0, %rem_32_0;
        and.b32           %rem_32_2, %rem_32_1, -2;
        sub.s32          %alu_s32_10, %alu_s32_0, %rem_32_2;
        mad.lo.s32      %alu_s32_12, %alu_s32_1, 784, %alu_s32_7;
        mad.lo.s32      %alu_s32_13, %lidx4, 1568, %alu_s32_12;
        mad.lo.s32      %alu_s32_14, %lidx5, 3136, %alu_s32_13;
        mad.lo.s32      %alu_s32_15, %gidx2, 3211264, %alu_s32_14;
        mad.lo.s32      %alu_s32_16, %alu_s32_1, 3136, %alu_s32_9;
        mad.lo.s32      %alu_s32_17, %lidx4, 6272, %alu_s32_16;
        mad.lo.s32      %alu_s32_18, %gidx2, 6422528, %alu_s32_17;
        shl.b32         %alu_s32_19, %alu_s32_2, 1;
        shl.b32         %alu_s32_20, %alu_s32_2, 2;
        shl.b32         %alu_s32_23, %alu_s32_10, 2;
        shl.b32         %alu_s32_24, %alu_s32_10, 3;
        mov.b32         %acc_f32_0, 0f00000000;
        mov.b32         %acc_f32_1, 0f00000000;
        mov.b32         %acc_f32_2, 0f00000000;
        mov.b32         %acc_f32_3, 0f00000000;
        mov.b32         %acc_f32_4, 0f00000000;
        mov.b32         %acc_f32_5, 0f00000000;
        mov.b32         %acc_f32_6, 0f00000000;
        mov.b32         %acc_f32_7, 0f00000000;
        mov.b32         %acc_f32_8, 0f00000000;
        mov.b32         %acc_f32_9, 0f00000000;
        mov.b32         %acc_f32_10, 0f00000000;
        mov.b32         %acc_f32_11, 0f00000000;
        mov.b32         %acc_f32_12, 0f00000000;
        mov.b32         %acc_f32_13, 0f00000000;
        mov.b32         %acc_f32_14, 0f00000000;
        mov.b32         %acc_f32_15, 0f00000000;
        mov.b32         %acc_f32_16, 0f00000000;
        mov.b32         %acc_f32_17, 0f00000000;
        mov.b32         %acc_f32_18, 0f00000000;
        mov.b32         %acc_f32_19, 0f00000000;
        mov.b32         %acc_f32_20, 0f00000000;
        mov.b32         %acc_f32_21, 0f00000000;
        mov.b32         %acc_f32_22, 0f00000000;
        mov.b32         %acc_f32_23, 0f00000000;
        mov.b32         %acc_f32_24, 0f00000000;
        mov.b32         %acc_f32_25, 0f00000000;
        mov.b32         %acc_f32_26, 0f00000000;
        mov.b32         %acc_f32_27, 0f00000000;
        mov.b32         %acc_f32_28, 0f00000000;
        mov.b32         %acc_f32_29, 0f00000000;
        mov.b32         %acc_f32_30, 0f00000000;
        mov.b32         %acc_f32_31, 0f00000000;
        mov.b32         %acc_f32_32, 0f00000000;
        mov.b32         %acc_f32_33, 0f00000000;
        mov.b32         %acc_f32_34, 0f00000000;
        mov.b32         %acc_f32_35, 0f00000000;
        mov.b32         %acc_f32_36, 0f00000000;
        mov.b32         %acc_f32_37, 0f00000000;
        mov.b32         %acc_f32_38, 0f00000000;
        mov.b32         %acc_f32_39, 0f00000000;
        mov.b32         %acc_f32_40, 0f00000000;
        mov.b32         %acc_f32_41, 0f00000000;
        mov.b32         %acc_f32_42, 0f00000000;
        mov.b32         %acc_f32_43, 0f00000000;
        mov.b32         %acc_f32_44, 0f00000000;
        mov.b32         %acc_f32_45, 0f00000000;
        mov.b32         %acc_f32_46, 0f00000000;
        mov.b32         %acc_f32_47, 0f00000000;
        mov.b32         %acc_f32_48, 0f00000000;
        mov.b32         %acc_f32_49, 0f00000000;
        mov.b32         %acc_f32_50, 0f00000000;
        mov.b32         %acc_f32_51, 0f00000000;
        mov.b32         %acc_f32_52, 0f00000000;
        mov.b32         %acc_f32_53, 0f00000000;
        mov.b32         %acc_f32_54, 0f00000000;
        mov.b32         %acc_f32_55, 0f00000000;
        mov.b32         %acc_f32_56, 0f00000000;
        mov.b32         %acc_f32_57, 0f00000000;
        mov.b32         %acc_f32_58, 0f00000000;
        mov.b32         %acc_f32_59, 0f00000000;
        mov.b32         %acc_f32_60, 0f00000000;
        mov.b32         %acc_f32_61, 0f00000000;
        mov.b32         %acc_f32_62, 0f00000000;
        mov.b32         %acc_f32_63, 0f00000000;
        mov.u32         %ridx_s32_0, 0;
$loop_0:
        mad.lo.s32      %alu_s32_32, %ridx_s32_0, 401408, %alu_s32_15;
        mad.lo.s32      %alu_s32_33, %ridx_s32_0, 802816, %alu_s32_18;
        mov.u32         %ridx_s32_1, 0;
$loop_1:
        mad.lo.s32      %alu_s32_34, %ridx_s32_1, 28, %alu_s32_32;
        mad.lo.s32      %alu_s32_35, %ridx_s32_1, 112, %alu_s32_33;
        mov.u32         %ridx_s32_2, 0;
$loop_2:
        shl.b32         %alu_s32_36, %ridx_s32_2, 4;
        add.s32         %alu_s32_37, %alu_s32_34, %alu_s32_36;
        add.s32         %alu_s32_38, %alu_s32_37, %alu_s32_19;
        add.s32         %alu_s32_39, %alu_s32_38, %alu_s32_23;
        mul.wide.s32    %alu_s64_2, %alu_s32_39, 2;
        add.s64         %alu_s64_3, %alu_s64_2, %dat_u64_2;
        ld.global.b16   %val_f16_0, [%alu_s64_3+0];
        ld.global.b16   %val_f16_1, [%alu_s64_3+2];
        ld.global.b16   %val_f16_2, [%alu_s64_3+12544];
        ld.global.b16   %val_f16_3, [%alu_s64_3+12546];
        ld.global.b16   %val_f16_4, [%alu_s64_3+25088];
        ld.global.b16   %val_f16_5, [%alu_s64_3+25090];
        ld.global.b16   %val_f16_6, [%alu_s64_3+37632];
        ld.global.b16   %val_f16_7, [%alu_s64_3+37634];
        add.s32         %alu_s32_40, %alu_s32_36, %alu_s32_19;
        add.s32         %alu_s32_41, %alu_s32_40, %alu_s32_23;
        setp.lt.s32     %alu_pred_0, %alu_s32_41, 19;
        shl.b32         %alu_s32_42, %ridx_s32_2, 5;
        add.s32         %alu_s32_43, %alu_s32_35, %alu_s32_42;
        add.s32         %alu_s32_44, %alu_s32_43, %alu_s32_20;
        add.s32         %alu_s32_45, %alu_s32_44, %alu_s32_24;
        mul.wide.s32    %alu_s64_4, %alu_s32_45, 2;
        add.s64         %alu_s64_5, %alu_s64_4, %dat_u64_1;
        @!%alu_pred_0   bra $if_0;
        ld.global.b16   %val_f16_8, [%alu_s64_5+351268];
        ld.global.b16   %val_f16_9, [%alu_s64_5+301092];
        ld.global.b16   %val_f16_10, [%alu_s64_5+250916];
        ld.global.b16   %val_f16_11, [%alu_s64_5+200740];
        ld.global.b16   %val_f16_12, [%alu_s64_5+150564];
        ld.global.b16   %val_f16_13, [%alu_s64_5+100388];
        ld.global.b16   %val_f16_14, [%alu_s64_5+50212];
        ld.global.b16   %val_f16_15, [%alu_s64_5+36];
        ld.global.b16   %val_f16_16, [%alu_s64_3+37650];
        ld.global.b16   %val_f16_17, [%alu_s64_3+25106];
        ld.global.b16   %val_f16_18, [%alu_s64_3+12562];
        ld.global.b16   %val_f16_19, [%alu_s64_3+18];
        @%alu_pred_0    bra $if_0_true;
$if_0:
        mov.b16         %val_f16_19, %const_f16_0;
        mov.b16         %val_f16_18, %const_f16_0;
        mov.b16         %val_f16_17, %const_f16_0;
        mov.b16         %val_f16_16, %const_f16_0;
        mov.b16         %val_f16_15, %const_f16_0;
        mov.b16         %val_f16_14, %const_f16_0;
        mov.b16         %val_f16_13, %const_f16_0;
        mov.b16         %val_f16_12, %const_f16_0;
        mov.b16         %val_f16_11, %const_f16_0;
        mov.b16         %val_f16_10, %const_f16_0;
        mov.b16         %val_f16_9, %const_f16_0;
        mov.b16         %val_f16_8, %const_f16_0;
$if_0_true:
        setp.lt.s32     %alu_pred_1, %alu_s32_41, 20;
        @!%alu_pred_1   bra $if_1;
        ld.global.b16   %val_f16_20, [%alu_s64_5+351264];
        ld.global.b16   %val_f16_21, [%alu_s64_5+301088];
        ld.global.b16   %val_f16_22, [%alu_s64_5+250912];
        ld.global.b16   %val_f16_23, [%alu_s64_5+200736];
        ld.global.b16   %val_f16_24, [%alu_s64_5+150560];
        ld.global.b16   %val_f16_25, [%alu_s64_5+100384];
        ld.global.b16   %val_f16_26, [%alu_s64_5+50208];
        ld.global.b16   %val_f16_27, [%alu_s64_5+32];
        ld.global.b16   %val_f16_28, [%alu_s64_3+37648];
        ld.global.b16   %val_f16_29, [%alu_s64_3+25104];
        ld.global.b16   %val_f16_30, [%alu_s64_3+12560];
        ld.global.b16   %val_f16_31, [%alu_s64_3+16];
        @%alu_pred_1    bra $if_1_true;
$if_1:
        mov.b16         %val_f16_31, %const_f16_0;
        mov.b16         %val_f16_30, %const_f16_0;
        mov.b16         %val_f16_29, %const_f16_0;
        mov.b16         %val_f16_28, %const_f16_0;
        mov.b16         %val_f16_27, %const_f16_0;
        mov.b16         %val_f16_26, %const_f16_0;
        mov.b16         %val_f16_25, %const_f16_0;
        mov.b16         %val_f16_24, %const_f16_0;
        mov.b16         %val_f16_23, %const_f16_0;
        mov.b16         %val_f16_22, %const_f16_0;
        mov.b16         %val_f16_21, %const_f16_0;
        mov.b16         %val_f16_20, %const_f16_0;
$if_1_true:
        ld.global.b16   %val_f16_32, [%alu_s64_5+0];
        ld.global.b16   %val_f16_33, [%alu_s64_5+4];
        ld.global.b16   %val_f16_34, [%alu_s64_5+50176];
        ld.global.b16   %val_f16_35, [%alu_s64_5+50180];
        mov.b32         %wmma_b32_0, {%val_f16_32, %val_f16_33};
        mov.b32         %wmma_b32_1, {%val_f16_34, %val_f16_35};
        mov.b32         %wmma_b32_2, {%val_f16_27, %val_f16_15};
        mov.b32         %wmma_b32_3, {%val_f16_26, %val_f16_14};
        mov.b32         %wmma_b32_4, {%val_f16_0, %val_f16_1};
        mov.b32         %wmma_b32_5, {%val_f16_31, %val_f16_19};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_0, %wmma_f32_1, %wmma_f32_2, %wmma_f32_3}, {%wmma_b32_0, %wmma_b32_1, %wmma_b32_2, %wmma_b32_3}, {%wmma_b32_4, %wmma_b32_5}, {%acc_f32_0, %acc_f32_1, %acc_f32_2, %acc_f32_3};
        mov.b32         %wmma_b32_6, {%val_f16_32, %val_f16_33};
        mov.b32         %wmma_b32_7, {%val_f16_34, %val_f16_35};
        mov.b32         %wmma_b32_8, {%val_f16_27, %val_f16_15};
        mov.b32         %wmma_b32_9, {%val_f16_26, %val_f16_14};
        mov.b32         %wmma_b32_10, {%val_f16_2, %val_f16_3};
        mov.b32         %wmma_b32_11, {%val_f16_30, %val_f16_18};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_4, %wmma_f32_5, %wmma_f32_6, %wmma_f32_7}, {%wmma_b32_6, %wmma_b32_7, %wmma_b32_8, %wmma_b32_9}, {%wmma_b32_10, %wmma_b32_11}, {%acc_f32_16, %acc_f32_17, %acc_f32_18, %acc_f32_19};
        mov.b32         %wmma_b32_12, {%val_f16_32, %val_f16_33};
        mov.b32         %wmma_b32_13, {%val_f16_34, %val_f16_35};
        mov.b32         %wmma_b32_14, {%val_f16_27, %val_f16_15};
        mov.b32         %wmma_b32_15, {%val_f16_26, %val_f16_14};
        mov.b32         %wmma_b32_16, {%val_f16_4, %val_f16_5};
        mov.b32         %wmma_b32_17, {%val_f16_29, %val_f16_17};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_8, %wmma_f32_9, %wmma_f32_10, %wmma_f32_11}, {%wmma_b32_12, %wmma_b32_13, %wmma_b32_14, %wmma_b32_15}, {%wmma_b32_16, %wmma_b32_17}, {%acc_f32_32, %acc_f32_33, %acc_f32_34, %acc_f32_35};
        mov.b32         %wmma_b32_18, {%val_f16_32, %val_f16_33};
        mov.b32         %wmma_b32_19, {%val_f16_34, %val_f16_35};
        mov.b32         %wmma_b32_20, {%val_f16_27, %val_f16_15};
        mov.b32         %wmma_b32_21, {%val_f16_26, %val_f16_14};
        mov.b32         %wmma_b32_22, {%val_f16_6, %val_f16_7};
        mov.b32         %wmma_b32_23, {%val_f16_28, %val_f16_16};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_12, %wmma_f32_13, %wmma_f32_14, %wmma_f32_15}, {%wmma_b32_18, %wmma_b32_19, %wmma_b32_20, %wmma_b32_21}, {%wmma_b32_22, %wmma_b32_23}, {%acc_f32_48, %acc_f32_49, %acc_f32_50, %acc_f32_51};
        ld.global.b16   %val_f16_36, [%alu_s64_5+100352];
        ld.global.b16   %val_f16_37, [%alu_s64_5+100356];
        ld.global.b16   %val_f16_38, [%alu_s64_5+150528];
        ld.global.b16   %val_f16_39, [%alu_s64_5+150532];
        mov.b32         %wmma_b32_24, {%val_f16_36, %val_f16_37};
        mov.b32         %wmma_b32_25, {%val_f16_38, %val_f16_39};
        mov.b32         %wmma_b32_26, {%val_f16_25, %val_f16_13};
        mov.b32         %wmma_b32_27, {%val_f16_24, %val_f16_12};
        mov.b32         %wmma_b32_28, {%val_f16_0, %val_f16_1};
        mov.b32         %wmma_b32_29, {%val_f16_31, %val_f16_19};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_16, %wmma_f32_17, %wmma_f32_18, %wmma_f32_19}, {%wmma_b32_24, %wmma_b32_25, %wmma_b32_26, %wmma_b32_27}, {%wmma_b32_28, %wmma_b32_29}, {%acc_f32_4, %acc_f32_5, %acc_f32_6, %acc_f32_7};
        mov.b32         %wmma_b32_30, {%val_f16_36, %val_f16_37};
        mov.b32         %wmma_b32_31, {%val_f16_38, %val_f16_39};
        mov.b32         %wmma_b32_32, {%val_f16_25, %val_f16_13};
        mov.b32         %wmma_b32_33, {%val_f16_24, %val_f16_12};
        mov.b32         %wmma_b32_34, {%val_f16_2, %val_f16_3};
        mov.b32         %wmma_b32_35, {%val_f16_30, %val_f16_18};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_20, %wmma_f32_21, %wmma_f32_22, %wmma_f32_23}, {%wmma_b32_30, %wmma_b32_31, %wmma_b32_32, %wmma_b32_33}, {%wmma_b32_34, %wmma_b32_35}, {%acc_f32_20, %acc_f32_21, %acc_f32_22, %acc_f32_23};
        mov.b32         %wmma_b32_36, {%val_f16_36, %val_f16_37};
        mov.b32         %wmma_b32_37, {%val_f16_38, %val_f16_39};
        mov.b32         %wmma_b32_38, {%val_f16_25, %val_f16_13};
        mov.b32         %wmma_b32_39, {%val_f16_24, %val_f16_12};
        mov.b32         %wmma_b32_40, {%val_f16_4, %val_f16_5};
        mov.b32         %wmma_b32_41, {%val_f16_29, %val_f16_17};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_24, %wmma_f32_25, %wmma_f32_26, %wmma_f32_27}, {%wmma_b32_36, %wmma_b32_37, %wmma_b32_38, %wmma_b32_39}, {%wmma_b32_40, %wmma_b32_41}, {%acc_f32_36, %acc_f32_37, %acc_f32_38, %acc_f32_39};
        mov.b32         %wmma_b32_42, {%val_f16_36, %val_f16_37};
        mov.b32         %wmma_b32_43, {%val_f16_38, %val_f16_39};
        mov.b32         %wmma_b32_44, {%val_f16_25, %val_f16_13};
        mov.b32         %wmma_b32_45, {%val_f16_24, %val_f16_12};
        mov.b32         %wmma_b32_46, {%val_f16_6, %val_f16_7};
        mov.b32         %wmma_b32_47, {%val_f16_28, %val_f16_16};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_28, %wmma_f32_29, %wmma_f32_30, %wmma_f32_31}, {%wmma_b32_42, %wmma_b32_43, %wmma_b32_44, %wmma_b32_45}, {%wmma_b32_46, %wmma_b32_47}, {%acc_f32_52, %acc_f32_53, %acc_f32_54, %acc_f32_55};
        ld.global.b16   %val_f16_40, [%alu_s64_5+200704];
        ld.global.b16   %val_f16_41, [%alu_s64_5+200708];
        ld.global.b16   %val_f16_42, [%alu_s64_5+250880];
        ld.global.b16   %val_f16_43, [%alu_s64_5+250884];
        mov.b32         %wmma_b32_48, {%val_f16_40, %val_f16_41};
        mov.b32         %wmma_b32_49, {%val_f16_42, %val_f16_43};
        mov.b32         %wmma_b32_50, {%val_f16_23, %val_f16_11};
        mov.b32         %wmma_b32_51, {%val_f16_22, %val_f16_10};
        mov.b32         %wmma_b32_52, {%val_f16_0, %val_f16_1};
        mov.b32         %wmma_b32_53, {%val_f16_31, %val_f16_19};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_32, %wmma_f32_33, %wmma_f32_34, %wmma_f32_35}, {%wmma_b32_48, %wmma_b32_49, %wmma_b32_50, %wmma_b32_51}, {%wmma_b32_52, %wmma_b32_53}, {%acc_f32_8, %acc_f32_9, %acc_f32_10, %acc_f32_11};
        mov.b32         %wmma_b32_54, {%val_f16_40, %val_f16_41};
        mov.b32         %wmma_b32_55, {%val_f16_42, %val_f16_43};
        mov.b32         %wmma_b32_56, {%val_f16_23, %val_f16_11};
        mov.b32         %wmma_b32_57, {%val_f16_22, %val_f16_10};
        mov.b32         %wmma_b32_58, {%val_f16_2, %val_f16_3};
        mov.b32         %wmma_b32_59, {%val_f16_30, %val_f16_18};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_36, %wmma_f32_37, %wmma_f32_38, %wmma_f32_39}, {%wmma_b32_54, %wmma_b32_55, %wmma_b32_56, %wmma_b32_57}, {%wmma_b32_58, %wmma_b32_59}, {%acc_f32_24, %acc_f32_25, %acc_f32_26, %acc_f32_27};
        mov.b32         %wmma_b32_60, {%val_f16_40, %val_f16_41};
        mov.b32         %wmma_b32_61, {%val_f16_42, %val_f16_43};
        mov.b32         %wmma_b32_62, {%val_f16_23, %val_f16_11};
        mov.b32         %wmma_b32_63, {%val_f16_22, %val_f16_10};
        mov.b32         %wmma_b32_64, {%val_f16_4, %val_f16_5};
        mov.b32         %wmma_b32_65, {%val_f16_29, %val_f16_17};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_40, %wmma_f32_41, %wmma_f32_42, %wmma_f32_43}, {%wmma_b32_60, %wmma_b32_61, %wmma_b32_62, %wmma_b32_63}, {%wmma_b32_64, %wmma_b32_65}, {%acc_f32_40, %acc_f32_41, %acc_f32_42, %acc_f32_43};
        mov.b32         %wmma_b32_66, {%val_f16_40, %val_f16_41};
        mov.b32         %wmma_b32_67, {%val_f16_42, %val_f16_43};
        mov.b32         %wmma_b32_68, {%val_f16_23, %val_f16_11};
        mov.b32         %wmma_b32_69, {%val_f16_22, %val_f16_10};
        mov.b32         %wmma_b32_70, {%val_f16_6, %val_f16_7};
        mov.b32         %wmma_b32_71, {%val_f16_28, %val_f16_16};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_44, %wmma_f32_45, %wmma_f32_46, %wmma_f32_47}, {%wmma_b32_66, %wmma_b32_67, %wmma_b32_68, %wmma_b32_69}, {%wmma_b32_70, %wmma_b32_71}, {%acc_f32_56, %acc_f32_57, %acc_f32_58, %acc_f32_59};
        ld.global.b16   %val_f16_44, [%alu_s64_5+301056];
        ld.global.b16   %val_f16_45, [%alu_s64_5+301060];
        ld.global.b16   %val_f16_46, [%alu_s64_5+351232];
        ld.global.b16   %val_f16_47, [%alu_s64_5+351236];
        mov.b32         %wmma_b32_72, {%val_f16_44, %val_f16_45};
        mov.b32         %wmma_b32_73, {%val_f16_46, %val_f16_47};
        mov.b32         %wmma_b32_74, {%val_f16_21, %val_f16_9};
        mov.b32         %wmma_b32_75, {%val_f16_20, %val_f16_8};
        mov.b32         %wmma_b32_76, {%val_f16_0, %val_f16_1};
        mov.b32         %wmma_b32_77, {%val_f16_31, %val_f16_19};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_48, %wmma_f32_49, %wmma_f32_50, %wmma_f32_51}, {%wmma_b32_72, %wmma_b32_73, %wmma_b32_74, %wmma_b32_75}, {%wmma_b32_76, %wmma_b32_77}, {%acc_f32_12, %acc_f32_13, %acc_f32_14, %acc_f32_15};
        mov.b32         %wmma_b32_78, {%val_f16_44, %val_f16_45};
        mov.b32         %wmma_b32_79, {%val_f16_46, %val_f16_47};
        mov.b32         %wmma_b32_80, {%val_f16_21, %val_f16_9};
        mov.b32         %wmma_b32_81, {%val_f16_20, %val_f16_8};
        mov.b32         %wmma_b32_82, {%val_f16_2, %val_f16_3};
        mov.b32         %wmma_b32_83, {%val_f16_30, %val_f16_18};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_52, %wmma_f32_53, %wmma_f32_54, %wmma_f32_55}, {%wmma_b32_78, %wmma_b32_79, %wmma_b32_80, %wmma_b32_81}, {%wmma_b32_82, %wmma_b32_83}, {%acc_f32_28, %acc_f32_29, %acc_f32_30, %acc_f32_31};
        mov.b32         %wmma_b32_84, {%val_f16_44, %val_f16_45};
        mov.b32         %wmma_b32_85, {%val_f16_46, %val_f16_47};
        mov.b32         %wmma_b32_86, {%val_f16_21, %val_f16_9};
        mov.b32         %wmma_b32_87, {%val_f16_20, %val_f16_8};
        mov.b32         %wmma_b32_88, {%val_f16_4, %val_f16_5};
        mov.b32         %wmma_b32_89, {%val_f16_29, %val_f16_17};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_56, %wmma_f32_57, %wmma_f32_58, %wmma_f32_59}, {%wmma_b32_84, %wmma_b32_85, %wmma_b32_86, %wmma_b32_87}, {%wmma_b32_88, %wmma_b32_89}, {%acc_f32_44, %acc_f32_45, %acc_f32_46, %acc_f32_47};
        mov.b32         %wmma_b32_90, {%val_f16_44, %val_f16_45};
        mov.b32         %wmma_b32_91, {%val_f16_46, %val_f16_47};
        mov.b32         %wmma_b32_92, {%val_f16_21, %val_f16_9};
        mov.b32         %wmma_b32_93, {%val_f16_20, %val_f16_8};
        mov.b32         %wmma_b32_94, {%val_f16_6, %val_f16_7};
        mov.b32         %wmma_b32_95, {%val_f16_28, %val_f16_16};
        mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32                  {%wmma_f32_60, %wmma_f32_61, %wmma_f32_62, %wmma_f32_63}, {%wmma_b32_90, %wmma_b32_91, %wmma_b32_92, %wmma_b32_93}, {%wmma_b32_94, %wmma_b32_95}, {%acc_f32_60, %acc_f32_61, %acc_f32_62, %acc_f32_63};
        mov.b32         %acc_f32_0, %wmma_f32_0;
        mov.b32         %acc_f32_1, %wmma_f32_1;
        mov.b32         %acc_f32_2, %wmma_f32_2;
        mov.b32         %acc_f32_3, %wmma_f32_3;
        mov.b32         %acc_f32_4, %wmma_f32_16;
        mov.b32         %acc_f32_5, %wmma_f32_17;
        mov.b32         %acc_f32_6, %wmma_f32_18;
        mov.b32         %acc_f32_7, %wmma_f32_19;
        mov.b32         %acc_f32_8, %wmma_f32_32;
        mov.b32         %acc_f32_9, %wmma_f32_33;
        mov.b32         %acc_f32_10, %wmma_f32_34;
        mov.b32         %acc_f32_11, %wmma_f32_35;
        mov.b32         %acc_f32_12, %wmma_f32_48;
        mov.b32         %acc_f32_13, %wmma_f32_49;
        mov.b32         %acc_f32_14, %wmma_f32_50;
        mov.b32         %acc_f32_15, %wmma_f32_51;
        mov.b32         %acc_f32_16, %wmma_f32_4;
        mov.b32         %acc_f32_17, %wmma_f32_5;
        mov.b32         %acc_f32_18, %wmma_f32_6;
        mov.b32         %acc_f32_19, %wmma_f32_7;
        mov.b32         %acc_f32_20, %wmma_f32_20;
        mov.b32         %acc_f32_21, %wmma_f32_21;
        mov.b32         %acc_f32_22, %wmma_f32_22;
        mov.b32         %acc_f32_23, %wmma_f32_23;
        mov.b32         %acc_f32_24, %wmma_f32_36;
        mov.b32         %acc_f32_25, %wmma_f32_37;
        mov.b32         %acc_f32_26, %wmma_f32_38;
        mov.b32         %acc_f32_27, %wmma_f32_39;
        mov.b32         %acc_f32_28, %wmma_f32_52;
        mov.b32         %acc_f32_29, %wmma_f32_53;
        mov.b32         %acc_f32_30, %wmma_f32_54;
        mov.b32         %acc_f32_31, %wmma_f32_55;
        mov.b32         %acc_f32_32, %wmma_f32_8;
        mov.b32         %acc_f32_33, %wmma_f32_9;
        mov.b32         %acc_f32_34, %wmma_f32_10;
        mov.b32         %acc_f32_35, %wmma_f32_11;
        mov.b32         %acc_f32_36, %wmma_f32_24;
        mov.b32         %acc_f32_37, %wmma_f32_25;
        mov.b32         %acc_f32_38, %wmma_f32_26;
        mov.b32         %acc_f32_39, %wmma_f32_27;
        mov.b32         %acc_f32_40, %wmma_f32_40;
        mov.b32         %acc_f32_41, %wmma_f32_41;
        mov.b32         %acc_f32_42, %wmma_f32_42;
        mov.b32         %acc_f32_43, %wmma_f32_43;
        mov.b32         %acc_f32_44, %wmma_f32_56;
        mov.b32         %acc_f32_45, %wmma_f32_57;
        mov.b32         %acc_f32_46, %wmma_f32_58;
        mov.b32         %acc_f32_47, %wmma_f32_59;
        mov.b32         %acc_f32_48, %wmma_f32_12;
        mov.b32         %acc_f32_49, %wmma_f32_13;
        mov.b32         %acc_f32_50, %wmma_f32_14;
        mov.b32         %acc_f32_51, %wmma_f32_15;
        mov.b32         %acc_f32_52, %wmma_f32_28;
        mov.b32         %acc_f32_53, %wmma_f32_29;
        mov.b32         %acc_f32_54, %wmma_f32_30;
        mov.b32         %acc_f32_55, %wmma_f32_31;
        mov.b32         %acc_f32_56, %wmma_f32_44;
        mov.b32         %acc_f32_57, %wmma_f32_45;
        mov.b32         %acc_f32_58, %wmma_f32_46;
        mov.b32         %acc_f32_59, %wmma_f32_47;
        mov.b32         %acc_f32_60, %wmma_f32_60;
        mov.b32         %acc_f32_61, %wmma_f32_61;
        mov.b32         %acc_f32_62, %wmma_f32_62;
        mov.b32         %acc_f32_63, %wmma_f32_63;
        add.s32         %ridx_s32_2, %ridx_s32_2, 1;
        setp.lt.u32     %pred_pred_0, %ridx_s32_2, 2;
        @%pred_pred_0   bra $loop_2;
        add.s32         %ridx_s32_1, %ridx_s32_1, 1;
        setp.lt.u32     %pred_pred_1, %ridx_s32_1, 28;
        @%pred_pred_1   bra $loop_1;
        add.s32         %ridx_s32_0, %ridx_s32_0, 1;
        setp.lt.u32     %pred_pred_2, %ridx_s32_0, 8;
        @%pred_pred_2   bra $loop_0;
        shl.b32         %alu_s32_21, %alu_s32_2, 14;
        add.s32         %alu_s32_22, %alu_s32_6, %alu_s32_3;
        shl.b32         %alu_s32_25, %alu_s32_10, 15;
        add.s32         %alu_s32_26, %alu_s32_22, %gidx2;
        add.s32         %alu_s32_27, %alu_s32_26, %alu_s32_21;
        add.s32         %alu_s32_28, %alu_s32_27, %alu_s32_8;
        add.s32         %alu_s32_29, %alu_s32_28, %alu_s32_25;
        shl.b32         %alu_s32_11, %alu_s32_1, 5;
        add.s32         %alu_s32_30, %alu_s32_29, %alu_s32_11;
        mov.u32         %lidx4, %tid.y;
        shl.b32         %alu_s32_5, %lidx4, 6;
        add.s32         %alu_s32_31, %alu_s32_30, %alu_s32_5;
        mul.wide.s32    %alu_s64_0, %alu_s32_31, 4;
        add.s64         %alu_s64_1, %alu_s64_0, %dat_u64_0;
        st.global.f32   [%alu_s64_1+0], %acc_f32_0;
        st.global.f32   [%alu_s64_1+1024], %acc_f32_2;
        st.global.f32   [%alu_s64_1+2048], %acc_f32_4;
        st.global.f32   [%alu_s64_1+3072], %acc_f32_6;
        st.global.f32   [%alu_s64_1+4096], %acc_f32_8;
        st.global.f32   [%alu_s64_1+5120], %acc_f32_10;
        st.global.f32   [%alu_s64_1+6144], %acc_f32_12;
        st.global.f32   [%alu_s64_1+7168], %acc_f32_14;
        st.global.f32   [%alu_s64_1+32768], %acc_f32_1;
        st.global.f32   [%alu_s64_1+33792], %acc_f32_3;
        st.global.f32   [%alu_s64_1+34816], %acc_f32_5;
        st.global.f32   [%alu_s64_1+35840], %acc_f32_7;
        st.global.f32   [%alu_s64_1+36864], %acc_f32_9;
        st.global.f32   [%alu_s64_1+37888], %acc_f32_11;
        st.global.f32   [%alu_s64_1+38912], %acc_f32_13;
        st.global.f32   [%alu_s64_1+39936], %acc_f32_15;
        st.global.f32   [%alu_s64_1+262144], %acc_f32_16;
        st.global.f32   [%alu_s64_1+263168], %acc_f32_18;
        st.global.f32   [%alu_s64_1+264192], %acc_f32_20;
        st.global.f32   [%alu_s64_1+265216], %acc_f32_22;
        st.global.f32   [%alu_s64_1+266240], %acc_f32_24;
        st.global.f32   [%alu_s64_1+267264], %acc_f32_26;
        st.global.f32   [%alu_s64_1+268288], %acc_f32_28;
        st.global.f32   [%alu_s64_1+269312], %acc_f32_30;
        st.global.f32   [%alu_s64_1+294912], %acc_f32_17;
        st.global.f32   [%alu_s64_1+295936], %acc_f32_19;
        st.global.f32   [%alu_s64_1+296960], %acc_f32_21;
        st.global.f32   [%alu_s64_1+297984], %acc_f32_23;
        st.global.f32   [%alu_s64_1+299008], %acc_f32_25;
        st.global.f32   [%alu_s64_1+300032], %acc_f32_27;
        st.global.f32   [%alu_s64_1+301056], %acc_f32_29;
        st.global.f32   [%alu_s64_1+302080], %acc_f32_31;
        st.global.f32   [%alu_s64_1+524288], %acc_f32_32;
        st.global.f32   [%alu_s64_1+525312], %acc_f32_34;
        st.global.f32   [%alu_s64_1+526336], %acc_f32_36;
        st.global.f32   [%alu_s64_1+527360], %acc_f32_38;
        st.global.f32   [%alu_s64_1+528384], %acc_f32_40;
        st.global.f32   [%alu_s64_1+529408], %acc_f32_42;
        st.global.f32   [%alu_s64_1+530432], %acc_f32_44;
        st.global.f32   [%alu_s64_1+531456], %acc_f32_46;
        st.global.f32   [%alu_s64_1+557056], %acc_f32_33;
        st.global.f32   [%alu_s64_1+558080], %acc_f32_35;
        st.global.f32   [%alu_s64_1+559104], %acc_f32_37;
        st.global.f32   [%alu_s64_1+560128], %acc_f32_39;
        st.global.f32   [%alu_s64_1+561152], %acc_f32_41;
        st.global.f32   [%alu_s64_1+562176], %acc_f32_43;
        st.global.f32   [%alu_s64_1+563200], %acc_f32_45;
        st.global.f32   [%alu_s64_1+564224], %acc_f32_47;
        st.global.f32   [%alu_s64_1+786432], %acc_f32_48;
        st.global.f32   [%alu_s64_1+787456], %acc_f32_50;
        st.global.f32   [%alu_s64_1+788480], %acc_f32_52;
        st.global.f32   [%alu_s64_1+789504], %acc_f32_54;
        st.global.f32   [%alu_s64_1+790528], %acc_f32_56;
        st.global.f32   [%alu_s64_1+791552], %acc_f32_58;
        st.global.f32   [%alu_s64_1+792576], %acc_f32_60;
        st.global.f32   [%alu_s64_1+793600], %acc_f32_62;
        st.global.f32   [%alu_s64_1+819200], %acc_f32_49;
        st.global.f32   [%alu_s64_1+820224], %acc_f32_51;
        st.global.f32   [%alu_s64_1+821248], %acc_f32_53;
        st.global.f32   [%alu_s64_1+822272], %acc_f32_55;
        st.global.f32   [%alu_s64_1+823296], %acc_f32_57;
        st.global.f32   [%alu_s64_1+824320], %acc_f32_59;
        st.global.f32   [%alu_s64_1+825344], %acc_f32_61;
        st.global.f32   [%alu_s64_1+826368], %acc_f32_63;
        ret;
}