I’m in the middle of profiling 2 PTX kernels, kernel A:
// PTX ISA 7.5, compiled for sm_86 (Ampere), 64-bit addressing.
.version 7.5
.target sm_86
.address_size 64
// Kernel A: per-thread reduction producing 16 f32 results.
// Each thread runs a 1024-iteration loop; every iteration loads
// 4x float4 from data1 and 4x float4 from data2 and forms 16
// partial results (the "outer product" of the 4 data1 vectors with
// the 4 data2 vectors). Each partial is a 4-term dot product built
// as mul + 3 fma, kept independent of the accumulator, and then
// folded in with a separate add.f32 — this is the only point where
// kernel A differs from kernel B. Results go to data0 as four
// float4 stores. (Flat register/constant style suggests generated
// code — presumably from a tensor compiler; the exact layout of the
// 6D index math below is taken on faith from the generator.)
.visible .entry r_2_2_64_5_2_16_4_1024_4_4_4n2(
.param .u64 data0,
.param .u64 data1,
.param .u64 data2
)
{
// Virtual register banks; ptxas performs the physical allocation
// (56 regs, no spills, per the ptxas -v output quoted below).
.reg .u64 %dat_u64_<3>;
.reg .f32 %acc_f32_<16>;
.reg .s32 %const_s32_<25>;
.reg .s32 %alu_s32_<37>;
.reg .u64 %cast_u64_<3>;
.reg .u64 %alu_u64_<3>;
.reg .s32 %ridx_s32_<1>;
.reg .f32 %val_f32_<32>;
.reg .f32 %alu_f32_<80>;
.reg .pred %pred_pred_<1>;
.reg .u32 %lidx6;
.reg .u32 %lidx5;
.reg .u32 %lidx4;
.reg .u32 %gidx2;
.reg .u32 %gidx1;
.reg .u32 %gidx0;
// Buffer base pointers from kernel parameter space.
ld.param.u64 %dat_u64_0, [data0+0];
ld.param.u64 %dat_u64_1, [data1+0];
ld.param.u64 %dat_u64_2, [data2+0];
// Grid/block coordinates; note the z/y/x -> gidx0/1/2 mapping.
mov.u32 %gidx0, %ctaid.z;
mov.u32 %gidx1, %ctaid.y;
mov.u32 %gidx2, %ctaid.x;
mov.u32 %lidx4, %tid.z;
mov.u32 %lidx5, %tid.y;
mov.u32 %lidx6, %tid.x;
// Zero the 16 accumulators (0f00000000 == 0.0f).
mov.b32 %acc_f32_0, 0f00000000;
mov.b32 %acc_f32_1, 0f00000000;
mov.b32 %acc_f32_2, 0f00000000;
mov.b32 %acc_f32_3, 0f00000000;
mov.b32 %acc_f32_4, 0f00000000;
mov.b32 %acc_f32_5, 0f00000000;
mov.b32 %acc_f32_6, 0f00000000;
mov.b32 %acc_f32_7, 0f00000000;
mov.b32 %acc_f32_8, 0f00000000;
mov.b32 %acc_f32_9, 0f00000000;
mov.b32 %acc_f32_10, 0f00000000;
mov.b32 %acc_f32_11, 0f00000000;
mov.b32 %acc_f32_12, 0f00000000;
mov.b32 %acc_f32_13, 0f00000000;
mov.b32 %acc_f32_14, 0f00000000;
mov.b32 %acc_f32_15, 0f00000000;
// Integer constants and per-thread index arithmetic. Some constants
// below (e.g. 320, 32768, 49152, 3840, 480 and the extra zeros) are
// materialized but never read — dead values; those offsets were
// folded into the ld/st immediate offsets instead.
mov.b32 %const_s32_0, 0;
mov.b32 %const_s32_1, 1024;
mov.b32 %const_s32_2, 134217728;
// %alu_s32_9 (built up below) = element base index into data1.
mul.lo.s32 %alu_s32_0, %gidx0, %const_s32_2;
mov.b32 %const_s32_3, 67108864;
mul.lo.s32 %alu_s32_1, %gidx1, %const_s32_3;
add.s32 %alu_s32_2, %alu_s32_0, %alu_s32_1;
mov.b32 %const_s32_4, 5;
// gidx2 / 5: non-power-of-two divisor — lowers to the MUFU.RCP
// reciprocal sequence visible at the top of the SASS.
div.s32 %alu_s32_3, %gidx2, %const_s32_4;
mov.b32 %const_s32_5, 262144;
mul.lo.s32 %alu_s32_4, %alu_s32_3, %const_s32_5;
add.s32 %alu_s32_5, %alu_s32_2, %alu_s32_4;
mov.b32 %const_s32_6, 16384;
mul.lo.s32 %alu_s32_6, %lidx5, %const_s32_6;
add.s32 %alu_s32_7, %alu_s32_5, %alu_s32_6;
mov.b32 %const_s32_7, 16777216;
mul.lo.s32 %alu_s32_8, %lidx6, %const_s32_7;
add.s32 %alu_s32_9, %alu_s32_7, %alu_s32_8;
mov.b32 %const_s32_8, 4;
// %alu_s32_19 (built up below) = element base index into data2.
mov.b32 %const_s32_9, 1310720;
mul.lo.s32 %alu_s32_10, %gidx0, %const_s32_9;
mov.b32 %const_s32_10, 160;
mul.lo.s32 %alu_s32_11, %gidx1, %const_s32_10;
add.s32 %alu_s32_12, %alu_s32_10, %alu_s32_11;
// gidx2 % 5, paired with the division above.
rem.s32 %alu_s32_13, %gidx2, %const_s32_4;
mov.b32 %const_s32_11, 8;
mul.lo.s32 %alu_s32_14, %alu_s32_13, %const_s32_11;
add.s32 %alu_s32_15, %alu_s32_12, %alu_s32_14;
mul.lo.s32 %alu_s32_16, %lidx4, %const_s32_8;
add.s32 %alu_s32_17, %alu_s32_15, %alu_s32_16;
mov.b32 %const_s32_12, 40;
mul.lo.s32 %alu_s32_18, %lidx6, %const_s32_12;
add.s32 %alu_s32_19, %alu_s32_17, %alu_s32_18;
mov.b32 %const_s32_13, 1280;
mov.b32 %const_s32_14, 320;
// %alu_s32_29 (built up below) = element base index of this
// thread's output in data0.
mov.b32 %const_s32_15, 655360;
mul.lo.s32 %alu_s32_20, %gidx1, %const_s32_15;
add.s32 %alu_s32_21, %alu_s32_10, %alu_s32_20;
mov.b32 %const_s32_16, 2560;
mul.lo.s32 %alu_s32_22, %alu_s32_3, %const_s32_16;
add.s32 %alu_s32_23, %alu_s32_21, %alu_s32_22;
add.s32 %alu_s32_24, %alu_s32_23, %alu_s32_14;
add.s32 %alu_s32_25, %alu_s32_24, %alu_s32_16;
mul.lo.s32 %alu_s32_26, %lidx5, %const_s32_10;
add.s32 %alu_s32_27, %alu_s32_25, %alu_s32_26;
mov.b32 %const_s32_17, 163840;
mul.lo.s32 %alu_s32_28, %lidx6, %const_s32_17;
add.s32 %alu_s32_29, %alu_s32_27, %alu_s32_28;
mov.b32 %const_s32_18, 0;
mov.b32 %const_s32_19, 32768;
mov.b32 %const_s32_20, 49152;
mov.b32 %const_s32_21, 0;
mov.b32 %const_s32_22, 3840;
// Scale by 4 (sizeof f32) to a byte offset and form the output
// pointer once, outside the loop.
mul.lo.s32 %alu_s32_30, %alu_s32_29, %const_s32_8;
mov.b32 %const_s32_23, 0;
cvt.u64.s32 %cast_u64_0, %alu_s32_30;
add.u64 %alu_u64_0, %dat_u64_0, %cast_u64_0;
mov.b32 %const_s32_24, 480;
// Reduce loop counter: %ridx_s32_0 in [0, 1024).
mov.u32 %ridx_s32_0, %const_s32_0;
$loop_0:
// data1 address for this iteration: (base + 4*ridx) elements,
// scaled by 4 bytes; then four 16-byte vector loads 16384 B apart.
mul.lo.s32 %alu_s32_31, %ridx_s32_0, %const_s32_8;
add.s32 %alu_s32_32, %alu_s32_9, %alu_s32_31;
mul.lo.s32 %alu_s32_33, %alu_s32_32, %const_s32_8;
cvt.u64.s32 %cast_u64_1, %alu_s32_33;
add.u64 %alu_u64_1, %dat_u64_1, %cast_u64_1;
ld.global.v4.f32 {%val_f32_0, %val_f32_1, %val_f32_2, %val_f32_3}, [%alu_u64_1+0];
ld.global.v4.f32 {%val_f32_4, %val_f32_5, %val_f32_6, %val_f32_7}, [%alu_u64_1+16384];
ld.global.v4.f32 {%val_f32_8, %val_f32_9, %val_f32_10, %val_f32_11}, [%alu_u64_1+32768];
ld.global.v4.f32 {%val_f32_12, %val_f32_13, %val_f32_14, %val_f32_15}, [%alu_u64_1+49152];
// data2 address: (base + 1280*ridx) elements; four 16-byte vector
// loads 1280 B apart.
mul.lo.s32 %alu_s32_34, %ridx_s32_0, %const_s32_13;
add.s32 %alu_s32_35, %alu_s32_19, %alu_s32_34;
mul.lo.s32 %alu_s32_36, %alu_s32_35, %const_s32_8;
cvt.u64.s32 %cast_u64_2, %alu_s32_36;
add.u64 %alu_u64_2, %dat_u64_2, %cast_u64_2;
ld.global.v4.f32 {%val_f32_16, %val_f32_17, %val_f32_18, %val_f32_19}, [%alu_u64_2+0];
ld.global.v4.f32 {%val_f32_20, %val_f32_21, %val_f32_22, %val_f32_23}, [%alu_u64_2+1280];
ld.global.v4.f32 {%val_f32_24, %val_f32_25, %val_f32_26, %val_f32_27}, [%alu_u64_2+2560];
ld.global.v4.f32 {%val_f32_28, %val_f32_29, %val_f32_30, %val_f32_31}, [%alu_u64_2+3840];
// 16 partial dot products, each mul + 3 fma, with NO dependence on
// the accumulators — the accumulator is added separately below.
mul.f32 %alu_f32_0, %val_f32_0, %val_f32_16;
fma.rn.f32 %alu_f32_1, %val_f32_1, %val_f32_20, %alu_f32_0;
fma.rn.f32 %alu_f32_2, %val_f32_2, %val_f32_24, %alu_f32_1;
fma.rn.f32 %alu_f32_3, %val_f32_3, %val_f32_28, %alu_f32_2;
mul.f32 %alu_f32_4, %val_f32_0, %val_f32_17;
fma.rn.f32 %alu_f32_5, %val_f32_1, %val_f32_21, %alu_f32_4;
fma.rn.f32 %alu_f32_6, %val_f32_2, %val_f32_25, %alu_f32_5;
fma.rn.f32 %alu_f32_7, %val_f32_3, %val_f32_29, %alu_f32_6;
mul.f32 %alu_f32_8, %val_f32_0, %val_f32_18;
fma.rn.f32 %alu_f32_9, %val_f32_1, %val_f32_22, %alu_f32_8;
fma.rn.f32 %alu_f32_10, %val_f32_2, %val_f32_26, %alu_f32_9;
fma.rn.f32 %alu_f32_11, %val_f32_3, %val_f32_30, %alu_f32_10;
mul.f32 %alu_f32_12, %val_f32_0, %val_f32_19;
fma.rn.f32 %alu_f32_13, %val_f32_1, %val_f32_23, %alu_f32_12;
fma.rn.f32 %alu_f32_14, %val_f32_2, %val_f32_27, %alu_f32_13;
fma.rn.f32 %alu_f32_15, %val_f32_3, %val_f32_31, %alu_f32_14;
mul.f32 %alu_f32_16, %val_f32_4, %val_f32_16;
fma.rn.f32 %alu_f32_17, %val_f32_5, %val_f32_20, %alu_f32_16;
fma.rn.f32 %alu_f32_18, %val_f32_6, %val_f32_24, %alu_f32_17;
fma.rn.f32 %alu_f32_19, %val_f32_7, %val_f32_28, %alu_f32_18;
mul.f32 %alu_f32_20, %val_f32_4, %val_f32_17;
fma.rn.f32 %alu_f32_21, %val_f32_5, %val_f32_21, %alu_f32_20;
fma.rn.f32 %alu_f32_22, %val_f32_6, %val_f32_25, %alu_f32_21;
fma.rn.f32 %alu_f32_23, %val_f32_7, %val_f32_29, %alu_f32_22;
mul.f32 %alu_f32_24, %val_f32_4, %val_f32_18;
fma.rn.f32 %alu_f32_25, %val_f32_5, %val_f32_22, %alu_f32_24;
fma.rn.f32 %alu_f32_26, %val_f32_6, %val_f32_26, %alu_f32_25;
fma.rn.f32 %alu_f32_27, %val_f32_7, %val_f32_30, %alu_f32_26;
mul.f32 %alu_f32_28, %val_f32_4, %val_f32_19;
fma.rn.f32 %alu_f32_29, %val_f32_5, %val_f32_23, %alu_f32_28;
fma.rn.f32 %alu_f32_30, %val_f32_6, %val_f32_27, %alu_f32_29;
fma.rn.f32 %alu_f32_31, %val_f32_7, %val_f32_31, %alu_f32_30;
mul.f32 %alu_f32_32, %val_f32_8, %val_f32_16;
fma.rn.f32 %alu_f32_33, %val_f32_9, %val_f32_20, %alu_f32_32;
fma.rn.f32 %alu_f32_34, %val_f32_10, %val_f32_24, %alu_f32_33;
fma.rn.f32 %alu_f32_35, %val_f32_11, %val_f32_28, %alu_f32_34;
mul.f32 %alu_f32_36, %val_f32_8, %val_f32_17;
fma.rn.f32 %alu_f32_37, %val_f32_9, %val_f32_21, %alu_f32_36;
fma.rn.f32 %alu_f32_38, %val_f32_10, %val_f32_25, %alu_f32_37;
fma.rn.f32 %alu_f32_39, %val_f32_11, %val_f32_29, %alu_f32_38;
mul.f32 %alu_f32_40, %val_f32_8, %val_f32_18;
fma.rn.f32 %alu_f32_41, %val_f32_9, %val_f32_22, %alu_f32_40;
fma.rn.f32 %alu_f32_42, %val_f32_10, %val_f32_26, %alu_f32_41;
fma.rn.f32 %alu_f32_43, %val_f32_11, %val_f32_30, %alu_f32_42;
mul.f32 %alu_f32_44, %val_f32_8, %val_f32_19;
fma.rn.f32 %alu_f32_45, %val_f32_9, %val_f32_23, %alu_f32_44;
fma.rn.f32 %alu_f32_46, %val_f32_10, %val_f32_27, %alu_f32_45;
fma.rn.f32 %alu_f32_47, %val_f32_11, %val_f32_31, %alu_f32_46;
mul.f32 %alu_f32_48, %val_f32_12, %val_f32_16;
fma.rn.f32 %alu_f32_49, %val_f32_13, %val_f32_20, %alu_f32_48;
fma.rn.f32 %alu_f32_50, %val_f32_14, %val_f32_24, %alu_f32_49;
fma.rn.f32 %alu_f32_51, %val_f32_15, %val_f32_28, %alu_f32_50;
mul.f32 %alu_f32_52, %val_f32_12, %val_f32_17;
fma.rn.f32 %alu_f32_53, %val_f32_13, %val_f32_21, %alu_f32_52;
fma.rn.f32 %alu_f32_54, %val_f32_14, %val_f32_25, %alu_f32_53;
fma.rn.f32 %alu_f32_55, %val_f32_15, %val_f32_29, %alu_f32_54;
mul.f32 %alu_f32_56, %val_f32_12, %val_f32_18;
fma.rn.f32 %alu_f32_57, %val_f32_13, %val_f32_22, %alu_f32_56;
fma.rn.f32 %alu_f32_58, %val_f32_14, %val_f32_26, %alu_f32_57;
fma.rn.f32 %alu_f32_59, %val_f32_15, %val_f32_30, %alu_f32_58;
mul.f32 %alu_f32_60, %val_f32_12, %val_f32_19;
fma.rn.f32 %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
fma.rn.f32 %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
fma.rn.f32 %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
// Fold the 16 finished dot products into the accumulators with
// separate add.f32 ops (kernel B instead starts each fma chain
// from the accumulator itself).
add.f32 %alu_f32_64, %alu_f32_3, %acc_f32_0;
mov.b32 %acc_f32_0, %alu_f32_64;
add.f32 %alu_f32_65, %alu_f32_7, %acc_f32_1;
mov.b32 %acc_f32_1, %alu_f32_65;
add.f32 %alu_f32_66, %alu_f32_11, %acc_f32_2;
mov.b32 %acc_f32_2, %alu_f32_66;
add.f32 %alu_f32_67, %alu_f32_15, %acc_f32_3;
mov.b32 %acc_f32_3, %alu_f32_67;
add.f32 %alu_f32_68, %alu_f32_19, %acc_f32_4;
mov.b32 %acc_f32_4, %alu_f32_68;
add.f32 %alu_f32_69, %alu_f32_23, %acc_f32_5;
mov.b32 %acc_f32_5, %alu_f32_69;
add.f32 %alu_f32_70, %alu_f32_27, %acc_f32_6;
mov.b32 %acc_f32_6, %alu_f32_70;
add.f32 %alu_f32_71, %alu_f32_31, %acc_f32_7;
mov.b32 %acc_f32_7, %alu_f32_71;
add.f32 %alu_f32_72, %alu_f32_35, %acc_f32_8;
mov.b32 %acc_f32_8, %alu_f32_72;
add.f32 %alu_f32_73, %alu_f32_39, %acc_f32_9;
mov.b32 %acc_f32_9, %alu_f32_73;
add.f32 %alu_f32_74, %alu_f32_43, %acc_f32_10;
mov.b32 %acc_f32_10, %alu_f32_74;
add.f32 %alu_f32_75, %alu_f32_47, %acc_f32_11;
mov.b32 %acc_f32_11, %alu_f32_75;
add.f32 %alu_f32_76, %alu_f32_51, %acc_f32_12;
mov.b32 %acc_f32_12, %alu_f32_76;
add.f32 %alu_f32_77, %alu_f32_55, %acc_f32_13;
mov.b32 %acc_f32_13, %alu_f32_77;
add.f32 %alu_f32_78, %alu_f32_59, %acc_f32_14;
mov.b32 %acc_f32_14, %alu_f32_78;
add.f32 %alu_f32_79, %alu_f32_63, %acc_f32_15;
mov.b32 %acc_f32_15, %alu_f32_79;
// Loop control: branch back while ridx < 1024. The complementary
// branch to the label that immediately follows is redundant
// (fall-through would reach it anyway) but harmless.
add.s32 %ridx_s32_0, %ridx_s32_0, 1;
setp.lt.s32 %pred_pred_0, %ridx_s32_0, %const_s32_1;
@%pred_pred_0 bra $loop_0;
@!%pred_pred_0 bra $loop_0_exit;
$loop_0_exit:
// Store the 16 results as four float4 stores, 160 B (40 f32) apart.
st.global.v4.f32 [%alu_u64_0+0], {%acc_f32_0, %acc_f32_1, %acc_f32_2, %acc_f32_3};
st.global.v4.f32 [%alu_u64_0+160], {%acc_f32_4, %acc_f32_5, %acc_f32_6, %acc_f32_7};
st.global.v4.f32 [%alu_u64_0+320], {%acc_f32_8, %acc_f32_9, %acc_f32_10, %acc_f32_11};
st.global.v4.f32 [%alu_u64_0+480], {%acc_f32_12, %acc_f32_13, %acc_f32_14, %acc_f32_15};
ret;
}
and kernel B:
// PTX ISA 7.5, compiled for sm_86 (Ampere), 64-bit addressing.
.version 7.5
.target sm_86
.address_size 64
// Kernel B: identical reduction to kernel A except for the reduce
// step inside the loop: each 4-term dot product starts its fma
// chain FROM the accumulator (fma x4) instead of kernel A's
// mul + 3 fma followed by a separate add.f32. Consequence: each
// fma chain now carries a loop-carried dependence on the
// accumulator from its very first instruction, which changes how
// ptxas can schedule the loads relative to the math (visible in
// the SASS dumps below). Index math, loads, and stores are
// byte-for-byte the same as kernel A, only reordered.
.visible .entry r_2_2_64_5_2_16_4_1024_4_4_4n1(
.param .u64 data0,
.param .u64 data1,
.param .u64 data2
)
{
// Same virtual register banks as kernel A, except only 64 f32
// temporaries are needed (no separate mul results or add temps).
.reg .u64 %dat_u64_<3>;
.reg .f32 %acc_f32_<16>;
.reg .s32 %const_s32_<25>;
.reg .s32 %alu_s32_<37>;
.reg .u64 %cast_u64_<3>;
.reg .u64 %alu_u64_<3>;
.reg .s32 %ridx_s32_<1>;
.reg .f32 %val_f32_<32>;
.reg .f32 %alu_f32_<64>;
.reg .pred %pred_pred_<1>;
.reg .u32 %lidx6;
.reg .u32 %lidx5;
.reg .u32 %lidx4;
.reg .u32 %gidx2;
.reg .u32 %gidx1;
.reg .u32 %gidx0;
// Buffer base pointers from kernel parameter space.
ld.param.u64 %dat_u64_0, [data0+0];
ld.param.u64 %dat_u64_1, [data1+0];
ld.param.u64 %dat_u64_2, [data2+0];
// Grid/block coordinates; same z/y/x -> gidx0/1/2 mapping as A.
mov.u32 %gidx0, %ctaid.z;
mov.u32 %gidx1, %ctaid.y;
mov.u32 %gidx2, %ctaid.x;
mov.u32 %lidx4, %tid.z;
mov.u32 %lidx5, %tid.y;
mov.u32 %lidx6, %tid.x;
// Zero the 16 accumulators (0f00000000 == 0.0f).
mov.b32 %acc_f32_0, 0f00000000;
mov.b32 %acc_f32_1, 0f00000000;
mov.b32 %acc_f32_2, 0f00000000;
mov.b32 %acc_f32_3, 0f00000000;
mov.b32 %acc_f32_4, 0f00000000;
mov.b32 %acc_f32_5, 0f00000000;
mov.b32 %acc_f32_6, 0f00000000;
mov.b32 %acc_f32_7, 0f00000000;
mov.b32 %acc_f32_8, 0f00000000;
mov.b32 %acc_f32_9, 0f00000000;
mov.b32 %acc_f32_10, 0f00000000;
mov.b32 %acc_f32_11, 0f00000000;
mov.b32 %acc_f32_12, 0f00000000;
mov.b32 %acc_f32_13, 0f00000000;
mov.b32 %acc_f32_14, 0f00000000;
mov.b32 %acc_f32_15, 0f00000000;
// Constants (several below are never read; offsets folded into
// the ld/st immediates, as in kernel A).
mov.b32 %const_s32_0, 0;
mov.b32 %const_s32_1, 1024;
mov.b32 %const_s32_2, 134217728;
mov.b32 %const_s32_3, 67108864;
mov.b32 %const_s32_4, 5;
mov.b32 %const_s32_5, 262144;
mov.b32 %const_s32_6, 16384;
mov.b32 %const_s32_7, 16777216;
// %alu_s32_9 = element base index into data1 (same sum as A,
// accumulated in the opposite order).
mul.lo.s32 %alu_s32_0, %lidx6, %const_s32_7;
mul.lo.s32 %alu_s32_1, %lidx5, %const_s32_6;
// gidx2 / 5 — non-power-of-two divide, MUFU.RCP sequence in SASS.
div.s32 %alu_s32_2, %gidx2, %const_s32_4;
mul.lo.s32 %alu_s32_3, %alu_s32_2, %const_s32_5;
mul.lo.s32 %alu_s32_4, %gidx1, %const_s32_3;
mul.lo.s32 %alu_s32_5, %gidx0, %const_s32_2;
add.s32 %alu_s32_6, %alu_s32_5, %alu_s32_4;
add.s32 %alu_s32_7, %alu_s32_6, %alu_s32_3;
add.s32 %alu_s32_8, %alu_s32_7, %alu_s32_1;
add.s32 %alu_s32_9, %alu_s32_8, %alu_s32_0;
mov.b32 %const_s32_8, 4;
mov.b32 %const_s32_9, 1310720;
mov.b32 %const_s32_10, 160;
mov.b32 %const_s32_11, 8;
mov.b32 %const_s32_12, 40;
// %alu_s32_19 = element base index into data2.
mul.lo.s32 %alu_s32_10, %lidx6, %const_s32_12;
mul.lo.s32 %alu_s32_11, %lidx4, %const_s32_8;
rem.s32 %alu_s32_12, %gidx2, %const_s32_4;
mul.lo.s32 %alu_s32_13, %alu_s32_12, %const_s32_11;
mul.lo.s32 %alu_s32_14, %gidx1, %const_s32_10;
mul.lo.s32 %alu_s32_15, %gidx0, %const_s32_9;
add.s32 %alu_s32_16, %alu_s32_15, %alu_s32_14;
add.s32 %alu_s32_17, %alu_s32_16, %alu_s32_13;
add.s32 %alu_s32_18, %alu_s32_17, %alu_s32_11;
add.s32 %alu_s32_19, %alu_s32_18, %alu_s32_10;
mov.b32 %const_s32_13, 1280;
mov.b32 %const_s32_14, 320;
mov.b32 %const_s32_15, 655360;
mov.b32 %const_s32_16, 2560;
mov.b32 %const_s32_17, 163840;
mov.b32 %const_s32_18, 0;
mov.b32 %const_s32_19, 32768;
mov.b32 %const_s32_20, 49152;
mov.b32 %const_s32_21, 0;
mov.b32 %const_s32_22, 3840;
mov.b32 %const_s32_23, 0;
// %alu_s32_29 = element base index of this thread's output in
// data0; scaled by 4 (sizeof f32) to form the pointer once.
mul.lo.s32 %alu_s32_20, %lidx6, %const_s32_17;
mul.lo.s32 %alu_s32_21, %lidx5, %const_s32_10;
mul.lo.s32 %alu_s32_22, %alu_s32_2, %const_s32_16;
mul.lo.s32 %alu_s32_23, %gidx1, %const_s32_15;
add.s32 %alu_s32_24, %alu_s32_15, %alu_s32_23;
add.s32 %alu_s32_25, %alu_s32_24, %alu_s32_22;
add.s32 %alu_s32_26, %alu_s32_25, %alu_s32_13;
add.s32 %alu_s32_27, %alu_s32_26, %alu_s32_11;
add.s32 %alu_s32_28, %alu_s32_27, %alu_s32_21;
add.s32 %alu_s32_29, %alu_s32_28, %alu_s32_20;
mul.lo.s32 %alu_s32_30, %alu_s32_29, %const_s32_8;
cvt.u64.s32 %cast_u64_0, %alu_s32_30;
add.u64 %alu_u64_0, %dat_u64_0, %cast_u64_0;
mov.b32 %const_s32_24, 480;
// Reduce loop counter: %ridx_s32_0 in [0, 1024).
mov.u32 %ridx_s32_0, %const_s32_0;
$loop_0:
// data1 address and 4x 16-byte loads, 16384 B apart (same as A).
mul.lo.s32 %alu_s32_31, %ridx_s32_0, %const_s32_8;
add.s32 %alu_s32_32, %alu_s32_9, %alu_s32_31;
mul.lo.s32 %alu_s32_33, %alu_s32_32, %const_s32_8;
cvt.u64.s32 %cast_u64_1, %alu_s32_33;
add.u64 %alu_u64_1, %dat_u64_1, %cast_u64_1;
ld.global.v4.f32 {%val_f32_0, %val_f32_1, %val_f32_2, %val_f32_3}, [%alu_u64_1+0];
ld.global.v4.f32 {%val_f32_4, %val_f32_5, %val_f32_6, %val_f32_7}, [%alu_u64_1+16384];
ld.global.v4.f32 {%val_f32_8, %val_f32_9, %val_f32_10, %val_f32_11}, [%alu_u64_1+32768];
ld.global.v4.f32 {%val_f32_12, %val_f32_13, %val_f32_14, %val_f32_15}, [%alu_u64_1+49152];
// data2 address and 4x 16-byte loads, 1280 B apart (same as A).
mul.lo.s32 %alu_s32_34, %ridx_s32_0, %const_s32_13;
add.s32 %alu_s32_35, %alu_s32_19, %alu_s32_34;
mul.lo.s32 %alu_s32_36, %alu_s32_35, %const_s32_8;
cvt.u64.s32 %cast_u64_2, %alu_s32_36;
add.u64 %alu_u64_2, %dat_u64_2, %cast_u64_2;
ld.global.v4.f32 {%val_f32_16, %val_f32_17, %val_f32_18, %val_f32_19}, [%alu_u64_2+0];
ld.global.v4.f32 {%val_f32_20, %val_f32_21, %val_f32_22, %val_f32_23}, [%alu_u64_2+1280];
ld.global.v4.f32 {%val_f32_24, %val_f32_25, %val_f32_26, %val_f32_27}, [%alu_u64_2+2560];
ld.global.v4.f32 {%val_f32_28, %val_f32_29, %val_f32_30, %val_f32_31}, [%alu_u64_2+3840];
// The B-specific reduce: each of the 16 partials is a chain of
// 4 fma ops whose FIRST addend is the live accumulator, so every
// chain depends on the previous iteration's result immediately.
fma.rn.f32 %alu_f32_0, %val_f32_0, %val_f32_16, %acc_f32_0;
fma.rn.f32 %alu_f32_1, %val_f32_1, %val_f32_20, %alu_f32_0;
fma.rn.f32 %alu_f32_2, %val_f32_2, %val_f32_24, %alu_f32_1;
fma.rn.f32 %alu_f32_3, %val_f32_3, %val_f32_28, %alu_f32_2;
mov.b32 %acc_f32_0, %alu_f32_3;
fma.rn.f32 %alu_f32_4, %val_f32_0, %val_f32_17, %acc_f32_1;
fma.rn.f32 %alu_f32_5, %val_f32_1, %val_f32_21, %alu_f32_4;
fma.rn.f32 %alu_f32_6, %val_f32_2, %val_f32_25, %alu_f32_5;
fma.rn.f32 %alu_f32_7, %val_f32_3, %val_f32_29, %alu_f32_6;
mov.b32 %acc_f32_1, %alu_f32_7;
fma.rn.f32 %alu_f32_8, %val_f32_0, %val_f32_18, %acc_f32_2;
fma.rn.f32 %alu_f32_9, %val_f32_1, %val_f32_22, %alu_f32_8;
fma.rn.f32 %alu_f32_10, %val_f32_2, %val_f32_26, %alu_f32_9;
fma.rn.f32 %alu_f32_11, %val_f32_3, %val_f32_30, %alu_f32_10;
mov.b32 %acc_f32_2, %alu_f32_11;
fma.rn.f32 %alu_f32_12, %val_f32_0, %val_f32_19, %acc_f32_3;
fma.rn.f32 %alu_f32_13, %val_f32_1, %val_f32_23, %alu_f32_12;
fma.rn.f32 %alu_f32_14, %val_f32_2, %val_f32_27, %alu_f32_13;
fma.rn.f32 %alu_f32_15, %val_f32_3, %val_f32_31, %alu_f32_14;
mov.b32 %acc_f32_3, %alu_f32_15;
fma.rn.f32 %alu_f32_16, %val_f32_4, %val_f32_16, %acc_f32_4;
fma.rn.f32 %alu_f32_17, %val_f32_5, %val_f32_20, %alu_f32_16;
fma.rn.f32 %alu_f32_18, %val_f32_6, %val_f32_24, %alu_f32_17;
fma.rn.f32 %alu_f32_19, %val_f32_7, %val_f32_28, %alu_f32_18;
mov.b32 %acc_f32_4, %alu_f32_19;
fma.rn.f32 %alu_f32_20, %val_f32_4, %val_f32_17, %acc_f32_5;
fma.rn.f32 %alu_f32_21, %val_f32_5, %val_f32_21, %alu_f32_20;
fma.rn.f32 %alu_f32_22, %val_f32_6, %val_f32_25, %alu_f32_21;
fma.rn.f32 %alu_f32_23, %val_f32_7, %val_f32_29, %alu_f32_22;
mov.b32 %acc_f32_5, %alu_f32_23;
fma.rn.f32 %alu_f32_24, %val_f32_4, %val_f32_18, %acc_f32_6;
fma.rn.f32 %alu_f32_25, %val_f32_5, %val_f32_22, %alu_f32_24;
fma.rn.f32 %alu_f32_26, %val_f32_6, %val_f32_26, %alu_f32_25;
fma.rn.f32 %alu_f32_27, %val_f32_7, %val_f32_30, %alu_f32_26;
mov.b32 %acc_f32_6, %alu_f32_27;
fma.rn.f32 %alu_f32_28, %val_f32_4, %val_f32_19, %acc_f32_7;
fma.rn.f32 %alu_f32_29, %val_f32_5, %val_f32_23, %alu_f32_28;
fma.rn.f32 %alu_f32_30, %val_f32_6, %val_f32_27, %alu_f32_29;
fma.rn.f32 %alu_f32_31, %val_f32_7, %val_f32_31, %alu_f32_30;
mov.b32 %acc_f32_7, %alu_f32_31;
fma.rn.f32 %alu_f32_32, %val_f32_8, %val_f32_16, %acc_f32_8;
fma.rn.f32 %alu_f32_33, %val_f32_9, %val_f32_20, %alu_f32_32;
fma.rn.f32 %alu_f32_34, %val_f32_10, %val_f32_24, %alu_f32_33;
fma.rn.f32 %alu_f32_35, %val_f32_11, %val_f32_28, %alu_f32_34;
mov.b32 %acc_f32_8, %alu_f32_35;
fma.rn.f32 %alu_f32_36, %val_f32_8, %val_f32_17, %acc_f32_9;
fma.rn.f32 %alu_f32_37, %val_f32_9, %val_f32_21, %alu_f32_36;
fma.rn.f32 %alu_f32_38, %val_f32_10, %val_f32_25, %alu_f32_37;
fma.rn.f32 %alu_f32_39, %val_f32_11, %val_f32_29, %alu_f32_38;
mov.b32 %acc_f32_9, %alu_f32_39;
fma.rn.f32 %alu_f32_40, %val_f32_8, %val_f32_18, %acc_f32_10;
fma.rn.f32 %alu_f32_41, %val_f32_9, %val_f32_22, %alu_f32_40;
fma.rn.f32 %alu_f32_42, %val_f32_10, %val_f32_26, %alu_f32_41;
fma.rn.f32 %alu_f32_43, %val_f32_11, %val_f32_30, %alu_f32_42;
mov.b32 %acc_f32_10, %alu_f32_43;
fma.rn.f32 %alu_f32_44, %val_f32_8, %val_f32_19, %acc_f32_11;
fma.rn.f32 %alu_f32_45, %val_f32_9, %val_f32_23, %alu_f32_44;
fma.rn.f32 %alu_f32_46, %val_f32_10, %val_f32_27, %alu_f32_45;
fma.rn.f32 %alu_f32_47, %val_f32_11, %val_f32_31, %alu_f32_46;
mov.b32 %acc_f32_11, %alu_f32_47;
fma.rn.f32 %alu_f32_48, %val_f32_12, %val_f32_16, %acc_f32_12;
fma.rn.f32 %alu_f32_49, %val_f32_13, %val_f32_20, %alu_f32_48;
fma.rn.f32 %alu_f32_50, %val_f32_14, %val_f32_24, %alu_f32_49;
fma.rn.f32 %alu_f32_51, %val_f32_15, %val_f32_28, %alu_f32_50;
mov.b32 %acc_f32_12, %alu_f32_51;
fma.rn.f32 %alu_f32_52, %val_f32_12, %val_f32_17, %acc_f32_13;
fma.rn.f32 %alu_f32_53, %val_f32_13, %val_f32_21, %alu_f32_52;
fma.rn.f32 %alu_f32_54, %val_f32_14, %val_f32_25, %alu_f32_53;
fma.rn.f32 %alu_f32_55, %val_f32_15, %val_f32_29, %alu_f32_54;
mov.b32 %acc_f32_13, %alu_f32_55;
fma.rn.f32 %alu_f32_56, %val_f32_12, %val_f32_18, %acc_f32_14;
fma.rn.f32 %alu_f32_57, %val_f32_13, %val_f32_22, %alu_f32_56;
fma.rn.f32 %alu_f32_58, %val_f32_14, %val_f32_26, %alu_f32_57;
fma.rn.f32 %alu_f32_59, %val_f32_15, %val_f32_30, %alu_f32_58;
mov.b32 %acc_f32_14, %alu_f32_59;
fma.rn.f32 %alu_f32_60, %val_f32_12, %val_f32_19, %acc_f32_15;
fma.rn.f32 %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
fma.rn.f32 %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
fma.rn.f32 %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
mov.b32 %acc_f32_15, %alu_f32_63;
// Loop control: branch back while ridx < 1024 (redundant
// complementary branch, as in kernel A).
add.s32 %ridx_s32_0, %ridx_s32_0, 1;
setp.lt.s32 %pred_pred_0, %ridx_s32_0, %const_s32_1;
@%pred_pred_0 bra $loop_0;
@!%pred_pred_0 bra $loop_0_exit;
$loop_0_exit:
// Store the 16 results as four float4 stores, 160 B apart.
st.global.v4.f32 [%alu_u64_0+0], {%acc_f32_0, %acc_f32_1, %acc_f32_2, %acc_f32_3};
st.global.v4.f32 [%alu_u64_0+160], {%acc_f32_4, %acc_f32_5, %acc_f32_6, %acc_f32_7};
st.global.v4.f32 [%alu_u64_0+320], {%acc_f32_8, %acc_f32_9, %acc_f32_10, %acc_f32_11};
st.global.v4.f32 [%alu_u64_0+480], {%acc_f32_12, %acc_f32_13, %acc_f32_14, %acc_f32_15};
ret;
}
The only difference between those kernels is in the reduce inside the loop: kernel B feeds the accumulator directly into the FMA chain, using one more fused multiply-add in place of kernel A's separate mul and final add. Kernel A:
mul.f32 %alu_f32_60, %val_f32_12, %val_f32_19;
fma.rn.f32 %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
fma.rn.f32 %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
fma.rn.f32 %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
add.f32 %alu_f32_79, %alu_f32_63, %acc_f32_15;
mov.b32 %acc_f32_15, %alu_f32_79;
Kernel B:
fma.rn.f32 %alu_f32_60, %val_f32_12, %val_f32_19, %acc_f32_15;
fma.rn.f32 %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
fma.rn.f32 %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
fma.rn.f32 %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
mov.b32 %acc_f32_15, %alu_f32_63;
My intuition told me that it would increase speed, but it made the kernel 2.5 times slower. Profiling with Nsight Compute showed that kernel B has a 50% smaller L1 hit ratio. My initial thought was that the acc registers were being spilled and that the cache misses came from the accumulators being reloaded from local memory, but I checked with nvcc and there are no spills:
nvcc -arch=sm_86 --cubin -o kern kern.ptx --ptxas-options=-v
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'r_2_2_64_5_2_16_4_1024_4_4_4n1' for 'sm_86'
ptxas info : Function properties for r_2_2_64_5_2_16_4_1024_4_4_4n1
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 56 registers, 376 bytes cmem[0]
Looking further into the generated SASS it looks like the first kernel generates a much better load pattern for the data:
Kernel A:
1 00007fe0 23262100 MOV R1, c[0x0][0x28]
2 00007fe0 23262110 I2F.U32.RP R0, 0x5
3 00007fe0 23262120 S2R R5, SR_CTAID.X
4 00007fe0 23262130 MOV R49, RZ
5 00007fe0 23262140 CS2R R20, SRZ
6 00007fe0 23262150 CS2R R22, SRZ
7 00007fe0 23262160 S2R R6, SR_CTAID.Z
8 00007fe0 23262170 CS2R R16, SRZ
9 00007fe0 23262180 CS2R R18, SRZ
10 00007fe0 23262190 CS2R R14, SRZ
11 00007fe0 232621a0 S2R R13, SR_TID.Z
12 00007fe0 232621b0 ULDC.64 UR4, c[0x0][0x118]
13 00007fe0 232621c0 S2R R9, SR_TID.Y
14 00007fe0 232621d0 MUFU.RCP R0, R0
15 00007fe0 232621e0 S2R R11, SR_TID.X
16 00007fe0 232621f0 IABS R4, R5
17 00007fe0 23262200 IADD3 R2, R0, 0xffffffe, RZ
18 00007fe0 23262210 LOP3.LUT R0, R5, 0x5, RZ, 0x3c, !PT
19 00007fe0 23262220 F2I.FTZ.U32.TRUNC.NTZ R3, R2
20 00007fe0 23262230 ISETP.GE.AND P1, PT, R0, RZ, PT
21 00007fe0 23262240 IMAD R0, R6, 0x140000, RZ
22 00007fe0 23262250 MOV R2, RZ
23 00007fe0 23262260 IMAD R7, R3, -0x5, RZ
24 00007fe0 23262270 IMAD.HI.U32 R3, R3, R7, R2
25 00007fe0 23262280 S2R R7, SR_CTAID.Y
26 00007fe0 23262290 IMAD.HI.U32 R3, R3, R4, RZ
27 00007fe0 232622a0 IMAD R4, R3, -0x5, R4
28 00007fe0 232622b0 ISETP.GE.U32.AND P2, PT, R4, 0x5, PT
29 00007fe0 232622c0 IMAD R2, R7, 0xa0000, R0
30 00007fe0 232622d0 LEA R6, R6, R7, 0x1
31 00007fe0 232622e0 IMAD R0, R7, 0xa0, R0
32 00007fe0 232622f0 @P2 IADD3 R4, R4, -0x5, RZ
33 00007fe0 23262300 @P2 IADD3 R3, R3, 0x1, RZ
34 00007fe0 23262310 ISETP.GE.U32.AND P0, PT, R4, 0x5, PT
35 00007fe0 23262320 ISETP.GE.AND P2, PT, R5, RZ, PT
36 00007fe0 23262330 IADD3 R5, R4, -0x5, RZ
37 00007fe0 23262340 SEL R5, R5, R4, P0
38 00007fe0 23262350 @P0 IADD3 R3, R3, 0x1, RZ
39 00007fe0 23262360 @!P2 IADD3 R5, -R5, RZ, RZ
40 00007fe0 23262370 @!P1 IADD3 R3, -R3, RZ, RZ
41 00007fe0 23262380 LEA R0, R5, R0, 0x3
42 00007fe0 23262390 LEA R6, R6, R3, 0x8
43 00007fe0 232623a0 IMAD R2, R3, 0xa00, R2
44 00007fe0 232623b0 LEA R0, R13, R0, 0x2
45 00007fe0 232623c0 LEA R6, R6, R9, 0x4
46 00007fe0 232623d0 LEA R2, R5, R2, 0x3
47 00007fe0 232623e0 IMAD R0, R11, 0x28, R0
48 00007fe0 232623f0 LEA R48, R11, R6, 0xa
49 00007fe0 23262400 LEA R2, R13, R2, 0x2
50 00007fe0 23262410 CS2R R12, SRZ
51 00007fe0 23262420 IMAD R2, R9, 0xa0, R2
52 00007fe0 23262430 CS2R R8, SRZ
53 00007fe0 23262440 IMAD R2, R11, 0x28000, R2
54 00007fe0 23262450 CS2R R10, SRZ
55 00007fe0 23262460 SHF.L.U32 R3, R2, 0x2, RZ
56 00007fe0 23262470 IADD3 R2, P0, R3, c[0x0][0x160], RZ
57 00007fe0 23262480 LEA.HI.X.SX32 R3, R3, c[0x0][0x164], 0x1, P0
58 00007fe0 23262490 LEA R4, R48, R49, 0xc
59 00007fe0 232624a0 IMAD R5, R49, 0x500, R0
60 00007fe0 232624b0 SHF.L.U32 R4, R4, 0x2, RZ
61 00007fe0 232624c0 SHF.L.U32 R5, R5, 0x2, RZ
62 00007fe0 232624d0 SHF.L.U32 R4, R4, 0x2, RZ
63 00007fe0 232624e0 IADD3 R46, P1, R5, c[0x0][0x170], RZ
64 00007fe0 232624f0 IADD3 R44, P0, R4, c[0x0][0x168], RZ
65 00007fe0 23262500 LEA.HI.X.SX32 R47, R5, c[0x0][0x174], 0x1, P1
66 00007fe0 23262510 LEA.HI.X.SX32 R45, R4, c[0x0][0x16c], 0x1, P0
67 00007fe0 23262520 LDG.E.128 R24, [R46.64]
68 00007fe0 23262530 LDG.E.128 R40, [R44.64]
69 00007fe0 23262540 LDG.E.128 R28, [R46.64+0x500]
70 00007fe0 23262550 LDG.E.128 R32, [R46.64+0xa00]
71 00007fe0 23262560 LDG.E.128 R36, [R46.64+0xf00]
72 00007fe0 23262570 FMUL R4, R40, R24
73 00007fe0 23262580 FMUL R6, R40, R25
74 00007fe0 23262590 FMUL R53, R40, R26
75 00007fe0 232625a0 FMUL R40, R40, R27
76 00007fe0 232625b0 FFMA R51, R41, R28, R4
77 00007fe0 232625c0 FFMA R50, R41, R29, R6
78 00007fe0 232625d0 FFMA R53, R41, R30, R53
79 00007fe0 232625e0 LDG.E.128 R4, [R44.64+0x4000]
80 00007fe0 232625f0 FFMA R52, R41, R31, R40
81 00007fe0 23262600 FFMA R40, R42, R32, R51
82 00007fe0 23262610 FFMA R50, R42, R33, R50
83 00007fe0 23262620 FFMA R51, R42, R34, R53
84 00007fe0 23262630 FFMA R52, R42, R35, R52
85 00007fe0 23262640 FFMA R53, R43, R36, R40
86 00007fe0 23262650 FFMA R50, R43, R37, R50
87 00007fe0 23262660 FFMA R51, R43, R38, R51
88 00007fe0 23262670 FFMA R52, R43, R39, R52
89 00007fe0 23262680 LDG.E.128 R40, [R44.64+0x8000]
90 00007fe0 23262690 LDG.E.128 R44, [R44.64+0xc000]
91 00007fe0 232626a0 FADD R20, R53, R20
92 00007fe0 232626b0 FADD R21, R50, R21
93 00007fe0 232626c0 FADD R22, R51, R22
94 00007fe0 232626d0 IADD3 R49, R49, 0x1, RZ
95 00007fe0 232626e0 FADD R23, R52, R23
96 00007fe0 232626f0 ISETP.GE.AND P0, PT, R49, 0x400, PT
97 00007fe0 23262700 FMUL R53, R4, R24
98 00007fe0 23262710 FMUL R50, R4, R25
99 00007fe0 23262720 FMUL R51, R4, R26
100 00007fe0 23262730 FMUL R4, R4, R27
101 00007fe0 23262740 FFMA R53, R5, R28, R53
102 00007fe0 23262750 FFMA R50, R5, R29, R50
103 00007fe0 23262760 FFMA R51, R5, R30, R51
104 00007fe0 23262770 FFMA R4, R5, R31, R4
105 00007fe0 23262780 FFMA R53, R6, R32, R53
106 00007fe0 23262790 FFMA R50, R6, R33, R50
107 00007fe0 232627a0 FFMA R51, R6, R34, R51
108 00007fe0 232627b0 FFMA R4, R6, R35, R4
109 00007fe0 232627c0 FMUL R6, R40, R24
110 00007fe0 232627d0 FFMA R53, R7, R36, R53
111 00007fe0 232627e0 FFMA R50, R7, R37, R50
112 00007fe0 232627f0 FFMA R51, R7, R38, R51
113 00007fe0 23262800 FFMA R4, R7, R39, R4
114 00007fe0 23262810 FFMA R7, R41, R28, R6
115 00007fe0 23262820 FADD R16, R53, R16
116 00007fe0 23262830 FADD R17, R50, R17
117 00007fe0 23262840 FADD R18, R51, R18
118 00007fe0 23262850 FMUL R24, R44, R24
119 00007fe0 23262860 FMUL R6, R44, R25
120 00007fe0 23262870 FADD R19, R4, R19
121 00007fe0 23262880 FFMA R5, R45, R28, R24
122 00007fe0 23262890 FMUL R24, R40, R25
123 00007fe0 232628a0 FMUL R28, R40, R26
124 00007fe0 232628b0 FMUL R40, R40, R27
125 00007fe0 232628c0 FFMA R6, R45, R29, R6
126 00007fe0 232628d0 FFMA R24, R41, R29, R24
127 00007fe0 232628e0 FFMA R25, R41, R30, R28
128 00007fe0 232628f0 FFMA R40, R41, R31, R40
129 00007fe0 23262900 FFMA R28, R42, R32, R7
130 00007fe0 23262910 FFMA R24, R42, R33, R24
131 00007fe0 23262920 FFMA R25, R42, R34, R25
132 00007fe0 23262930 FFMA R40, R42, R35, R40
133 00007fe0 23262940 FMUL R42, R44, R26
134 00007fe0 23262950 FMUL R44, R44, R27
135 00007fe0 23262960 FFMA R32, R46, R32, R5
136 00007fe0 23262970 FFMA R6, R46, R33, R6
137 00007fe0 23262980 FFMA R27, R45, R30, R42
138 00007fe0 23262990 FFMA R44, R45, R31, R44
139 00007fe0 232629a0 FFMA R7, R43, R36, R28
140 00007fe0 232629b0 FFMA R24, R43, R37, R24
141 00007fe0 232629c0 FFMA R27, R46, R34, R27
142 00007fe0 232629d0 FFMA R44, R46, R35, R44
143 00007fe0 232629e0 FFMA R25, R43, R38, R25
144 00007fe0 232629f0 FFMA R26, R43, R39, R40
145 00007fe0 23262a00 FFMA R5, R47, R36, R32
146 00007fe0 23262a10 FFMA R6, R47, R37, R6
147 00007fe0 23262a20 FFMA R27, R47, R38, R27
148 00007fe0 23262a30 FFMA R44, R47, R39, R44
149 00007fe0 23262a40 FADD R8, R7, R8
150 00007fe0 23262a50 FADD R9, R24, R9
151 00007fe0 23262a60 FADD R10, R25, R10
152 00007fe0 23262a70 FADD R11, R26, R11
153 00007fe0 23262a80 FADD R12, R5, R12
154 00007fe0 23262a90 FADD R13, R6, R13
155 00007fe0 23262aa0 FADD R14, R27, R14
156 00007fe0 23262ab0 FADD R15, R44, R15
157 00007fe0 23262ac0 @!P0 BRA 0x7fe023262490
158 00007fe0 23262ad0 STG.E.128 [R2.64], R20
159 00007fe0 23262ae0 STG.E.128 [R2.64+0xa0], R16
160 00007fe0 23262af0 STG.E.128 [R2.64+0x140], R8
161 00007fe0 23262b00 STG.E.128 [R2.64+0x1e0], R12
162 00007fe0 23262b10 EXIT
Kernel B:
1 00007fe0 23260e00 MOV R1, c[0x0][0x28]
2 00007fe0 23260e10 I2F.U32.RP R0, 0x5
3 00007fe0 23260e20 S2R R5, SR_CTAID.X
4 00007fe0 23260e30 MOV R53, RZ
5 00007fe0 23260e40 CS2R R12, SRZ
6 00007fe0 23260e50 CS2R R14, SRZ
7 00007fe0 23260e60 S2R R6, SR_CTAID.Z
8 00007fe0 23260e70 CS2R R16, SRZ
9 00007fe0 23260e80 CS2R R18, SRZ
10 00007fe0 23260e90 S2R R9, SR_CTAID.Y
11 00007fe0 23260ea0 S2R R8, SR_TID.Z
12 00007fe0 23260eb0 MUFU.RCP R0, R0
13 00007fe0 23260ec0 S2R R10, SR_TID.Y
14 00007fe0 23260ed0 S2R R11, SR_TID.X
15 00007fe0 23260ee0 IABS R4, R5
16 00007fe0 23260ef0 IADD3 R2, R0, 0xffffffe, RZ
17 00007fe0 23260f00 LOP3.LUT R0, R5, 0x5, RZ, 0x3c, !PT
18 00007fe0 23260f10 F2I.FTZ.U32.TRUNC.NTZ R3, R2
19 00007fe0 23260f20 ISETP.GE.AND P1, PT, R0, RZ, PT
20 00007fe0 23260f30 IMAD R0, R6, 0x140000, RZ
21 00007fe0 23260f40 LEA R6, R6, R9, 0x1
22 00007fe0 23260f50 MOV R2, RZ
23 00007fe0 23260f60 IMAD R7, R3, -0x5, RZ
24 00007fe0 23260f70 IMAD.HI.U32 R3, R3, R7, R2
25 00007fe0 23260f80 IMAD R2, R9, 0xa0000, R0
26 00007fe0 23260f90 IMAD.HI.U32 R3, R3, R4, RZ
27 00007fe0 23260fa0 IMAD R4, R3, -0x5, R4
28 00007fe0 23260fb0 ISETP.GE.U32.AND P2, PT, R4, 0x5, PT
29 00007fe0 23260fc0 @P2 IADD3 R4, R4, -0x5, RZ
30 00007fe0 23260fd0 @P2 IADD3 R3, R3, 0x1, RZ
31 00007fe0 23260fe0 ISETP.GE.U32.AND P0, PT, R4, 0x5, PT
32 00007fe0 23260ff0 ISETP.GE.AND P2, PT, R5, RZ, PT
33 00007fe0 23261000 IADD3 R5, R4, -0x5, RZ
34 00007fe0 23261010 SEL R4, R5, R4, P0
35 00007fe0 23261020 IMAD R5, R9, 0xa0, R0
36 00007fe0 23261030 @P0 IADD3 R3, R3, 0x1, RZ
37 00007fe0 23261040 @!P2 IADD3 R4, -R4, RZ, RZ
38 00007fe0 23261050 @!P1 IADD3 R3, -R3, RZ, RZ
39 00007fe0 23261060 LEA R5, R4, R5, 0x3
40 00007fe0 23261070 IMAD R7, R3, 0xa00, R2
41 00007fe0 23261080 LEA R3, R6, R3, 0x8
42 00007fe0 23261090 LEA R0, R8, R5, 0x2
43 00007fe0 232610a0 LEA R7, R4, R7, 0x3
44 00007fe0 232610b0 LEA R52, R3, R10, 0x4
45 00007fe0 232610c0 IMAD R0, R11, 0x28, R0
46 00007fe0 232610d0 LEA R7, R8, R7, 0x2
47 00007fe0 232610e0 CS2R R4, SRZ
48 00007fe0 232610f0 LEA R52, R11, R52, 0xa
49 00007fe0 23261100 CS2R R8, SRZ
50 00007fe0 23261110 IMAD R7, R10, 0xa0, R7
51 00007fe0 23261120 IMAD R2, R11, 0x28000, R7
52 00007fe0 23261130 CS2R R6, SRZ
53 00007fe0 23261140 CS2R R10, SRZ
54 00007fe0 23261150 SHF.L.U32 R3, R2, 0x2, RZ
55 00007fe0 23261160 IADD3 R2, P0, R3, c[0x0][0x160], RZ
56 00007fe0 23261170 LEA.HI.X.SX32 R3, R3, c[0x0][0x164], 0x1, P0
57 00007fe0 23261180 LEA R20, R52, R53, 0xc
58 00007fe0 23261190 IMAD R21, R53, 0x500, R0
59 00007fe0 232611a0 ULDC.64 UR4, c[0x0][0x118]
60 00007fe0 232611b0 SHF.L.U32 R20, R20, 0x2, RZ
61 00007fe0 232611c0 SHF.L.U32 R21, R21, 0x2, RZ
62 00007fe0 232611d0 SHF.L.U32 R20, R20, 0x2, RZ
63 00007fe0 232611e0 IADD3 R48, P1, R21, c[0x0][0x170], RZ
64 00007fe0 232611f0 IADD3 R50, P0, R20, c[0x0][0x168], RZ
65 00007fe0 23261200 LEA.HI.X.SX32 R49, R21, c[0x0][0x174], 0x1, P1
66 00007fe0 23261210 LEA.HI.X.SX32 R51, R20, c[0x0][0x16c], 0x1, P0
67 00007fe0 23261220 LDG.E.128 R20, [R48.64]
68 00007fe0 23261230 LDG.E.128 R24, [R50.64]
69 00007fe0 23261240 LDG.E.128 R28, [R48.64+0x500]
70 00007fe0 23261250 LDG.E.128 R32, [R48.64+0xa00]
71 00007fe0 23261260 LDG.E.128 R36, [R50.64+0x4000]
72 00007fe0 23261270 LDG.E.128 R40, [R50.64+0x8000]
73 00007fe0 23261280 LDG.E.128 R44, [R50.64+0xc000]
74 00007fe0 23261290 FFMA R4, R24, R20, R4
75 00007fe0 232612a0 FFMA R6, R24, R22, R6
76 00007fe0 232612b0 FFMA R5, R24, R21, R5
77 00007fe0 232612c0 FFMA R24, R24, R23, R7
78 00007fe0 232612d0 FFMA R7, R25, R28, R4
79 00007fe0 232612e0 FFMA R51, R25, R30, R6
80 00007fe0 232612f0 FFMA R4, R26, R32, R7
81 00007fe0 23261300 FFMA R7, R26, R34, R51
82 00007fe0 23261310 LDG.E.128 R48, [R48.64+0xf00]
83 00007fe0 23261320 FFMA R5, R25, R29, R5
84 00007fe0 23261330 FFMA R24, R25, R31, R24
85 00007fe0 23261340 FFMA R6, R26, R33, R5
86 00007fe0 23261350 FFMA R26, R26, R35, R24
87 00007fe0 23261360 FFMA R24, R36, R21, R9
88 00007fe0 23261370 FFMA R9, R36, R22, R10
89 00007fe0 23261380 FFMA R8, R36, R20, R8
90 00007fe0 23261390 FFMA R9, R37, R30, R9
91 00007fe0 232613a0 FFMA R36, R36, R23, R11
92 00007fe0 232613b0 FFMA R11, R38, R34, R9
93 00007fe0 232613c0 FFMA R9, R40, R22, R14
94 00007fe0 232613d0 IADD3 R53, R53, 0x1, RZ
95 00007fe0 232613e0 FFMA R5, R37, R28, R8
96 00007fe0 232613f0 FFMA R10, R37, R29, R24
97 00007fe0 23261400 FFMA R9, R41, R30, R9
98 00007fe0 23261410 FFMA R12, R40, R20, R12
99 00007fe0 23261420 FFMA R24, R40, R21, R13
100 00007fe0 23261430 FFMA R40, R40, R23, R15
101 00007fe0 23261440 ISETP.GE.AND P0, PT, R53, 0x400, PT
102 00007fe0 23261450 FFMA R8, R38, R32, R5
103 00007fe0 23261460 FFMA R15, R42, R34, R9
104 00007fe0 23261470 FFMA R16, R44, R20, R16
105 00007fe0 23261480 FFMA R5, R41, R28, R12
106 00007fe0 23261490 FFMA R20, R44, R21, R17
107 00007fe0 232614a0 FFMA R9, R44, R22, R18
108 00007fe0 232614b0 FFMA R44, R44, R23, R19
109 00007fe0 232614c0 FFMA R14, R41, R29, R24
110 00007fe0 232614d0 FFMA R12, R42, R32, R5
111 00007fe0 232614e0 FFMA R9, R45, R30, R9
112 00007fe0 232614f0 FFMA R36, R37, R31, R36
113 00007fe0 23261500 FFMA R40, R41, R31, R40
114 00007fe0 23261510 FFMA R5, R45, R28, R16
115 00007fe0 23261520 FFMA R20, R45, R29, R20
116 00007fe0 23261530 FFMA R44, R45, R31, R44
117 00007fe0 23261540 FFMA R10, R38, R33, R10
118 00007fe0 23261550 FFMA R14, R42, R33, R14
119 00007fe0 23261560 FFMA R19, R46, R34, R9
120 00007fe0 23261570 FFMA R38, R38, R35, R36
121 00007fe0 23261580 FFMA R42, R42, R35, R40
122 00007fe0 23261590 FFMA R16, R46, R32, R5
123 00007fe0 232615a0 FFMA R20, R46, R33, R20
124 00007fe0 232615b0 FFMA R44, R46, R35, R44
125 00007fe0 232615c0 FFMA R5, R27, R49, R6
126 00007fe0 232615d0 FFMA R9, R39, R49, R10
127 00007fe0 232615e0 FFMA R13, R43, R49, R14
128 00007fe0 232615f0 FFMA R6, R27, R50, R7
129 00007fe0 23261600 FFMA R10, R39, R50, R11
130 00007fe0 23261610 FFMA R14, R43, R50, R15
131 00007fe0 23261620 FFMA R18, R47, R50, R19
132 00007fe0 23261630 FFMA R4, R27, R48, R4
133 00007fe0 23261640 FFMA R7, R27, R51, R26
134 00007fe0 23261650 FFMA R8, R39, R48, R8
135 00007fe0 23261660 FFMA R11, R39, R51, R38
136 00007fe0 23261670 FFMA R12, R43, R48, R12
137 00007fe0 23261680 FFMA R15, R43, R51, R42
138 00007fe0 23261690 FFMA R16, R47, R48, R16
139 00007fe0 232616a0 FFMA R17, R47, R49, R20
140 00007fe0 232616b0 FFMA R19, R47, R51, R44
141 00007fe0 232616c0 @!P0 BRA 0x7fe023261180
142 00007fe0 232616d0 ULDC.64 UR4, c[0x0][0x118]
143 00007fe0 232616e0 STG.E.128 [R2.64], R4
144 00007fe0 232616f0 STG.E.128 [R2.64+0xa0], R8
145 00007fe0 23261700 STG.E.128 [R2.64+0x140], R12
146 00007fe0 23261710 STG.E.128 [R2.64+0x1e0], R16
147 00007fe0 23261720 EXIT
Looking at line 67, where the loads happen in both kernels, it seems the SASS scheduler packed the loads much more tightly together in kernel B, generating much more LG (load/store) throttling.
How can I further debug the reason for the increased miss ratio and worse load patterns for kernel B?
