Finding the reason for an increased L1 miss ratio between two kernels and for the change in SASS load patterns

I’m in the middle of profiling two PTX kernels. Kernel A:

.version 7.5
.target sm_86
.address_size 64
.visible .entry r_2_2_64_5_2_16_4_1024_4_4_4n2(
        .param .u64 data0,
        .param .u64 data1,
        .param .u64 data2
)
{
        .reg            .u64 %dat_u64_<3>;
        .reg            .f32 %acc_f32_<16>;
        .reg            .s32 %const_s32_<25>;
        .reg            .s32 %alu_s32_<37>;
        .reg            .u64 %cast_u64_<3>;
        .reg            .u64 %alu_u64_<3>;
        .reg            .s32 %ridx_s32_<1>;
        .reg            .f32 %val_f32_<32>;
        .reg            .f32 %alu_f32_<80>;
        .reg            .pred %pred_pred_<1>;
        .reg            .u32 %lidx6;
        .reg            .u32 %lidx5;
        .reg            .u32 %lidx4;
        .reg            .u32 %gidx2;
        .reg            .u32 %gidx1;
        .reg            .u32 %gidx0;
        ld.param.u64    %dat_u64_0, [data0+0];
        ld.param.u64    %dat_u64_1, [data1+0];
        ld.param.u64    %dat_u64_2, [data2+0];
        mov.u32         %gidx0, %ctaid.z;
        mov.u32         %gidx1, %ctaid.y;
        mov.u32         %gidx2, %ctaid.x;
        mov.u32         %lidx4, %tid.z;
        mov.u32         %lidx5, %tid.y;
        mov.u32         %lidx6, %tid.x;
        mov.b32         %acc_f32_0, 0f00000000;
        mov.b32         %acc_f32_1, 0f00000000;
        mov.b32         %acc_f32_2, 0f00000000;
        mov.b32         %acc_f32_3, 0f00000000;
        mov.b32         %acc_f32_4, 0f00000000;
        mov.b32         %acc_f32_5, 0f00000000;
        mov.b32         %acc_f32_6, 0f00000000;
        mov.b32         %acc_f32_7, 0f00000000;
        mov.b32         %acc_f32_8, 0f00000000;
        mov.b32         %acc_f32_9, 0f00000000;
        mov.b32         %acc_f32_10, 0f00000000;
        mov.b32         %acc_f32_11, 0f00000000;
        mov.b32         %acc_f32_12, 0f00000000;
        mov.b32         %acc_f32_13, 0f00000000;
        mov.b32         %acc_f32_14, 0f00000000;
        mov.b32         %acc_f32_15, 0f00000000;
        mov.b32         %const_s32_0, 0;
        mov.b32         %const_s32_1, 1024;
        mov.b32         %const_s32_2, 134217728;
        mul.lo.s32      %alu_s32_0, %gidx0, %const_s32_2;
        mov.b32         %const_s32_3, 67108864;
        mul.lo.s32      %alu_s32_1, %gidx1, %const_s32_3;
        add.s32         %alu_s32_2, %alu_s32_0, %alu_s32_1;
        mov.b32         %const_s32_4, 5;
        div.s32         %alu_s32_3, %gidx2, %const_s32_4;
        mov.b32         %const_s32_5, 262144;
        mul.lo.s32      %alu_s32_4, %alu_s32_3, %const_s32_5;
        add.s32         %alu_s32_5, %alu_s32_2, %alu_s32_4;
        mov.b32         %const_s32_6, 16384;
        mul.lo.s32      %alu_s32_6, %lidx5, %const_s32_6;
        add.s32         %alu_s32_7, %alu_s32_5, %alu_s32_6;
        mov.b32         %const_s32_7, 16777216;
        mul.lo.s32      %alu_s32_8, %lidx6, %const_s32_7;
        add.s32         %alu_s32_9, %alu_s32_7, %alu_s32_8;
        mov.b32         %const_s32_8, 4;
        mov.b32         %const_s32_9, 1310720;
        mul.lo.s32      %alu_s32_10, %gidx0, %const_s32_9;
        mov.b32         %const_s32_10, 160;
        mul.lo.s32      %alu_s32_11, %gidx1, %const_s32_10;
        add.s32         %alu_s32_12, %alu_s32_10, %alu_s32_11;
        rem.s32         %alu_s32_13, %gidx2, %const_s32_4;
        mov.b32         %const_s32_11, 8;
        mul.lo.s32      %alu_s32_14, %alu_s32_13, %const_s32_11;
        add.s32         %alu_s32_15, %alu_s32_12, %alu_s32_14;
        mul.lo.s32      %alu_s32_16, %lidx4, %const_s32_8;
        add.s32         %alu_s32_17, %alu_s32_15, %alu_s32_16;
        mov.b32         %const_s32_12, 40;
        mul.lo.s32      %alu_s32_18, %lidx6, %const_s32_12;
        add.s32         %alu_s32_19, %alu_s32_17, %alu_s32_18;
        mov.b32         %const_s32_13, 1280;
        mov.b32         %const_s32_14, 320;
        mov.b32         %const_s32_15, 655360;
        mul.lo.s32      %alu_s32_20, %gidx1, %const_s32_15;
        add.s32         %alu_s32_21, %alu_s32_10, %alu_s32_20;
        mov.b32         %const_s32_16, 2560;
        mul.lo.s32      %alu_s32_22, %alu_s32_3, %const_s32_16;
        add.s32         %alu_s32_23, %alu_s32_21, %alu_s32_22;
        add.s32         %alu_s32_24, %alu_s32_23, %alu_s32_14;
        add.s32         %alu_s32_25, %alu_s32_24, %alu_s32_16;
        mul.lo.s32      %alu_s32_26, %lidx5, %const_s32_10;
        add.s32         %alu_s32_27, %alu_s32_25, %alu_s32_26;
        mov.b32         %const_s32_17, 163840;
        mul.lo.s32      %alu_s32_28, %lidx6, %const_s32_17;
        add.s32         %alu_s32_29, %alu_s32_27, %alu_s32_28;
        mov.b32         %const_s32_18, 0;
        mov.b32         %const_s32_19, 32768;
        mov.b32         %const_s32_20, 49152;
        mov.b32         %const_s32_21, 0;
        mov.b32         %const_s32_22, 3840;
        mul.lo.s32      %alu_s32_30, %alu_s32_29, %const_s32_8;
        mov.b32         %const_s32_23, 0;
        cvt.u64.s32     %cast_u64_0, %alu_s32_30;
        add.u64         %alu_u64_0, %dat_u64_0, %cast_u64_0;
        mov.b32         %const_s32_24, 480;
        mov.u32         %ridx_s32_0, %const_s32_0;
$loop_0:
        mul.lo.s32      %alu_s32_31, %ridx_s32_0, %const_s32_8;
        add.s32         %alu_s32_32, %alu_s32_9, %alu_s32_31;
        mul.lo.s32      %alu_s32_33, %alu_s32_32, %const_s32_8;
        cvt.u64.s32     %cast_u64_1, %alu_s32_33;
        add.u64         %alu_u64_1, %dat_u64_1, %cast_u64_1;
                        ld.global.v4.f32 {%val_f32_0, %val_f32_1, %val_f32_2, %val_f32_3}, [%alu_u64_1+0];
                        ld.global.v4.f32 {%val_f32_4, %val_f32_5, %val_f32_6, %val_f32_7}, [%alu_u64_1+16384];
                        ld.global.v4.f32 {%val_f32_8, %val_f32_9, %val_f32_10, %val_f32_11}, [%alu_u64_1+32768];
                        ld.global.v4.f32 {%val_f32_12, %val_f32_13, %val_f32_14, %val_f32_15}, [%alu_u64_1+49152];
        mul.lo.s32      %alu_s32_34, %ridx_s32_0, %const_s32_13;
        add.s32         %alu_s32_35, %alu_s32_19, %alu_s32_34;
        mul.lo.s32      %alu_s32_36, %alu_s32_35, %const_s32_8;
        cvt.u64.s32     %cast_u64_2, %alu_s32_36;
        add.u64         %alu_u64_2, %dat_u64_2, %cast_u64_2;
                        ld.global.v4.f32 {%val_f32_16, %val_f32_17, %val_f32_18, %val_f32_19}, [%alu_u64_2+0];
                        ld.global.v4.f32 {%val_f32_20, %val_f32_21, %val_f32_22, %val_f32_23}, [%alu_u64_2+1280];
                        ld.global.v4.f32 {%val_f32_24, %val_f32_25, %val_f32_26, %val_f32_27}, [%alu_u64_2+2560];
                        ld.global.v4.f32 {%val_f32_28, %val_f32_29, %val_f32_30, %val_f32_31}, [%alu_u64_2+3840];
        mul.f32         %alu_f32_0, %val_f32_0, %val_f32_16;
        fma.rn.f32      %alu_f32_1, %val_f32_1, %val_f32_20, %alu_f32_0;
        fma.rn.f32      %alu_f32_2, %val_f32_2, %val_f32_24, %alu_f32_1;
        fma.rn.f32      %alu_f32_3, %val_f32_3, %val_f32_28, %alu_f32_2;
        mul.f32         %alu_f32_4, %val_f32_0, %val_f32_17;
        fma.rn.f32      %alu_f32_5, %val_f32_1, %val_f32_21, %alu_f32_4;
        fma.rn.f32      %alu_f32_6, %val_f32_2, %val_f32_25, %alu_f32_5;
        fma.rn.f32      %alu_f32_7, %val_f32_3, %val_f32_29, %alu_f32_6;
        mul.f32         %alu_f32_8, %val_f32_0, %val_f32_18;
        fma.rn.f32      %alu_f32_9, %val_f32_1, %val_f32_22, %alu_f32_8;
        fma.rn.f32      %alu_f32_10, %val_f32_2, %val_f32_26, %alu_f32_9;
        fma.rn.f32      %alu_f32_11, %val_f32_3, %val_f32_30, %alu_f32_10;
        mul.f32         %alu_f32_12, %val_f32_0, %val_f32_19;
        fma.rn.f32      %alu_f32_13, %val_f32_1, %val_f32_23, %alu_f32_12;
        fma.rn.f32      %alu_f32_14, %val_f32_2, %val_f32_27, %alu_f32_13;
        fma.rn.f32      %alu_f32_15, %val_f32_3, %val_f32_31, %alu_f32_14;
        mul.f32         %alu_f32_16, %val_f32_4, %val_f32_16;
        fma.rn.f32      %alu_f32_17, %val_f32_5, %val_f32_20, %alu_f32_16;
        fma.rn.f32      %alu_f32_18, %val_f32_6, %val_f32_24, %alu_f32_17;
        fma.rn.f32      %alu_f32_19, %val_f32_7, %val_f32_28, %alu_f32_18;
        mul.f32         %alu_f32_20, %val_f32_4, %val_f32_17;
        fma.rn.f32      %alu_f32_21, %val_f32_5, %val_f32_21, %alu_f32_20;
        fma.rn.f32      %alu_f32_22, %val_f32_6, %val_f32_25, %alu_f32_21;
        fma.rn.f32      %alu_f32_23, %val_f32_7, %val_f32_29, %alu_f32_22;
        mul.f32         %alu_f32_24, %val_f32_4, %val_f32_18;
        fma.rn.f32      %alu_f32_25, %val_f32_5, %val_f32_22, %alu_f32_24;
        fma.rn.f32      %alu_f32_26, %val_f32_6, %val_f32_26, %alu_f32_25;
        fma.rn.f32      %alu_f32_27, %val_f32_7, %val_f32_30, %alu_f32_26;
        mul.f32         %alu_f32_28, %val_f32_4, %val_f32_19;
        fma.rn.f32      %alu_f32_29, %val_f32_5, %val_f32_23, %alu_f32_28;
        fma.rn.f32      %alu_f32_30, %val_f32_6, %val_f32_27, %alu_f32_29;
        fma.rn.f32      %alu_f32_31, %val_f32_7, %val_f32_31, %alu_f32_30;
        mul.f32         %alu_f32_32, %val_f32_8, %val_f32_16;
        fma.rn.f32      %alu_f32_33, %val_f32_9, %val_f32_20, %alu_f32_32;
        fma.rn.f32      %alu_f32_34, %val_f32_10, %val_f32_24, %alu_f32_33;
        fma.rn.f32      %alu_f32_35, %val_f32_11, %val_f32_28, %alu_f32_34;
        mul.f32         %alu_f32_36, %val_f32_8, %val_f32_17;
        fma.rn.f32      %alu_f32_37, %val_f32_9, %val_f32_21, %alu_f32_36;
        fma.rn.f32      %alu_f32_38, %val_f32_10, %val_f32_25, %alu_f32_37;
        fma.rn.f32      %alu_f32_39, %val_f32_11, %val_f32_29, %alu_f32_38;
        mul.f32         %alu_f32_40, %val_f32_8, %val_f32_18;
        fma.rn.f32      %alu_f32_41, %val_f32_9, %val_f32_22, %alu_f32_40;
        fma.rn.f32      %alu_f32_42, %val_f32_10, %val_f32_26, %alu_f32_41;
        fma.rn.f32      %alu_f32_43, %val_f32_11, %val_f32_30, %alu_f32_42;
        mul.f32         %alu_f32_44, %val_f32_8, %val_f32_19;
        fma.rn.f32      %alu_f32_45, %val_f32_9, %val_f32_23, %alu_f32_44;
        fma.rn.f32      %alu_f32_46, %val_f32_10, %val_f32_27, %alu_f32_45;
        fma.rn.f32      %alu_f32_47, %val_f32_11, %val_f32_31, %alu_f32_46;
        mul.f32         %alu_f32_48, %val_f32_12, %val_f32_16;
        fma.rn.f32      %alu_f32_49, %val_f32_13, %val_f32_20, %alu_f32_48;
        fma.rn.f32      %alu_f32_50, %val_f32_14, %val_f32_24, %alu_f32_49;
        fma.rn.f32      %alu_f32_51, %val_f32_15, %val_f32_28, %alu_f32_50;
        mul.f32         %alu_f32_52, %val_f32_12, %val_f32_17;
        fma.rn.f32      %alu_f32_53, %val_f32_13, %val_f32_21, %alu_f32_52;
        fma.rn.f32      %alu_f32_54, %val_f32_14, %val_f32_25, %alu_f32_53;
        fma.rn.f32      %alu_f32_55, %val_f32_15, %val_f32_29, %alu_f32_54;
        mul.f32         %alu_f32_56, %val_f32_12, %val_f32_18;
        fma.rn.f32      %alu_f32_57, %val_f32_13, %val_f32_22, %alu_f32_56;
        fma.rn.f32      %alu_f32_58, %val_f32_14, %val_f32_26, %alu_f32_57;
        fma.rn.f32      %alu_f32_59, %val_f32_15, %val_f32_30, %alu_f32_58;
        mul.f32         %alu_f32_60, %val_f32_12, %val_f32_19;
        fma.rn.f32      %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
        fma.rn.f32      %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
        fma.rn.f32      %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
        add.f32         %alu_f32_64, %alu_f32_3, %acc_f32_0;
        mov.b32         %acc_f32_0, %alu_f32_64;
        add.f32         %alu_f32_65, %alu_f32_7, %acc_f32_1;
        mov.b32         %acc_f32_1, %alu_f32_65;
        add.f32         %alu_f32_66, %alu_f32_11, %acc_f32_2;
        mov.b32         %acc_f32_2, %alu_f32_66;
        add.f32         %alu_f32_67, %alu_f32_15, %acc_f32_3;
        mov.b32         %acc_f32_3, %alu_f32_67;
        add.f32         %alu_f32_68, %alu_f32_19, %acc_f32_4;
        mov.b32         %acc_f32_4, %alu_f32_68;
        add.f32         %alu_f32_69, %alu_f32_23, %acc_f32_5;
        mov.b32         %acc_f32_5, %alu_f32_69;
        add.f32         %alu_f32_70, %alu_f32_27, %acc_f32_6;
        mov.b32         %acc_f32_6, %alu_f32_70;
        add.f32         %alu_f32_71, %alu_f32_31, %acc_f32_7;
        mov.b32         %acc_f32_7, %alu_f32_71;
        add.f32         %alu_f32_72, %alu_f32_35, %acc_f32_8;
        mov.b32         %acc_f32_8, %alu_f32_72;
        add.f32         %alu_f32_73, %alu_f32_39, %acc_f32_9;
        mov.b32         %acc_f32_9, %alu_f32_73;
        add.f32         %alu_f32_74, %alu_f32_43, %acc_f32_10;
        mov.b32         %acc_f32_10, %alu_f32_74;
        add.f32         %alu_f32_75, %alu_f32_47, %acc_f32_11;
        mov.b32         %acc_f32_11, %alu_f32_75;
        add.f32         %alu_f32_76, %alu_f32_51, %acc_f32_12;
        mov.b32         %acc_f32_12, %alu_f32_76;
        add.f32         %alu_f32_77, %alu_f32_55, %acc_f32_13;
        mov.b32         %acc_f32_13, %alu_f32_77;
        add.f32         %alu_f32_78, %alu_f32_59, %acc_f32_14;
        mov.b32         %acc_f32_14, %alu_f32_78;
        add.f32         %alu_f32_79, %alu_f32_63, %acc_f32_15;
        mov.b32         %acc_f32_15, %alu_f32_79;
        add.s32         %ridx_s32_0, %ridx_s32_0, 1;
        setp.lt.s32     %pred_pred_0, %ridx_s32_0, %const_s32_1;
        @%pred_pred_0   bra $loop_0;
        @!%pred_pred_0  bra $loop_0_exit;
$loop_0_exit:
        st.global.v4.f32        [%alu_u64_0+0], {%acc_f32_0, %acc_f32_1, %acc_f32_2, %acc_f32_3};
        st.global.v4.f32        [%alu_u64_0+160], {%acc_f32_4, %acc_f32_5, %acc_f32_6, %acc_f32_7};
        st.global.v4.f32        [%alu_u64_0+320], {%acc_f32_8, %acc_f32_9, %acc_f32_10, %acc_f32_11};
        st.global.v4.f32        [%alu_u64_0+480], {%acc_f32_12, %acc_f32_13, %acc_f32_14, %acc_f32_15};
        ret;
}

and kernel B:

.version 7.5
.target sm_86
.address_size 64
.visible .entry r_2_2_64_5_2_16_4_1024_4_4_4n1(
	.param .u64 data0,
	.param .u64 data1,
	.param .u64 data2
)
{
	.reg		.u64 %dat_u64_<3>;
	.reg		.f32 %acc_f32_<16>;
	.reg		.s32 %const_s32_<25>;
	.reg		.s32 %alu_s32_<37>;
	.reg		.u64 %cast_u64_<3>;
	.reg		.u64 %alu_u64_<3>;
	.reg		.s32 %ridx_s32_<1>;
	.reg		.f32 %val_f32_<32>;
	.reg		.f32 %alu_f32_<64>;
	.reg		.pred %pred_pred_<1>;
	.reg		.u32 %lidx6;
	.reg		.u32 %lidx5;
	.reg		.u32 %lidx4;
	.reg		.u32 %gidx2;
	.reg		.u32 %gidx1;
	.reg		.u32 %gidx0;
	ld.param.u64	%dat_u64_0, [data0+0];
	ld.param.u64	%dat_u64_1, [data1+0];
	ld.param.u64	%dat_u64_2, [data2+0];
	mov.u32		%gidx0, %ctaid.z;
	mov.u32		%gidx1, %ctaid.y;
	mov.u32		%gidx2, %ctaid.x;
	mov.u32		%lidx4, %tid.z;
	mov.u32		%lidx5, %tid.y;
	mov.u32		%lidx6, %tid.x;
	mov.b32		%acc_f32_0, 0f00000000;
	mov.b32		%acc_f32_1, 0f00000000;
	mov.b32		%acc_f32_2, 0f00000000;
	mov.b32		%acc_f32_3, 0f00000000;
	mov.b32		%acc_f32_4, 0f00000000;
	mov.b32		%acc_f32_5, 0f00000000;
	mov.b32		%acc_f32_6, 0f00000000;
	mov.b32		%acc_f32_7, 0f00000000;
	mov.b32		%acc_f32_8, 0f00000000;
	mov.b32		%acc_f32_9, 0f00000000;
	mov.b32		%acc_f32_10, 0f00000000;
	mov.b32		%acc_f32_11, 0f00000000;
	mov.b32		%acc_f32_12, 0f00000000;
	mov.b32		%acc_f32_13, 0f00000000;
	mov.b32		%acc_f32_14, 0f00000000;
	mov.b32		%acc_f32_15, 0f00000000;
	mov.b32		%const_s32_0, 0;
	mov.b32		%const_s32_1, 1024;
	mov.b32		%const_s32_2, 134217728;
	mov.b32		%const_s32_3, 67108864;
	mov.b32		%const_s32_4, 5;
	mov.b32		%const_s32_5, 262144;
	mov.b32		%const_s32_6, 16384;
	mov.b32		%const_s32_7, 16777216;
	mul.lo.s32	%alu_s32_0, %lidx6, %const_s32_7;
	mul.lo.s32	%alu_s32_1, %lidx5, %const_s32_6;
	div.s32		%alu_s32_2, %gidx2, %const_s32_4;
	mul.lo.s32	%alu_s32_3, %alu_s32_2, %const_s32_5;
	mul.lo.s32	%alu_s32_4, %gidx1, %const_s32_3;
	mul.lo.s32	%alu_s32_5, %gidx0, %const_s32_2;
	add.s32		%alu_s32_6, %alu_s32_5, %alu_s32_4;
	add.s32		%alu_s32_7, %alu_s32_6, %alu_s32_3;
	add.s32		%alu_s32_8, %alu_s32_7, %alu_s32_1;
	add.s32		%alu_s32_9, %alu_s32_8, %alu_s32_0;
	mov.b32		%const_s32_8, 4;
	mov.b32		%const_s32_9, 1310720;
	mov.b32		%const_s32_10, 160;
	mov.b32		%const_s32_11, 8;
	mov.b32		%const_s32_12, 40;
	mul.lo.s32	%alu_s32_10, %lidx6, %const_s32_12;
	mul.lo.s32	%alu_s32_11, %lidx4, %const_s32_8;
	rem.s32		%alu_s32_12, %gidx2, %const_s32_4;
	mul.lo.s32	%alu_s32_13, %alu_s32_12, %const_s32_11;
	mul.lo.s32	%alu_s32_14, %gidx1, %const_s32_10;
	mul.lo.s32	%alu_s32_15, %gidx0, %const_s32_9;
	add.s32		%alu_s32_16, %alu_s32_15, %alu_s32_14;
	add.s32		%alu_s32_17, %alu_s32_16, %alu_s32_13;
	add.s32		%alu_s32_18, %alu_s32_17, %alu_s32_11;
	add.s32		%alu_s32_19, %alu_s32_18, %alu_s32_10;
	mov.b32		%const_s32_13, 1280;
	mov.b32		%const_s32_14, 320;
	mov.b32		%const_s32_15, 655360;
	mov.b32		%const_s32_16, 2560;
	mov.b32		%const_s32_17, 163840;
	mov.b32		%const_s32_18, 0;
	mov.b32		%const_s32_19, 32768;
	mov.b32		%const_s32_20, 49152;
	mov.b32		%const_s32_21, 0;
	mov.b32		%const_s32_22, 3840;
	mov.b32		%const_s32_23, 0;
	mul.lo.s32	%alu_s32_20, %lidx6, %const_s32_17;
	mul.lo.s32	%alu_s32_21, %lidx5, %const_s32_10;
	mul.lo.s32	%alu_s32_22, %alu_s32_2, %const_s32_16;
	mul.lo.s32	%alu_s32_23, %gidx1, %const_s32_15;
	add.s32		%alu_s32_24, %alu_s32_15, %alu_s32_23;
	add.s32		%alu_s32_25, %alu_s32_24, %alu_s32_22;
	add.s32		%alu_s32_26, %alu_s32_25, %alu_s32_13;
	add.s32		%alu_s32_27, %alu_s32_26, %alu_s32_11;
	add.s32		%alu_s32_28, %alu_s32_27, %alu_s32_21;
	add.s32		%alu_s32_29, %alu_s32_28, %alu_s32_20;
	mul.lo.s32	%alu_s32_30, %alu_s32_29, %const_s32_8;
	cvt.u64.s32	%cast_u64_0, %alu_s32_30;
	add.u64		%alu_u64_0, %dat_u64_0, %cast_u64_0;
	mov.b32		%const_s32_24, 480;
	mov.u32		%ridx_s32_0, %const_s32_0;
$loop_0:
	mul.lo.s32	%alu_s32_31, %ridx_s32_0, %const_s32_8;
	add.s32		%alu_s32_32, %alu_s32_9, %alu_s32_31;
	mul.lo.s32	%alu_s32_33, %alu_s32_32, %const_s32_8;
	cvt.u64.s32	%cast_u64_1, %alu_s32_33;
	add.u64		%alu_u64_1, %dat_u64_1, %cast_u64_1;
			ld.global.v4.f32 {%val_f32_0, %val_f32_1, %val_f32_2, %val_f32_3}, [%alu_u64_1+0];
			ld.global.v4.f32 {%val_f32_4, %val_f32_5, %val_f32_6, %val_f32_7}, [%alu_u64_1+16384];
			ld.global.v4.f32 {%val_f32_8, %val_f32_9, %val_f32_10, %val_f32_11}, [%alu_u64_1+32768];
			ld.global.v4.f32 {%val_f32_12, %val_f32_13, %val_f32_14, %val_f32_15}, [%alu_u64_1+49152];
	mul.lo.s32	%alu_s32_34, %ridx_s32_0, %const_s32_13;
	add.s32		%alu_s32_35, %alu_s32_19, %alu_s32_34;
	mul.lo.s32	%alu_s32_36, %alu_s32_35, %const_s32_8;
	cvt.u64.s32	%cast_u64_2, %alu_s32_36;
	add.u64		%alu_u64_2, %dat_u64_2, %cast_u64_2;
			ld.global.v4.f32 {%val_f32_16, %val_f32_17, %val_f32_18, %val_f32_19}, [%alu_u64_2+0];
			ld.global.v4.f32 {%val_f32_20, %val_f32_21, %val_f32_22, %val_f32_23}, [%alu_u64_2+1280];
			ld.global.v4.f32 {%val_f32_24, %val_f32_25, %val_f32_26, %val_f32_27}, [%alu_u64_2+2560];
			ld.global.v4.f32 {%val_f32_28, %val_f32_29, %val_f32_30, %val_f32_31}, [%alu_u64_2+3840];
	fma.rn.f32	%alu_f32_0, %val_f32_0, %val_f32_16, %acc_f32_0;
	fma.rn.f32	%alu_f32_1, %val_f32_1, %val_f32_20, %alu_f32_0;
	fma.rn.f32	%alu_f32_2, %val_f32_2, %val_f32_24, %alu_f32_1;
	fma.rn.f32	%alu_f32_3, %val_f32_3, %val_f32_28, %alu_f32_2;
	mov.b32		%acc_f32_0, %alu_f32_3;
	fma.rn.f32	%alu_f32_4, %val_f32_0, %val_f32_17, %acc_f32_1;
	fma.rn.f32	%alu_f32_5, %val_f32_1, %val_f32_21, %alu_f32_4;
	fma.rn.f32	%alu_f32_6, %val_f32_2, %val_f32_25, %alu_f32_5;
	fma.rn.f32	%alu_f32_7, %val_f32_3, %val_f32_29, %alu_f32_6;
	mov.b32		%acc_f32_1, %alu_f32_7;
	fma.rn.f32	%alu_f32_8, %val_f32_0, %val_f32_18, %acc_f32_2;
	fma.rn.f32	%alu_f32_9, %val_f32_1, %val_f32_22, %alu_f32_8;
	fma.rn.f32	%alu_f32_10, %val_f32_2, %val_f32_26, %alu_f32_9;
	fma.rn.f32	%alu_f32_11, %val_f32_3, %val_f32_30, %alu_f32_10;
	mov.b32		%acc_f32_2, %alu_f32_11;
	fma.rn.f32	%alu_f32_12, %val_f32_0, %val_f32_19, %acc_f32_3;
	fma.rn.f32	%alu_f32_13, %val_f32_1, %val_f32_23, %alu_f32_12;
	fma.rn.f32	%alu_f32_14, %val_f32_2, %val_f32_27, %alu_f32_13;
	fma.rn.f32	%alu_f32_15, %val_f32_3, %val_f32_31, %alu_f32_14;
	mov.b32		%acc_f32_3, %alu_f32_15;
	fma.rn.f32	%alu_f32_16, %val_f32_4, %val_f32_16, %acc_f32_4;
	fma.rn.f32	%alu_f32_17, %val_f32_5, %val_f32_20, %alu_f32_16;
	fma.rn.f32	%alu_f32_18, %val_f32_6, %val_f32_24, %alu_f32_17;
	fma.rn.f32	%alu_f32_19, %val_f32_7, %val_f32_28, %alu_f32_18;
	mov.b32		%acc_f32_4, %alu_f32_19;
	fma.rn.f32	%alu_f32_20, %val_f32_4, %val_f32_17, %acc_f32_5;
	fma.rn.f32	%alu_f32_21, %val_f32_5, %val_f32_21, %alu_f32_20;
	fma.rn.f32	%alu_f32_22, %val_f32_6, %val_f32_25, %alu_f32_21;
	fma.rn.f32	%alu_f32_23, %val_f32_7, %val_f32_29, %alu_f32_22;
	mov.b32		%acc_f32_5, %alu_f32_23;
	fma.rn.f32	%alu_f32_24, %val_f32_4, %val_f32_18, %acc_f32_6;
	fma.rn.f32	%alu_f32_25, %val_f32_5, %val_f32_22, %alu_f32_24;
	fma.rn.f32	%alu_f32_26, %val_f32_6, %val_f32_26, %alu_f32_25;
	fma.rn.f32	%alu_f32_27, %val_f32_7, %val_f32_30, %alu_f32_26;
	mov.b32		%acc_f32_6, %alu_f32_27;
	fma.rn.f32	%alu_f32_28, %val_f32_4, %val_f32_19, %acc_f32_7;
	fma.rn.f32	%alu_f32_29, %val_f32_5, %val_f32_23, %alu_f32_28;
	fma.rn.f32	%alu_f32_30, %val_f32_6, %val_f32_27, %alu_f32_29;
	fma.rn.f32	%alu_f32_31, %val_f32_7, %val_f32_31, %alu_f32_30;
	mov.b32		%acc_f32_7, %alu_f32_31;
	fma.rn.f32	%alu_f32_32, %val_f32_8, %val_f32_16, %acc_f32_8;
	fma.rn.f32	%alu_f32_33, %val_f32_9, %val_f32_20, %alu_f32_32;
	fma.rn.f32	%alu_f32_34, %val_f32_10, %val_f32_24, %alu_f32_33;
	fma.rn.f32	%alu_f32_35, %val_f32_11, %val_f32_28, %alu_f32_34;
	mov.b32		%acc_f32_8, %alu_f32_35;
	fma.rn.f32	%alu_f32_36, %val_f32_8, %val_f32_17, %acc_f32_9;
	fma.rn.f32	%alu_f32_37, %val_f32_9, %val_f32_21, %alu_f32_36;
	fma.rn.f32	%alu_f32_38, %val_f32_10, %val_f32_25, %alu_f32_37;
	fma.rn.f32	%alu_f32_39, %val_f32_11, %val_f32_29, %alu_f32_38;
	mov.b32		%acc_f32_9, %alu_f32_39;
	fma.rn.f32	%alu_f32_40, %val_f32_8, %val_f32_18, %acc_f32_10;
	fma.rn.f32	%alu_f32_41, %val_f32_9, %val_f32_22, %alu_f32_40;
	fma.rn.f32	%alu_f32_42, %val_f32_10, %val_f32_26, %alu_f32_41;
	fma.rn.f32	%alu_f32_43, %val_f32_11, %val_f32_30, %alu_f32_42;
	mov.b32		%acc_f32_10, %alu_f32_43;
	fma.rn.f32	%alu_f32_44, %val_f32_8, %val_f32_19, %acc_f32_11;
	fma.rn.f32	%alu_f32_45, %val_f32_9, %val_f32_23, %alu_f32_44;
	fma.rn.f32	%alu_f32_46, %val_f32_10, %val_f32_27, %alu_f32_45;
	fma.rn.f32	%alu_f32_47, %val_f32_11, %val_f32_31, %alu_f32_46;
	mov.b32		%acc_f32_11, %alu_f32_47;
	fma.rn.f32	%alu_f32_48, %val_f32_12, %val_f32_16, %acc_f32_12;
	fma.rn.f32	%alu_f32_49, %val_f32_13, %val_f32_20, %alu_f32_48;
	fma.rn.f32	%alu_f32_50, %val_f32_14, %val_f32_24, %alu_f32_49;
	fma.rn.f32	%alu_f32_51, %val_f32_15, %val_f32_28, %alu_f32_50;
	mov.b32		%acc_f32_12, %alu_f32_51;
	fma.rn.f32	%alu_f32_52, %val_f32_12, %val_f32_17, %acc_f32_13;
	fma.rn.f32	%alu_f32_53, %val_f32_13, %val_f32_21, %alu_f32_52;
	fma.rn.f32	%alu_f32_54, %val_f32_14, %val_f32_25, %alu_f32_53;
	fma.rn.f32	%alu_f32_55, %val_f32_15, %val_f32_29, %alu_f32_54;
	mov.b32		%acc_f32_13, %alu_f32_55;
	fma.rn.f32	%alu_f32_56, %val_f32_12, %val_f32_18, %acc_f32_14;
	fma.rn.f32	%alu_f32_57, %val_f32_13, %val_f32_22, %alu_f32_56;
	fma.rn.f32	%alu_f32_58, %val_f32_14, %val_f32_26, %alu_f32_57;
	fma.rn.f32	%alu_f32_59, %val_f32_15, %val_f32_30, %alu_f32_58;
	mov.b32		%acc_f32_14, %alu_f32_59;
	fma.rn.f32	%alu_f32_60, %val_f32_12, %val_f32_19, %acc_f32_15;
	fma.rn.f32	%alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
	fma.rn.f32	%alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
	fma.rn.f32	%alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
	mov.b32		%acc_f32_15, %alu_f32_63;
	add.s32		%ridx_s32_0, %ridx_s32_0, 1;
	setp.lt.s32	%pred_pred_0, %ridx_s32_0, %const_s32_1;
	@%pred_pred_0	bra $loop_0;
	@!%pred_pred_0	bra $loop_0_exit;
$loop_0_exit:
	st.global.v4.f32	[%alu_u64_0+0], {%acc_f32_0, %acc_f32_1, %acc_f32_2, %acc_f32_3};
	st.global.v4.f32	[%alu_u64_0+160], {%acc_f32_4, %acc_f32_5, %acc_f32_6, %acc_f32_7};
	st.global.v4.f32	[%alu_u64_0+320], {%acc_f32_8, %acc_f32_9, %acc_f32_10, %acc_f32_11};
	st.global.v4.f32	[%alu_u64_0+480], {%acc_f32_12, %acc_f32_13, %acc_f32_14, %acc_f32_15};
	ret;
}

The only difference between the two kernels is that kernel B uses one more fused multiply-add per accumulator (folding the accumulator into the FMA chain instead of using a separate multiply and add) when performing the reduction in the loop. Kernel A:

        mul.f32         %alu_f32_60, %val_f32_12, %val_f32_19;
        fma.rn.f32      %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
        fma.rn.f32      %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
        fma.rn.f32      %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
        add.f32         %alu_f32_79, %alu_f32_63, %acc_f32_15;
        mov.b32         %acc_f32_15, %alu_f32_79;

Kernel B:

        fma.rn.f32      %alu_f32_60, %val_f32_12, %val_f32_19, %acc_f32_15; 
        fma.rn.f32      %alu_f32_61, %val_f32_13, %val_f32_23, %alu_f32_60;
        fma.rn.f32      %alu_f32_62, %val_f32_14, %val_f32_27, %alu_f32_61;
        fma.rn.f32      %alu_f32_63, %val_f32_15, %val_f32_31, %alu_f32_62;
        mov.b32         %acc_f32_15, %alu_f32_63; 
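
In CUDA C terms the difference per accumulator boils down to roughly the following sketch (the function names and the float4 packing are just for illustration; the real kernels are the PTX above):

    __device__ float reduce_step_A(float4 a, float4 b, float acc) {
        // Kernel A: build the 4-term dot product independently of the
        // accumulator and add it once at the end, so the loop-carried
        // dependency through acc is a single FADD.
        float t = a.x * b.x;
        t = fmaf(a.y, b.y, t);
        t = fmaf(a.z, b.z, t);
        t = fmaf(a.w, b.w, t);
        return acc + t;
    }

    __device__ float reduce_step_B(float4 a, float4 b, float acc) {
        // Kernel B: feed the accumulator into the first FMA, so all four
        // FMAs sit on the loop-carried dependency chain through acc.
        acc = fmaf(a.x, b.x, acc);
        acc = fmaf(a.y, b.y, acc);
        acc = fmaf(a.z, b.z, acc);
        acc = fmaf(a.w, b.w, acc);
        return acc;
    }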

My intuition told me that this would increase speed, but it made the kernel 2.5 times slower. Profiling with Nsight Compute showed that kernel B has an L1 hit rate that is about 50% lower. My initial thought was that the accumulator registers were being spilled and that I was getting cache misses because they get reloaded from local memory, but I checked with nvcc and there are no spills:

nvcc -arch=sm_86 --cubin -o kern kern.ptx --ptxas-options=-v
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'r_2_2_64_5_2_16_4_1024_4_4_4n1' for 'sm_86'
ptxas info    : Function properties for r_2_2_64_5_2_16_4_1024_4_4_4n1
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 56 registers, 376 bytes cmem[0]

Looking further into the generated SASS, it looks like the first kernel generates a much better load pattern for the data.
Kernel A:

1	00007fe0 23262100	      MOV R1, c[0x0][0x28] 
2	00007fe0 23262110	      I2F.U32.RP R0, 0x5 
3	00007fe0 23262120	      S2R R5, SR_CTAID.X 
4	00007fe0 23262130	      MOV R49, RZ 
5	00007fe0 23262140	      CS2R R20, SRZ 
6	00007fe0 23262150	      CS2R R22, SRZ 
7	00007fe0 23262160	      S2R R6, SR_CTAID.Z 
8	00007fe0 23262170	      CS2R R16, SRZ 
9	00007fe0 23262180	      CS2R R18, SRZ 
10	00007fe0 23262190	      CS2R R14, SRZ 
11	00007fe0 232621a0	      S2R R13, SR_TID.Z 
12	00007fe0 232621b0	      ULDC.64 UR4, c[0x0][0x118] 
13	00007fe0 232621c0	      S2R R9, SR_TID.Y 
14	00007fe0 232621d0	      MUFU.RCP R0, R0 
15	00007fe0 232621e0	      S2R R11, SR_TID.X 
16	00007fe0 232621f0	      IABS R4, R5 
17	00007fe0 23262200	      IADD3 R2, R0, 0xffffffe, RZ 
18	00007fe0 23262210	      LOP3.LUT R0, R5, 0x5, RZ, 0x3c, !PT 
19	00007fe0 23262220	      F2I.FTZ.U32.TRUNC.NTZ R3, R2 
20	00007fe0 23262230	      ISETP.GE.AND P1, PT, R0, RZ, PT 
21	00007fe0 23262240	      IMAD R0, R6, 0x140000, RZ 
22	00007fe0 23262250	      MOV R2, RZ 
23	00007fe0 23262260	      IMAD R7, R3, -0x5, RZ 
24	00007fe0 23262270	      IMAD.HI.U32 R3, R3, R7, R2 
25	00007fe0 23262280	      S2R R7, SR_CTAID.Y 
26	00007fe0 23262290	      IMAD.HI.U32 R3, R3, R4, RZ 
27	00007fe0 232622a0	      IMAD R4, R3, -0x5, R4 
28	00007fe0 232622b0	      ISETP.GE.U32.AND P2, PT, R4, 0x5, PT 
29	00007fe0 232622c0	      IMAD R2, R7, 0xa0000, R0 
30	00007fe0 232622d0	      LEA R6, R6, R7, 0x1 
31	00007fe0 232622e0	      IMAD R0, R7, 0xa0, R0 
32	00007fe0 232622f0	@P2   IADD3 R4, R4, -0x5, RZ 
33	00007fe0 23262300	@P2   IADD3 R3, R3, 0x1, RZ 
34	00007fe0 23262310	      ISETP.GE.U32.AND P0, PT, R4, 0x5, PT 
35	00007fe0 23262320	      ISETP.GE.AND P2, PT, R5, RZ, PT 
36	00007fe0 23262330	      IADD3 R5, R4, -0x5, RZ 
37	00007fe0 23262340	      SEL R5, R5, R4, P0 
38	00007fe0 23262350	@P0   IADD3 R3, R3, 0x1, RZ 
39	00007fe0 23262360	@!P2  IADD3 R5, -R5, RZ, RZ 
40	00007fe0 23262370	@!P1  IADD3 R3, -R3, RZ, RZ 
41	00007fe0 23262380	      LEA R0, R5, R0, 0x3 
42	00007fe0 23262390	      LEA R6, R6, R3, 0x8 
43	00007fe0 232623a0	      IMAD R2, R3, 0xa00, R2 
44	00007fe0 232623b0	      LEA R0, R13, R0, 0x2 
45	00007fe0 232623c0	      LEA R6, R6, R9, 0x4 
46	00007fe0 232623d0	      LEA R2, R5, R2, 0x3 
47	00007fe0 232623e0	      IMAD R0, R11, 0x28, R0 
48	00007fe0 232623f0	      LEA R48, R11, R6, 0xa 
49	00007fe0 23262400	      LEA R2, R13, R2, 0x2 
50	00007fe0 23262410	      CS2R R12, SRZ 
51	00007fe0 23262420	      IMAD R2, R9, 0xa0, R2 
52	00007fe0 23262430	      CS2R R8, SRZ 
53	00007fe0 23262440	      IMAD R2, R11, 0x28000, R2 
54	00007fe0 23262450	      CS2R R10, SRZ 
55	00007fe0 23262460	      SHF.L.U32 R3, R2, 0x2, RZ 
56	00007fe0 23262470	      IADD3 R2, P0, R3, c[0x0][0x160], RZ 
57	00007fe0 23262480	      LEA.HI.X.SX32 R3, R3, c[0x0][0x164], 0x1, P0 
58	00007fe0 23262490	      LEA R4, R48, R49, 0xc 
59	00007fe0 232624a0	      IMAD R5, R49, 0x500, R0 
60	00007fe0 232624b0	      SHF.L.U32 R4, R4, 0x2, RZ 
61	00007fe0 232624c0	      SHF.L.U32 R5, R5, 0x2, RZ 
62	00007fe0 232624d0	      SHF.L.U32 R4, R4, 0x2, RZ 
63	00007fe0 232624e0	      IADD3 R46, P1, R5, c[0x0][0x170], RZ 
64	00007fe0 232624f0	      IADD3 R44, P0, R4, c[0x0][0x168], RZ 
65	00007fe0 23262500	      LEA.HI.X.SX32 R47, R5, c[0x0][0x174], 0x1, P1 
66	00007fe0 23262510	      LEA.HI.X.SX32 R45, R4, c[0x0][0x16c], 0x1, P0 
67	00007fe0 23262520	      LDG.E.128 R24, [R46.64] 
68	00007fe0 23262530	      LDG.E.128 R40, [R44.64] 
69	00007fe0 23262540	      LDG.E.128 R28, [R46.64+0x500] 
70	00007fe0 23262550	      LDG.E.128 R32, [R46.64+0xa00] 
71	00007fe0 23262560	      LDG.E.128 R36, [R46.64+0xf00] 
72	00007fe0 23262570	      FMUL R4, R40, R24 
73	00007fe0 23262580	      FMUL R6, R40, R25 
74	00007fe0 23262590	      FMUL R53, R40, R26 
75	00007fe0 232625a0	      FMUL R40, R40, R27 
76	00007fe0 232625b0	      FFMA R51, R41, R28, R4 
77	00007fe0 232625c0	      FFMA R50, R41, R29, R6 
78	00007fe0 232625d0	      FFMA R53, R41, R30, R53 
79	00007fe0 232625e0	      LDG.E.128 R4, [R44.64+0x4000] 
80	00007fe0 232625f0	      FFMA R52, R41, R31, R40 
81	00007fe0 23262600	      FFMA R40, R42, R32, R51 
82	00007fe0 23262610	      FFMA R50, R42, R33, R50 
83	00007fe0 23262620	      FFMA R51, R42, R34, R53 
84	00007fe0 23262630	      FFMA R52, R42, R35, R52 
85	00007fe0 23262640	      FFMA R53, R43, R36, R40 
86	00007fe0 23262650	      FFMA R50, R43, R37, R50 
87	00007fe0 23262660	      FFMA R51, R43, R38, R51 
88	00007fe0 23262670	      FFMA R52, R43, R39, R52 
89	00007fe0 23262680	      LDG.E.128 R40, [R44.64+0x8000] 
90	00007fe0 23262690	      LDG.E.128 R44, [R44.64+0xc000] 
91	00007fe0 232626a0	      FADD R20, R53, R20 
92	00007fe0 232626b0	      FADD R21, R50, R21 
93	00007fe0 232626c0	      FADD R22, R51, R22 
94	00007fe0 232626d0	      IADD3 R49, R49, 0x1, RZ 
95	00007fe0 232626e0	      FADD R23, R52, R23 
96	00007fe0 232626f0	      ISETP.GE.AND P0, PT, R49, 0x400, PT 
97	00007fe0 23262700	      FMUL R53, R4, R24 
98	00007fe0 23262710	      FMUL R50, R4, R25 
99	00007fe0 23262720	      FMUL R51, R4, R26 
100	00007fe0 23262730	      FMUL R4, R4, R27 
101	00007fe0 23262740	      FFMA R53, R5, R28, R53 
102	00007fe0 23262750	      FFMA R50, R5, R29, R50 
103	00007fe0 23262760	      FFMA R51, R5, R30, R51 
104	00007fe0 23262770	      FFMA R4, R5, R31, R4 
105	00007fe0 23262780	      FFMA R53, R6, R32, R53 
106	00007fe0 23262790	      FFMA R50, R6, R33, R50 
107	00007fe0 232627a0	      FFMA R51, R6, R34, R51 
108	00007fe0 232627b0	      FFMA R4, R6, R35, R4 
109	00007fe0 232627c0	      FMUL R6, R40, R24 
110	00007fe0 232627d0	      FFMA R53, R7, R36, R53 
111	00007fe0 232627e0	      FFMA R50, R7, R37, R50 
112	00007fe0 232627f0	      FFMA R51, R7, R38, R51 
113	00007fe0 23262800	      FFMA R4, R7, R39, R4 
114	00007fe0 23262810	      FFMA R7, R41, R28, R6 
115	00007fe0 23262820	      FADD R16, R53, R16 
116	00007fe0 23262830	      FADD R17, R50, R17 
117	00007fe0 23262840	      FADD R18, R51, R18 
118	00007fe0 23262850	      FMUL R24, R44, R24 
119	00007fe0 23262860	      FMUL R6, R44, R25 
120	00007fe0 23262870	      FADD R19, R4, R19 
121	00007fe0 23262880	      FFMA R5, R45, R28, R24 
122	00007fe0 23262890	      FMUL R24, R40, R25 
123	00007fe0 232628a0	      FMUL R28, R40, R26 
124	00007fe0 232628b0	      FMUL R40, R40, R27 
125	00007fe0 232628c0	      FFMA R6, R45, R29, R6 
126	00007fe0 232628d0	      FFMA R24, R41, R29, R24 
127	00007fe0 232628e0	      FFMA R25, R41, R30, R28 
128	00007fe0 232628f0	      FFMA R40, R41, R31, R40 
129	00007fe0 23262900	      FFMA R28, R42, R32, R7 
130	00007fe0 23262910	      FFMA R24, R42, R33, R24 
131	00007fe0 23262920	      FFMA R25, R42, R34, R25 
132	00007fe0 23262930	      FFMA R40, R42, R35, R40 
133	00007fe0 23262940	      FMUL R42, R44, R26 
134	00007fe0 23262950	      FMUL R44, R44, R27 
135	00007fe0 23262960	      FFMA R32, R46, R32, R5 
136	00007fe0 23262970	      FFMA R6, R46, R33, R6 
137	00007fe0 23262980	      FFMA R27, R45, R30, R42 
138	00007fe0 23262990	      FFMA R44, R45, R31, R44 
139	00007fe0 232629a0	      FFMA R7, R43, R36, R28 
140	00007fe0 232629b0	      FFMA R24, R43, R37, R24 
141	00007fe0 232629c0	      FFMA R27, R46, R34, R27 
142	00007fe0 232629d0	      FFMA R44, R46, R35, R44 
143	00007fe0 232629e0	      FFMA R25, R43, R38, R25 
144	00007fe0 232629f0	      FFMA R26, R43, R39, R40 
145	00007fe0 23262a00	      FFMA R5, R47, R36, R32 
146	00007fe0 23262a10	      FFMA R6, R47, R37, R6 
147	00007fe0 23262a20	      FFMA R27, R47, R38, R27 
148	00007fe0 23262a30	      FFMA R44, R47, R39, R44 
149	00007fe0 23262a40	      FADD R8, R7, R8 
150	00007fe0 23262a50	      FADD R9, R24, R9 
151	00007fe0 23262a60	      FADD R10, R25, R10 
152	00007fe0 23262a70	      FADD R11, R26, R11 
153	00007fe0 23262a80	      FADD R12, R5, R12 
154	00007fe0 23262a90	      FADD R13, R6, R13 
155	00007fe0 23262aa0	      FADD R14, R27, R14 
156	00007fe0 23262ab0	      FADD R15, R44, R15 
157	00007fe0 23262ac0	@!P0  BRA 0x7fe023262490 
158	00007fe0 23262ad0	      STG.E.128 [R2.64], R20 
159	00007fe0 23262ae0	      STG.E.128 [R2.64+0xa0], R16 
160	00007fe0 23262af0	      STG.E.128 [R2.64+0x140], R8 
161	00007fe0 23262b00	      STG.E.128 [R2.64+0x1e0], R12 
162	00007fe0 23262b10	      EXIT 

Kernel B:

1	00007fe0 23260e00	      MOV R1, c[0x0][0x28] 
2	00007fe0 23260e10	      I2F.U32.RP R0, 0x5 
3	00007fe0 23260e20	      S2R R5, SR_CTAID.X 
4	00007fe0 23260e30	      MOV R53, RZ 
5	00007fe0 23260e40	      CS2R R12, SRZ 
6	00007fe0 23260e50	      CS2R R14, SRZ 
7	00007fe0 23260e60	      S2R R6, SR_CTAID.Z 
8	00007fe0 23260e70	      CS2R R16, SRZ 
9	00007fe0 23260e80	      CS2R R18, SRZ 
10	00007fe0 23260e90	      S2R R9, SR_CTAID.Y 
11	00007fe0 23260ea0	      S2R R8, SR_TID.Z 
12	00007fe0 23260eb0	      MUFU.RCP R0, R0 
13	00007fe0 23260ec0	      S2R R10, SR_TID.Y 
14	00007fe0 23260ed0	      S2R R11, SR_TID.X 
15	00007fe0 23260ee0	      IABS R4, R5 
16	00007fe0 23260ef0	      IADD3 R2, R0, 0xffffffe, RZ 
17	00007fe0 23260f00	      LOP3.LUT R0, R5, 0x5, RZ, 0x3c, !PT 
18	00007fe0 23260f10	      F2I.FTZ.U32.TRUNC.NTZ R3, R2 
19	00007fe0 23260f20	      ISETP.GE.AND P1, PT, R0, RZ, PT 
20	00007fe0 23260f30	      IMAD R0, R6, 0x140000, RZ 
21	00007fe0 23260f40	      LEA R6, R6, R9, 0x1 
22	00007fe0 23260f50	      MOV R2, RZ 
23	00007fe0 23260f60	      IMAD R7, R3, -0x5, RZ 
24	00007fe0 23260f70	      IMAD.HI.U32 R3, R3, R7, R2 
25	00007fe0 23260f80	      IMAD R2, R9, 0xa0000, R0 
26	00007fe0 23260f90	      IMAD.HI.U32 R3, R3, R4, RZ 
27	00007fe0 23260fa0	      IMAD R4, R3, -0x5, R4 
28	00007fe0 23260fb0	      ISETP.GE.U32.AND P2, PT, R4, 0x5, PT 
29	00007fe0 23260fc0	@P2   IADD3 R4, R4, -0x5, RZ 
30	00007fe0 23260fd0	@P2   IADD3 R3, R3, 0x1, RZ 
31	00007fe0 23260fe0	      ISETP.GE.U32.AND P0, PT, R4, 0x5, PT 
32	00007fe0 23260ff0	      ISETP.GE.AND P2, PT, R5, RZ, PT 
33	00007fe0 23261000	      IADD3 R5, R4, -0x5, RZ 
34	00007fe0 23261010	      SEL R4, R5, R4, P0 
35	00007fe0 23261020	      IMAD R5, R9, 0xa0, R0 
36	00007fe0 23261030	@P0   IADD3 R3, R3, 0x1, RZ 
37	00007fe0 23261040	@!P2  IADD3 R4, -R4, RZ, RZ 
38	00007fe0 23261050	@!P1  IADD3 R3, -R3, RZ, RZ 
39	00007fe0 23261060	      LEA R5, R4, R5, 0x3 
40	00007fe0 23261070	      IMAD R7, R3, 0xa00, R2 
41	00007fe0 23261080	      LEA R3, R6, R3, 0x8 
42	00007fe0 23261090	      LEA R0, R8, R5, 0x2 
43	00007fe0 232610a0	      LEA R7, R4, R7, 0x3 
44	00007fe0 232610b0	      LEA R52, R3, R10, 0x4 
45	00007fe0 232610c0	      IMAD R0, R11, 0x28, R0 
46	00007fe0 232610d0	      LEA R7, R8, R7, 0x2 
47	00007fe0 232610e0	      CS2R R4, SRZ 
48	00007fe0 232610f0	      LEA R52, R11, R52, 0xa 
49	00007fe0 23261100	      CS2R R8, SRZ 
50	00007fe0 23261110	      IMAD R7, R10, 0xa0, R7 
51	00007fe0 23261120	      IMAD R2, R11, 0x28000, R7 
52	00007fe0 23261130	      CS2R R6, SRZ 
53	00007fe0 23261140	      CS2R R10, SRZ 
54	00007fe0 23261150	      SHF.L.U32 R3, R2, 0x2, RZ 
55	00007fe0 23261160	      IADD3 R2, P0, R3, c[0x0][0x160], RZ 
56	00007fe0 23261170	      LEA.HI.X.SX32 R3, R3, c[0x0][0x164], 0x1, P0 
57	00007fe0 23261180	      LEA R20, R52, R53, 0xc 
58	00007fe0 23261190	      IMAD R21, R53, 0x500, R0 
59	00007fe0 232611a0	      ULDC.64 UR4, c[0x0][0x118] 
60	00007fe0 232611b0	      SHF.L.U32 R20, R20, 0x2, RZ 
61	00007fe0 232611c0	      SHF.L.U32 R21, R21, 0x2, RZ 
62	00007fe0 232611d0	      SHF.L.U32 R20, R20, 0x2, RZ 
63	00007fe0 232611e0	      IADD3 R48, P1, R21, c[0x0][0x170], RZ 
64	00007fe0 232611f0	      IADD3 R50, P0, R20, c[0x0][0x168], RZ 
65	00007fe0 23261200	      LEA.HI.X.SX32 R49, R21, c[0x0][0x174], 0x1, P1 
66	00007fe0 23261210	      LEA.HI.X.SX32 R51, R20, c[0x0][0x16c], 0x1, P0 
67	00007fe0 23261220	      LDG.E.128 R20, [R48.64] 
68	00007fe0 23261230	      LDG.E.128 R24, [R50.64] 
69	00007fe0 23261240	      LDG.E.128 R28, [R48.64+0x500] 
70	00007fe0 23261250	      LDG.E.128 R32, [R48.64+0xa00] 
71	00007fe0 23261260	      LDG.E.128 R36, [R50.64+0x4000] 
72	00007fe0 23261270	      LDG.E.128 R40, [R50.64+0x8000] 
73	00007fe0 23261280	      LDG.E.128 R44, [R50.64+0xc000] 
74	00007fe0 23261290	      FFMA R4, R24, R20, R4 
75	00007fe0 232612a0	      FFMA R6, R24, R22, R6 
76	00007fe0 232612b0	      FFMA R5, R24, R21, R5 
77	00007fe0 232612c0	      FFMA R24, R24, R23, R7 
78	00007fe0 232612d0	      FFMA R7, R25, R28, R4 
79	00007fe0 232612e0	      FFMA R51, R25, R30, R6 
80	00007fe0 232612f0	      FFMA R4, R26, R32, R7 
81	00007fe0 23261300	      FFMA R7, R26, R34, R51 
82	00007fe0 23261310	      LDG.E.128 R48, [R48.64+0xf00] 
83	00007fe0 23261320	      FFMA R5, R25, R29, R5 
84	00007fe0 23261330	      FFMA R24, R25, R31, R24 
85	00007fe0 23261340	      FFMA R6, R26, R33, R5 
86	00007fe0 23261350	      FFMA R26, R26, R35, R24 
87	00007fe0 23261360	      FFMA R24, R36, R21, R9 
88	00007fe0 23261370	      FFMA R9, R36, R22, R10 
89	00007fe0 23261380	      FFMA R8, R36, R20, R8 
90	00007fe0 23261390	      FFMA R9, R37, R30, R9 
91	00007fe0 232613a0	      FFMA R36, R36, R23, R11 
92	00007fe0 232613b0	      FFMA R11, R38, R34, R9 
93	00007fe0 232613c0	      FFMA R9, R40, R22, R14 
94	00007fe0 232613d0	      IADD3 R53, R53, 0x1, RZ 
95	00007fe0 232613e0	      FFMA R5, R37, R28, R8 
96	00007fe0 232613f0	      FFMA R10, R37, R29, R24 
97	00007fe0 23261400	      FFMA R9, R41, R30, R9 
98	00007fe0 23261410	      FFMA R12, R40, R20, R12 
99	00007fe0 23261420	      FFMA R24, R40, R21, R13 
100	00007fe0 23261430	      FFMA R40, R40, R23, R15 
101	00007fe0 23261440	      ISETP.GE.AND P0, PT, R53, 0x400, PT 
102	00007fe0 23261450	      FFMA R8, R38, R32, R5 
103	00007fe0 23261460	      FFMA R15, R42, R34, R9 
104	00007fe0 23261470	      FFMA R16, R44, R20, R16 
105	00007fe0 23261480	      FFMA R5, R41, R28, R12 
106	00007fe0 23261490	      FFMA R20, R44, R21, R17 
107	00007fe0 232614a0	      FFMA R9, R44, R22, R18 
108	00007fe0 232614b0	      FFMA R44, R44, R23, R19 
109	00007fe0 232614c0	      FFMA R14, R41, R29, R24 
110	00007fe0 232614d0	      FFMA R12, R42, R32, R5 
111	00007fe0 232614e0	      FFMA R9, R45, R30, R9 
112	00007fe0 232614f0	      FFMA R36, R37, R31, R36 
113	00007fe0 23261500	      FFMA R40, R41, R31, R40 
114	00007fe0 23261510	      FFMA R5, R45, R28, R16 
115	00007fe0 23261520	      FFMA R20, R45, R29, R20 
116	00007fe0 23261530	      FFMA R44, R45, R31, R44 
117	00007fe0 23261540	      FFMA R10, R38, R33, R10 
118	00007fe0 23261550	      FFMA R14, R42, R33, R14 
119	00007fe0 23261560	      FFMA R19, R46, R34, R9 
120	00007fe0 23261570	      FFMA R38, R38, R35, R36 
121	00007fe0 23261580	      FFMA R42, R42, R35, R40 
122	00007fe0 23261590	      FFMA R16, R46, R32, R5 
123	00007fe0 232615a0	      FFMA R20, R46, R33, R20 
124	00007fe0 232615b0	      FFMA R44, R46, R35, R44 
125	00007fe0 232615c0	      FFMA R5, R27, R49, R6 
126	00007fe0 232615d0	      FFMA R9, R39, R49, R10 
127	00007fe0 232615e0	      FFMA R13, R43, R49, R14 
128	00007fe0 232615f0	      FFMA R6, R27, R50, R7 
129	00007fe0 23261600	      FFMA R10, R39, R50, R11 
130	00007fe0 23261610	      FFMA R14, R43, R50, R15 
131	00007fe0 23261620	      FFMA R18, R47, R50, R19 
132	00007fe0 23261630	      FFMA R4, R27, R48, R4 
133	00007fe0 23261640	      FFMA R7, R27, R51, R26 
134	00007fe0 23261650	      FFMA R8, R39, R48, R8 
135	00007fe0 23261660	      FFMA R11, R39, R51, R38 
136	00007fe0 23261670	      FFMA R12, R43, R48, R12 
137	00007fe0 23261680	      FFMA R15, R43, R51, R42 
138	00007fe0 23261690	      FFMA R16, R47, R48, R16 
139	00007fe0 232616a0	      FFMA R17, R47, R49, R20 
140	00007fe0 232616b0	      FFMA R19, R47, R51, R44 
141	00007fe0 232616c0	@!P0  BRA 0x7fe023261180 
142	00007fe0 232616d0	      ULDC.64 UR4, c[0x0][0x118] 
143	00007fe0 232616e0	      STG.E.128 [R2.64], R4 
144	00007fe0 232616f0	      STG.E.128 [R2.64+0xa0], R8 
145	00007fe0 23261700	      STG.E.128 [R2.64+0x140], R12 
146	00007fe0 23261710	      STG.E.128 [R2.64+0x1e0], R16 
147	00007fe0 23261720	      EXIT 

Looking at line 67, where the loads happen in both kernels, it seems that the SASS for kernel B groups the loads much more tightly together, generating much more LG throttling.
How can I further debug the reason for the increased miss ratio and worse load patterns for kernel B?

Nsight Compute does not support showing per-instruction (or per code range) L1 hits/misses. The common method is to look at the access pattern of each instruction (the number of unique sectors and the addresses accessed) and answer the following questions:

  • Is the access pattern optimal for the given instruction, or is the pattern such that there is an expectation of spatial reuse (i.e. the same warp will issue an additional instruction that leverages some of the data cached by the first instruction) or temporal reuse (i.e. a different warp will use the same data, or additional data, cached by the first instruction)?
  • Are there any spatial or temporal locality assumptions made by the algorithm to improve either the L1 or L2 hit rate?

When diffing the SASS I don’t see any changes to the address patterns, but due to both register-allocation differences and instruction reordering it is hard to tell. Do the unique and excessive sectors accessed change between the two kernels? (For reference, a fully coalesced LDG.E.128 from a full warp touches 512 contiguous bytes, i.e. 16 unique 32-byte sectors; anything beyond that counts as excessive.)

When looking in NCU, is the L1/SHM configuration different between the two kernels? I do not see any shared memory accesses, so in NCU you would want to see that the L1/SHM carveout prefers L1.
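
If you want to rule out a carveout difference from the host side, a minimal driver-API sketch (assuming the cubin built with the nvcc command above; error handling omitted) would be to explicitly request the maximum-L1 carveout when loading the kernel:

    #include <cuda.h>

    int main() {
        CUdevice dev; CUcontext ctx; CUmodule mod; CUfunction fn;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        // "kern" is the cubin produced by the nvcc/ptxas invocation above.
        cuModuleLoad(&mod, "kern");
        cuModuleGetFunction(&fn, mod, "r_2_2_64_5_2_16_4_1024_4_4_4n1");
        // The value is the preferred shared-memory carveout in percent;
        // 0 means "prefer maximum L1". This is only a hint to the driver.
        cuFuncSetAttribute(fn, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, 0);
        return 0;
    }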

Is the launch configuration and occupancy the same for the two kernels?
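
To double-check the theoretical side outside of NCU, a small driver-API sketch (the cubin names, entry names and BLOCK_SIZE below are placeholders for your actual launch configuration) can compare the occupancy limit the two compiled kernels allow:

    #include <cuda.h>
    #include <stdio.h>

    int main() {
        const int BLOCK_SIZE = 128;  // placeholder: the threads per block you actually launch
        CUdevice dev; CUcontext ctx; CUmodule modA, modB; CUfunction fnA, fnB;
        int blocksA = 0, blocksB = 0;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuModuleLoad(&modA, "kernA");  // cubin of kernel A
        cuModuleLoad(&modB, "kernB");  // cubin of kernel B
        cuModuleGetFunction(&fnA, modA, "r_2_2_64_5_2_16_4_1024_4_4_4n2");
        cuModuleGetFunction(&fnB, modB, "r_2_2_64_5_2_16_4_1024_4_4_4n1");
        // Register/shared-memory limited resident blocks per SM for this block size.
        cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksA, fnA, BLOCK_SIZE, 0);
        cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksB, fnB, BLOCK_SIZE, 0);
        printf("max resident blocks per SM: A=%d B=%d\n", blocksA, blocksB);
        return 0;
    }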

Oh sorry, by the better load patterns I didn’t mean memory regions. The SASS just seems to group fewer loads one after another. Here is a diff from NCU:


You can see that two of the loads are executed later in kernel A, since their results aren’t needed until later in the kernel, while kernel B issues them together with the others, leading to the high throttling. This is while executing the kernels with the same data and configuration. The theoretical occupancy is the same for both kernels, although the achieved occupancy is slightly lower (about 3%) for kernel B.