//
// Generated by LLVM NVPTX Back-End
//

.version 7.1
.target sm_80
.address_size 64

	// .globl	__inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel
// __wg___inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_0 has been demoted

// -----------------------------------------------------------------------------
// __inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel
//
// Per-CTA sum reduction of f32 values.
//
// Params:
//   param_0  u64 generic address, converted to global; read as f32 elements.
//   param_1  u64 generic address, converted to global; one f32 per CTA is
//            read, accumulated into, and written back at index %ctaid.x.
//
// What the code does for CTA b = %ctaid.x:
//   Phase 1: every thread t = %tid.x with t <= 255 loads the three consecutive
//            f32 elements param_0[b*768 + 3*t + {0,1,2}], adds them left to
//            right (starting from "+ 0.0"), and stores the partial sum in slot
//            t of the 256-slot shared buffer.
//   Phase 2: thread 0 alone loads param_1[b], adds the 256 shared slots to it
//            in ascending slot order, and stores the result to param_1[b].
//
// NOTE(review): phase 2 reads all 256 shared slots, so the kernel presumably
// expects to be launched with at least 256 threads in x per CTA -- confirm
// against the launch site. Element indices are computed in signed 32-bit
// arithmetic (mul.lo.s32 / mad.lo.s32) before being widened.
//
// Register roles:
//   r1   = %ctaid.x            r17  = %tid.x (also the phase-2 loop counter)
//   rd2  = global input base   rd6  = address of param_1[%ctaid.x]
//   r15  = current input element index (phase 1)
//   rd23 = shared-memory address of this thread's slot (phase 1)
//   f27  = running accumulator (phase 2)
// -----------------------------------------------------------------------------
.visible .entry __inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel(
	.param .u64 __inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_param_0,
	.param .u64 __inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_param_1
)
{
	.reg .pred	p<6>;
	.reg .b32	r<18>;
	.reg .f32	f<28>;
	.reg .b64	rd<25>;
	// demoted variable
	// 1024 bytes = 256 f32 partial sums. Declared with 16-byte alignment
	// because phase 2 reads it with ld.shared.v4.f32 (16-byte accesses at
	// offsets that are multiples of 16); a 4-byte alignment would not
	// guarantee those vector loads are naturally aligned.
	.shared .align 16 .b8 __wg___inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_0[1024];

	mov.u32		r1, %ctaid.x;
	mov.u32		r17, %tid.x;
	bar.sync	0;

	// ---- Phase 1: per-thread partial sums into shared memory ----------------
	// Threads with tid.x > 255 have no slot and go straight to the barrier.
	setp.gt.s32	p1, r17, 255;
	@p1 bra		LBB0_3;

	ld.param.u64	rd10, [__inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_param_0];
	cvta.to.global.u64	rd2, rd10;
	mul.lo.s32	r3, r1, 768;			// r3  = ctaid.x * 768 (first element of this CTA)
	add.s32		r16, r17, -256;			// loop counter, biased by -256
	mad.lo.s32	r15, r17, 3, r3;		// r15 = tid.x * 3 + ctaid.x * 768
	mul.wide.s32	rd11, r17, 4;			// byte offset of slot tid.x
	mov.u64		rd12, __wg___inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_0;
	add.s64		rd23, rd12, rd11;		// rd23 = &shared[tid.x]

LBB0_2:
	// f10 = ((in[r15] + 0.0) + in[r15 + 1]) + in[r15 + 2]
	mul.wide.s32	rd13, r15, 4;
	add.s64		rd14, rd2, rd13;
	ld.global.f32	f5, [rd14];
	add.f32		f6, f5, 0f00000000;		// explicit "+ 0.0" kept: it is not a no-op for -0.0
	add.s32		r12, r15, 1;
	mul.wide.s32	rd15, r12, 4;
	add.s64		rd16, rd2, rd15;
	ld.global.f32	f7, [rd16];
	add.f32		f8, f6, f7;
	add.s32		r13, r15, 2;
	mul.wide.s32	rd17, r13, 4;
	add.s64		rd18, rd2, rd17;
	ld.global.f32	f9, [rd18];
	add.f32		f10, f8, f9;
	st.shared.f32	[rd23], f10;

	// Loop step: counter += 256, element index += 768, shared address += 1024.
	// After the first pass r16 == tid.x (0..255), so the unsigned test below
	// (r16 > 0x7FFFFEFF) is false and the body runs once per thread.
	add.s32		r16, r16, 256;
	add.s32		r15, r15, 768;
	add.s64		rd23, rd23, 1024;
	setp.gt.u32	p2, r16, 2147483391;
	@p2 bra		LBB0_2;

LBB0_3:
	// All partial sums must be visible before thread 0 reads them.
	bar.sync	0;

	// ---- Phase 2: thread 0 folds the 256 partial sums into param_1 ----------
	setp.gt.s32	p3, r17, 0;
	@p3 bra		LBB0_9;				// tid.x > 0: nothing left to do

	ld.param.u64	rd9, [__inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_param_1];
	cvta.to.global.u64	rd1, rd9;
	mul.wide.s32	rd19, r1, 4;
	add.s64		rd6, rd1, rd19;			// rd6 = &param_1[ctaid.x]
	ld.global.f32	f27, [rd6];			// accumulator starts from the existing output value
	mov.u64		rd20, 0;
	mov.u64		rd21, __wg___inference_reduction_1d_parallel_2d_contraction_25_Sum_kernel_0;

LBB0_5:
	mov.u64		rd24, rd20;			// rd24 = byte offset into the shared buffer, reset to 0

LBB0_6:
	// Eight slots per pass (two 16-byte vector loads), added strictly in
	// ascending order so the floating-point summation order is fixed.
	add.s64		rd22, rd21, rd24;
	ld.shared.v4.f32	{f11, f12, f13, f14}, [rd22];
	add.f32		f15, f27, f11;
	add.f32		f16, f15, f12;
	add.f32		f17, f16, f13;
	add.f32		f18, f17, f14;
	ld.shared.v4.f32	{f19, f20, f21, f22}, [rd22+16];
	add.f32		f23, f18, f19;
	add.f32		f24, f23, f20;
	add.f32		f25, f24, f21;
	add.f32		f27, f25, f22;
	add.s64		rd24, rd24, 32;
	cvt.u32.u64	r14, rd24;
	setp.ne.s32	p4, r14, 1024;			// stop once all 1024 bytes have been consumed
	@p4 bra		LBB0_6;

	// Outer loop step: r17 += 256, repeat only while the old r17 < -255.
	// For tid.x == 0 the test is false, so the inner sweep runs exactly once.
	add.s32		r11, r17, 256;
	setp.lt.s32	p5, r17, -255;
	mov.u32		r17, r11;
	@p5 bra		LBB0_5;

	st.global.f32	[rd6], f27;			// param_1[ctaid.x] = old value + sum of 256 partials

LBB0_9:
	bar.sync	0;
	ret;
}