It seems that dumping SASS does not work, while dumping PTX works correctly.
The result of cuobjdump --dump-sass
is as follows:
oka@harp:~/benchmark/parboil_bench/parboil/benchmarks/tpacf/build/cuda_default$ cuobjdump tpacf --dump-sass
Fatbin ptx code:
arch = sm_12
code version = [1,4]
producer = cuda
host = linux
compile_size = 64bit
identifier = src/cuda/main.cu
The result of cuobjdump --dump-ptx
is as follows:
Fatbin ptx code:
arch = sm_12
code version = [1,4]
producer = cuda
host = linux
compile_size = 64bit
identifier = src/cuda/main.cu
.version 1.4
.target sm_12, map_f64_to_f32
// compiled with /home/oka/local/cuda_4_0_17/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00001c2a_00000000-7_main.cpp3.i (/tmp/ccBI#.v9ZzOv)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_12, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00001c2a_00000000-6_main.cudafe2.gpu"
.file 3 "src/cuda/model.h"
.file 4 "/usr/lib/gcc/x86_64-linux-gnu/4.4.7/include/stddef.h"
.file 5 "/home/oka/local/cuda_4_0_17/cuda//include/crt/device_runtime.h"
.file 6 "/home/oka/local/cuda_4_0_17/cuda//include/host_defines.h"
.file 7 "/home/oka/local/cuda_4_0_17/cuda//include/builtin_types.h"
.file 8 "/home/oka/local/cuda_4_0_17/cuda//include/device_types.h"
.file 9 "/home/oka/local/cuda_4_0_17/cuda//include/driver_types.h"
.file 10 "/home/oka/local/cuda_4_0_17/cuda//include/surface_types.h"
.file 11 "/home/oka/local/cuda_4_0_17/cuda//include/texture_types.h"
.file 12 "/home/oka/local/cuda_4_0_17/cuda//include/vector_types.h"
.file 13 "/home/oka/local/cuda_4_0_17/cuda//include/device_launch_parameters.h"
.file 14 "/home/oka/local/cuda_4_0_17/cuda//include/crt/storage_class.h"
.file 15 "/usr/include/x86_64-linux-gnu/bits/types.h"
.file 16 "/usr/include/time.h"
.file 17 "/home/oka/local/cuda_4_0_17/cuda//include/sm_11_atomic_functions.h"
.file 18 "src/cuda/tpacf_kernel.cu"
.file 19 "/home/oka/local/cuda_4_0_17/cuda//include/common_functions.h"
.file 20 "/home/oka/local/cuda_4_0_17/cuda//include/math_functions.h"
.file 21 "/home/oka/local/cuda_4_0_17/cuda//include/math_constants.h"
.file 22 "/home/oka/local/cuda_4_0_17/cuda//include/device_functions.h"
.file 23 "/home/oka/local/cuda_4_0_17/cuda//include/sm_12_atomic_functions.h"
.file 24 "/home/oka/local/cuda_4_0_17/cuda//include/sm_13_double_functions.h"
.file 25 "/home/oka/local/cuda_4_0_17/cuda//include/sm_20_atomic_functions.h"
.file 26 "/home/oka/local/cuda_4_0_17/cuda//include/sm_20_intrinsics.h"
.file 27 "/home/oka/local/cuda_4_0_17/cuda//include/surface_functions.h"
.file 28 "/home/oka/local/cuda_4_0_17/cuda//include/texture_fetch_functions.h"
.file 29 "/home/oka/local/cuda_4_0_17/cuda//include/math_functions_dbl_ptx1.h"
// 84-byte, 4-byte-aligned array in constant memory. The kernel below reads it
// with ld.const.f32 at byte offset index*4, i.e. as a table of 21 32-bit floats,
// and binary-searches it with indices in the range 0..20.
.const .align 4 .b8 dev_binb[84];
//-----------------------------------------------------------------------
// Kernel entry gen_hists (mangled name _Z9gen_histsPmPfS0_S0_ii).
//
// Parameters (all read with ld.param):
//   histograms    64-bit pointer; threads 0..19 each store one u64 to
//                 histograms[ctaid.x * 20 + tid] at the end of the kernel
//   all_x_data    64-bit pointer, read as an array of f32 (ld.global.f32)
//   all_y_data    64-bit pointer, read as an array of f32
//   all_z_data    64-bit pointer, read as an array of f32
//   NUM_SETS      s32
//   NUM_ELEMENTS  s32
//
// Structure of the code below:
//   1. Zero the shared array warp_hists (2560 32-bit words), 10 unrolled steps.
//   2. Select two sets of x/y/z base pointers depending on ctaid.x vs NUM_SETS+1.
//   3. Triple-nested loop: stage a tile of points in shared memory, compute a
//      dot product per point pair, binary-search dev_binb for its bin, and
//      atomically increment a counter in warp_hists.
//   4. Tree-reduce each 128-word row of warp_hists into column 0.
//   5. Threads 0..19 write column 0 of row tid to the histograms array.
//
// All index arithmetic advances in steps of 256, presumably the thread-block
// size; the launch configuration is not visible here, confirm against the host.
//-----------------------------------------------------------------------
.entry _Z9gen_histsPmPfS0_S0_ii (
.param .u64 __cudaparm__Z9gen_histsPmPfS0_S0_ii_histograms,
.param .u64 __cudaparm__Z9gen_histsPmPfS0_S0_ii_all_x_data,
.param .u64 __cudaparm__Z9gen_histsPmPfS0_S0_ii_all_y_data,
.param .u64 __cudaparm__Z9gen_histsPmPfS0_S0_ii_all_z_data,
.param .s32 __cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_SETS,
.param .s32 __cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS)
{
// Virtual register pools: 32-bit ints, 64-bit ints/addresses, f32, predicates.
.reg .u32 %r<152>;
.reg .u64 %rd<165>;
.reg .f32 %f<23>;
.reg .pred %p<42>;
// Shared memory: 10240 bytes = 2560 u32 counters, addressed below as rows of
// 128 words (row*128 + column).
.shared .align 4 .b8 __cuda___cuda_local_var_35010_5_non_const_warp_hists128[10240];
// Shared memory: 3072 bytes, addressed below as 12-byte records (three f32 at
// offsets +0, +4, +8) indexed by thread id or by the inner loop counter.
.shared .align 4 .b8 __cuda___cuda_local_var_35007_44_non_const_data_s10368[3072];
.loc 18 51 0
$LDWbegin__Z9gen_histsPmPfS0_S0_ii:
// ---- Phase 1: zero warp_hists -------------------------------------------
// %r1 = tid.x (zero-extended from 16 bits); it stays live for the whole kernel.
// Step 0: if tid <= 2559, store 0 to word (tid>>7)*128 + (tid&127) == tid.
cvt.u32.u16 %r1, %tid.x;
mov.u32 %r2, 2559;
setp.gt.u32 %p1, %r1, %r2;
@%p1 bra $Lt_0_17922;
.loc 18 73 0
mov.u64 %rd1, __cuda___cuda_local_var_35010_5_non_const_warp_hists128;
mov.u32 %r3, 0;
and.b32 %r4, %r1, 127;
cvt.u64.u32 %rd2, %r4;
shr.u32 %r5, %r1, 7;
cvt.u64.u32 %rd3, %r5;
mul.wide.u32 %rd4, %r5, 128;
add.u64 %rd5, %rd2, %rd4;
mul.lo.u64 %rd6, %rd5, 4;
add.u64 %rd7, %rd1, %rd6;
st.shared.u32 [%rd7+0], %r3;
$Lt_0_17922:
// %rd1 = base address of warp_hists; reloaded here so it is defined on both
// paths into this label. It is used as the base for the rest of the kernel.
mov.u64 %rd1, __cuda___cuda_local_var_35010_5_non_const_warp_hists128;
// Step 1: same guarded zero store for word index tid + 256.
add.u32 %r6, %r1, 256;
mov.u32 %r7, 2559;
setp.gt.u32 %p2, %r6, %r7;
@%p2 bra $Lt_0_18434;
mov.u32 %r8, 0;
and.b32 %r9, %r6, 127;
cvt.u64.u32 %rd8, %r9;
shr.u32 %r10, %r6, 7;
cvt.u64.u32 %rd9, %r10;
mul.wide.u32 %rd10, %r10, 128;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 4;
add.u64 %rd13, %rd1, %rd12;
st.shared.u32 [%rd13+0], %r8;
$Lt_0_18434:
// Step 2: word index tid + 512.
add.u32 %r11, %r1, 512;
mov.u32 %r12, 2559;
setp.gt.u32 %p3, %r11, %r12;
@%p3 bra $Lt_0_18946;
mov.u32 %r13, 0;
and.b32 %r14, %r11, 127;
cvt.u64.u32 %rd14, %r14;
shr.u32 %r15, %r11, 7;
cvt.u64.u32 %rd15, %r15;
mul.wide.u32 %rd16, %r15, 128;
add.u64 %rd17, %rd14, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd1, %rd18;
st.shared.u32 [%rd19+0], %r13;
$Lt_0_18946:
// Step 3: word index tid + 768.
add.u32 %r16, %r1, 768;
mov.u32 %r17, 2559;
setp.gt.u32 %p4, %r16, %r17;
@%p4 bra $Lt_0_19458;
mov.u32 %r18, 0;
and.b32 %r19, %r16, 127;
cvt.u64.u32 %rd20, %r19;
shr.u32 %r20, %r16, 7;
cvt.u64.u32 %rd21, %r20;
mul.wide.u32 %rd22, %r20, 128;
add.u64 %rd23, %rd20, %rd22;
mul.lo.u64 %rd24, %rd23, 4;
add.u64 %rd25, %rd1, %rd24;
st.shared.u32 [%rd25+0], %r18;
$Lt_0_19458:
// Step 4: word index tid + 1024.
add.u32 %r21, %r1, 1024;
mov.u32 %r22, 2559;
setp.gt.u32 %p5, %r21, %r22;
@%p5 bra $Lt_0_19970;
mov.u32 %r23, 0;
and.b32 %r24, %r21, 127;
cvt.u64.u32 %rd26, %r24;
shr.u32 %r25, %r21, 7;
cvt.u64.u32 %rd27, %r25;
mul.wide.u32 %rd28, %r25, 128;
add.u64 %rd29, %rd26, %rd28;
mul.lo.u64 %rd30, %rd29, 4;
add.u64 %rd31, %rd1, %rd30;
st.shared.u32 [%rd31+0], %r23;
$Lt_0_19970:
// Step 5: word index tid + 1280.
add.u32 %r26, %r1, 1280;
mov.u32 %r27, 2559;
setp.gt.u32 %p6, %r26, %r27;
@%p6 bra $Lt_0_20482;
mov.u32 %r28, 0;
and.b32 %r29, %r26, 127;
cvt.u64.u32 %rd32, %r29;
shr.u32 %r30, %r26, 7;
cvt.u64.u32 %rd33, %r30;
mul.wide.u32 %rd34, %r30, 128;
add.u64 %rd35, %rd32, %rd34;
mul.lo.u64 %rd36, %rd35, 4;
add.u64 %rd37, %rd1, %rd36;
st.shared.u32 [%rd37+0], %r28;
$Lt_0_20482:
// Step 6: word index tid + 1536.
add.u32 %r31, %r1, 1536;
mov.u32 %r32, 2559;
setp.gt.u32 %p7, %r31, %r32;
@%p7 bra $Lt_0_20994;
mov.u32 %r33, 0;
and.b32 %r34, %r31, 127;
cvt.u64.u32 %rd38, %r34;
shr.u32 %r35, %r31, 7;
cvt.u64.u32 %rd39, %r35;
mul.wide.u32 %rd40, %r35, 128;
add.u64 %rd41, %rd38, %rd40;
mul.lo.u64 %rd42, %rd41, 4;
add.u64 %rd43, %rd1, %rd42;
st.shared.u32 [%rd43+0], %r33;
$Lt_0_20994:
// Step 7: word index tid + 1792.
add.u32 %r36, %r1, 1792;
mov.u32 %r37, 2559;
setp.gt.u32 %p8, %r36, %r37;
@%p8 bra $Lt_0_21506;
mov.u32 %r38, 0;
and.b32 %r39, %r36, 127;
cvt.u64.u32 %rd44, %r39;
shr.u32 %r40, %r36, 7;
cvt.u64.u32 %rd45, %r40;
mul.wide.u32 %rd46, %r40, 128;
add.u64 %rd47, %rd44, %rd46;
mul.lo.u64 %rd48, %rd47, 4;
add.u64 %rd49, %rd1, %rd48;
st.shared.u32 [%rd49+0], %r38;
$Lt_0_21506:
// Step 8: word index tid + 2048.
add.u32 %r41, %r1, 2048;
mov.u32 %r42, 2559;
setp.gt.u32 %p9, %r41, %r42;
@%p9 bra $Lt_0_22018;
mov.u32 %r43, 0;
and.b32 %r44, %r41, 127;
cvt.u64.u32 %rd50, %r44;
shr.u32 %r45, %r41, 7;
cvt.u64.u32 %rd51, %r45;
mul.wide.u32 %rd52, %r45, 128;
add.u64 %rd53, %rd50, %rd52;
mul.lo.u64 %rd54, %rd53, 4;
add.u64 %rd55, %rd1, %rd54;
st.shared.u32 [%rd55+0], %r43;
$Lt_0_22018:
// Step 9: word index tid + 2304.
add.u32 %r46, %r1, 2304;
mov.u32 %r47, 2559;
setp.gt.u32 %p10, %r46, %r47;
@%p10 bra $Lt_0_22530;
mov.u32 %r48, 0;
and.b32 %r49, %r46, 127;
cvt.u64.u32 %rd56, %r49;
shr.u32 %r50, %r46, 7;
cvt.u64.u32 %rd57, %r50;
mul.wide.u32 %rd58, %r50, 128;
add.u64 %rd59, %rd56, %rd58;
mul.lo.u64 %rd60, %rd59, 4;
add.u64 %rd61, %rd1, %rd60;
st.shared.u32 [%rd61+0], %r48;
$Lt_0_22530:
// ---- Phase 2: choose base pointers --------------------------------------
// %r52 = NUM_SETS + 1, %r53 = ctaid.x (block index, live to the end).
// %p11 = (NUM_SETS + 1 <= ctaid.x). It is also tested inside the inner loop.
// Outputs of this phase:
//   %rd65/%rd66/%rd67 : x/y/z pointers used to fill the shared tile
//   %rd70/%rd71/%rd72 : x/y/z pointers used for this thread's own point
ld.param.s32 %r51, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_SETS];
add.u32 %r52, %r51, 1;
cvt.u32.u16 %r53, %ctaid.x;
setp.le.u32 %p11, %r52, %r53;
ld.param.u64 %rd62, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_z_data];
ld.param.u64 %rd63, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_y_data];
ld.param.u64 %rd64, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_x_data];
ld.param.s32 %r54, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS];
@!%p11 bra $Lt_0_23298;
// Case ctaid.x >= NUM_SETS + 1: the tile pointers are the unmodified array
// bases, and the per-thread pointers are offset by
// NUM_ELEMENTS * (ctaid.x - NUM_SETS) floats.
// (The repeated ld.param instructions re-read values already loaded above.)
ld.param.u64 %rd64, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_x_data];
.loc 18 80 0
mov.s64 %rd65, %rd64;
.loc 18 73 0
ld.param.u64 %rd63, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_y_data];
.loc 18 81 0
mov.s64 %rd66, %rd63;
.loc 18 73 0
ld.param.u64 %rd62, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_z_data];
.loc 18 82 0
mov.s64 %rd67, %rd62;
.loc 18 73 0
ld.param.s32 %r51, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_SETS];
.loc 18 83 0
sub.u32 %r55, %r53, %r51;
.loc 18 73 0
ld.param.s32 %r54, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS];
.loc 18 83 0
mul.lo.u32 %r56, %r54, %r55;
cvt.u64.u32 %rd68, %r56;
mul.wide.u32 %rd69, %r56, 4;
add.u64 %rd70, %rd69, %rd64;
.loc 18 84 0
add.u64 %rd71, %rd69, %rd63;
.loc 18 85 0
add.u64 %rd72, %rd69, %rd62;
bra.uni $Lt_0_23042;
$Lt_0_23298:
// Case ctaid.x < NUM_SETS + 1: both pointer sets are identical, each array
// base offset by NUM_ELEMENTS * ctaid.x floats.
.loc 18 73 0
ld.param.s32 %r54, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS];
.loc 18 89 0
mul.lo.u32 %r57, %r54, %r53;
cvt.u64.u32 %rd73, %r57;
mul.wide.u32 %rd74, %r57, 4;
.loc 18 73 0
ld.param.u64 %rd64, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_x_data];
.loc 18 89 0
add.u64 %rd75, %rd74, %rd64;
mov.s64 %rd70, %rd75;
.loc 18 73 0
ld.param.u64 %rd63, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_y_data];
.loc 18 90 0
add.u64 %rd76, %rd74, %rd63;
mov.s64 %rd71, %rd76;
.loc 18 73 0
ld.param.u64 %rd62, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_all_z_data];
.loc 18 91 0
add.u64 %rd77, %rd74, %rd62;
mov.s64 %rd72, %rd77;
.loc 18 93 0
mov.s64 %rd65, %rd75;
.loc 18 94 0
mov.s64 %rd66, %rd76;
.loc 18 95 0
mov.s64 %rd67, %rd77;
$Lt_0_23042:
// ---- Phase 3: pair loop --------------------------------------------------
// Skip straight to the reduction when NUM_ELEMENTS == 0.
mov.u32 %r58, 0;
.loc 18 73 0
ld.param.s32 %r54, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS];
.loc 18 95 0
setp.eq.u32 %p12, %r54, %r58;
@%p12 bra $Lt_0_23554;
// %p13 = (NUM_SETS + 1 > ctaid.x), the complement of %p11.
setp.gt.u32 %p13, %r52, %r53;
selp.s32 %r59, 1, 0, %p13;
.loc 18 73 0
ld.param.s32 %r54, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_NUM_ELEMENTS];
.loc 18 95 0
// %r65 = (NUM_ELEMENTS + 255) / 256 using signed round-toward-zero division.
// Neither %r65, its copy %r67, nor %r59 is read again in this kernel.
add.u32 %r60, %r54, 255;
shr.s32 %r61, %r60, 31;
mov.s32 %r62, 255;
and.b32 %r63, %r61, %r62;
add.s32 %r64, %r63, %r60;
shr.s32 %r65, %r64, 8;
// %r66 = outer-loop tile offset (0, 256, 512, ...).
// %rd78 = base of the shared tile data_s; %rd79 = base of dev_binb.
mov.u32 %r66, 0;
mov.u64 %rd78, __cuda___cuda_local_var_35007_44_non_const_data_s10368;
mov.u64 %rd79, dev_binb;
mov.s32 %r67, %r65;
$Lt_0_24066:
// Loop body line 95, nesting depth: 1, estimated iterations: unknown
// Outer loop: if tile offset + tid is in range, load x/y/z for that element
// through the tile pointers and store them to data_s[tid] at +0/+4/+8.
add.u32 %r68, %r66, %r1;
setp.le.u32 %p14, %r54, %r68;
@%p14 bra $Lt_0_24322;
.loc 18 105 0
cvt.u64.u32 %rd80, %r68;
mul.wide.u32 %rd81, %r68, 4;
add.u64 %rd82, %rd81, %rd66;
ld.global.f32 %f1, [%rd82+0];
add.u64 %rd83, %rd81, %rd67;
ld.global.f32 %f2, [%rd83+0];
cvt.u64.u32 %rd84, %r1;
mul.wide.u32 %rd85, %r1, 12;
add.u64 %rd86, %rd78, %rd85;
add.u64 %rd87, %rd81, %rd65;
ld.global.f32 %f3, [%rd87+0];
st.shared.f32 [%rd86+0], %f3;
st.shared.f32 [%rd86+4], %f1;
st.shared.f32 [%rd86+8], %f2;
$Lt_0_24322:
.loc 18 109 0
// Barrier after the tile stores and before any thread reads the tile.
// NOTE(review): this is the only bar.sync inside the outer loop; there is no
// barrier between the inner-loop reads of data_s and the next iteration's
// stores to it. Confirm whether a second barrier is needed in the source.
bar.sync 0;
.loc 18 112 0
// Middle-loop start index %r71: tile offset + 1 when %p13 holds, else 0.
// %r72 is the loop counter; skip the middle loop if it is already out of range.
add.u32 %r69, %r66, 1;
mov.u32 %r70, 0;
selp.u32 %r71, %r69, %r70, %p13;
mov.s32 %r72, %r71;
setp.le.u32 %p15, %r54, %r71;
@%p15 bra $Lt_0_24834;
// %r79 = (NUM_ELEMENTS - start + 255) / 256; it is copied to %r81, which is
// not read again. %r80 = start + tid is this thread's element index.
// %p16 = (NUM_ELEMENTS > tile offset) gates entry to the inner loop.
sub.u32 %r73, %r54, %r71;
add.u32 %r74, %r73, 255;
shr.s32 %r75, %r74, 31;
mov.s32 %r76, 255;
and.b32 %r77, %r75, %r76;
add.s32 %r78, %r77, %r74;
shr.s32 %r79, %r78, 8;
add.u32 %r80, %r71, %r1;
setp.gt.u32 %p16, %r54, %r66;
mov.s32 %r81, %r79;
$Lt_0_25346:
// Loop body line 112, nesting depth: 2, estimated iterations: unknown
// Middle loop: if %r80 is in range, load this thread's own point through the
// per-thread pointers into %f5 (x), %f7 (y), %f9 (z). When out of range the
// registers keep whatever they held before.
setp.le.u32 %p17, %r54, %r80;
@%p17 bra $Lt_0_25602;
.loc 18 122 0
cvt.u64.u32 %rd88, %r80;
mul.wide.u32 %rd89, %r80, 4;
add.u64 %rd90, %rd89, %rd70;
ld.global.f32 %f4, [%rd90+0];
mov.f32 %f5, %f4;
.loc 18 123 0
add.u64 %rd91, %rd89, %rd71;
ld.global.f32 %f6, [%rd91+0];
mov.f32 %f7, %f6;
.loc 18 124 0
add.u64 %rd92, %rd89, %rd72;
ld.global.f32 %f8, [%rd92+0];
mov.f32 %f9, %f8;
$Lt_0_25602:
.loc 18 131 0
@!%p16 bra $L_0_16642;
// Copy the point into %f11 (x), %f12 (y), %f10 (z); %r82 = tile index, from 0.
mov.f32 %f10, %f9;
mov.f32 %f11, %f5;
mov.f32 %f12, %f7;
mov.u32 %r82, 0;
$L_0_15874:
.loc 18 136 0
// Inner loop: %f18 = dot product of this thread's point with data_s[%r82],
// accumulated as tile.z*z + (tile.x*x + tile.y*y).
cvt.u64.u32 %rd93, %r82;
mul.wide.u32 %rd94, %r82, 12;
add.u64 %rd95, %rd78, %rd94;
ld.shared.f32 %f13, [%rd95+4];
mul.f32 %f14, %f13, %f12;
ld.shared.f32 %f15, [%rd95+0];
mad.f32 %f16, %f15, %f11, %f14;
ld.shared.f32 %f17, [%rd95+8];
mad.f32 %f18, %f17, %f10, %f16;
// Binary search over dev_binb: %r83 = upper index (starts at 20),
// %r84 = lower index (starts at 0).
mov.u32 %r83, 20;
mov.u32 %r84, 0;
$Lt_0_26626:
// Loop body line 136, nesting depth: 3, estimated iterations: unknown
// mid = (upper + lower) >> 1; if dev_binb[mid] <= dot then upper = mid,
// otherwise lower = mid.
add.u32 %r85, %r83, %r84;
shr.u32 %r86, %r85, 1;
cvt.u64.u32 %rd96, %r86;
mul.wide.u32 %rd97, %r86, 4;
add.u64 %rd98, %rd79, %rd97;
ld.const.f32 %f19, [%rd98+0];
setp.le.f32 %p18, %f19, %f18;
@!%p18 bra $Lt_0_27138;
.loc 18 153 0
mov.s32 %r83, %r86;
bra.uni $Lt_0_26882;
$Lt_0_27138:
.loc 18 155 0
mov.s32 %r84, %r86;
$Lt_0_26882:
// Keep searching while upper > lower + 1.
add.u32 %r87, %r84, 1;
setp.gt.u32 %p19, %r83, %r87;
@%p19 bra $Lt_0_26626;
.loc 18 157 0
// Count the pair only if dev_binb[lower] > dot and dev_binb[upper] <= dot.
cvt.u64.u32 %rd99, %r84;
mul.wide.u32 %rd100, %r84, 4;
add.u64 %rd101, %rd79, %rd100;
ld.const.f32 %f20, [%rd101+0];
setp.gt.f32 %p20, %f20, %f18;
@!%p20 bra $Lt_0_33794;
cvt.u64.u32 %rd102, %r83;
mul.wide.u32 %rd103, %r83, 4;
add.u64 %rd104, %rd79, %rd103;
ld.const.f32 %f21, [%rd104+0];
setp.le.f32 %p21, %f21, %f18;
@!%p21 bra $Lt_0_33794;
// When %p11 is false, additionally require this thread's element index %r80
// to be greater than the tile element's index (%r82 + tile offset).
@%p11 bra $L_0_17410;
add.u32 %r88, %r82, %r66;
setp.le.u32 %p22, %r80, %r88;
@%p22 bra $Lt_0_33794;
$L_0_17410:
// Skip the increment if this thread's element index is out of range.
setp.le.u32 %p23, %r54, %r80;
@%p23 bra $Lt_0_33794;
.loc 17 123 0
// Atomically add 1 to the warp_hists word at row (upper - 1), column tid>>1:
// byte address = base + 4 * (upper*128 + (tid>>1)) - 512.
shr.u32 %r89, %r1, 1;
cvt.u64.u32 %rd105, %r89;
mul.lo.u64 %rd106, %rd102, 128;
add.u64 %rd107, %rd105, %rd106;
mul.lo.u64 %rd108, %rd107, 4;
add.u64 %rd109, %rd1, %rd108;
sub.u64 %rd110, %rd109, 512;
mov.u32 %r90, 1;
atom.shared.add.u32 %r91, [%rd110], %r90;
$Lt_0_33794:
$L_0_16898:
.loc 18 131 0
// Next tile element; stay in the inner loop while %r82 <= 255 and
// %r82 + tile offset < NUM_ELEMENTS.
add.u32 %r82, %r82, 1;
mov.u32 %r92, 255;
setp.gt.u32 %p24, %r82, %r92;
@%p24 bra $L_0_16642;
add.u32 %r93, %r82, %r66;
setp.gt.u32 %p25, %r54, %r93;
@%p25 bra $L_0_15874;
$L_0_16642:
$L_0_16130:
// Middle-loop step: advance the counter and this thread's element index by 256.
add.u32 %r72, %r72, 256;
add.u32 %r80, %r80, 256;
setp.gt.u32 %p26, %r54, %r72;
@%p26 bra $Lt_0_25346;
$Lt_0_24834:
// Outer-loop step: advance the tile offset by 256.
add.u32 %r66, %r66, 256;
setp.gt.u32 %p27, %r54, %r66;
@%p27 bra $Lt_0_24066;
$Lt_0_23554:
// ---- Phase 4: reduce each warp_hists row into column 0 -------------------
// %r94 = tid & 63 (column), %r95 = tid >> 6 (first row for this thread).
// Each thread handles rows %r95, %r95+4, %r95+8, %r95+12, %r95+16;
// %r101 and %r106..%r109 are 0/1 flags for "that row index is <= 19".
and.b32 %r94, %r1, 63;
shr.u32 %r95, %r1, 6;
mov.u32 %r96, 19;
setp.le.u32 %p28, %r95, %r96;
add.u32 %r97, %r95, 4;
add.u32 %r98, %r95, 8;
add.u32 %r99, %r95, 12;
add.u32 %r100, %r95, 16;
selp.s32 %r101, 1, 0, %p28;
mov.u32 %r102, 19;
setp.le.u32 %p29, %r97, %r102;
mov.u32 %r103, 19;
setp.le.u32 %p30, %r98, %r103;
mov.u32 %r104, 19;
setp.le.u32 %p31, %r99, %r104;
mov.u32 %r105, 19;
setp.le.u32 %p32, %r100, %r105;
selp.s32 %r106, 1, 0, %p29;
selp.s32 %r107, 1, 0, %p30;
selp.s32 %r108, 1, 0, %p31;
selp.s32 %r109, 1, 0, %p32;
// %r110 = reduction stride: starts at 64 and is halved each pass until 0.
mov.u32 %r110, 64;
$Lt_0_28674:
// Loop body line 131, nesting depth: 1, estimated iterations: unknown
.loc 18 179 0
// A barrier precedes each of the five row updates in a pass.
// %r111 = 1 when column < stride. Row %r95: word[col] += word[col + stride].
bar.sync 0;
setp.lt.u32 %p33, %r94, %r110;
selp.s32 %r111, 1, 0, %p33;
and.b32 %r112, %r111, %r101;
mov.u32 %r113, 0;
setp.eq.s32 %p34, %r112, %r113;
@%p34 bra $Lt_0_28930;
.loc 18 185 0
cvt.u64.u32 %rd111, %r94;
cvt.u64.u32 %rd112, %r95;
mul.wide.u32 %rd113, %r95, 128;
add.u64 %rd114, %rd113, %rd111;
mul.lo.u64 %rd115, %rd114, 4;
add.u64 %rd116, %rd1, %rd115;
ld.shared.u32 %r114, [%rd116+0];
add.u32 %r115, %r94, %r110;
cvt.u64.u32 %rd117, %r115;
add.u64 %rd118, %rd117, %rd113;
mul.lo.u64 %rd119, %rd118, 4;
add.u64 %rd120, %rd1, %rd119;
ld.shared.u32 %r116, [%rd120+0];
add.u32 %r117, %r114, %r116;
cvt.u64.u32 %rd121, %r117;
cvt.u32.u64 %r118, %rd121;
st.shared.u32 [%rd116+0], %r118;
$Lt_0_28930:
.loc 18 179 0
// Row %r95 + 4: same update at byte offset +2048 (4 rows * 128 words * 4 bytes).
bar.sync 0;
and.b32 %r119, %r111, %r106;
mov.u32 %r120, 0;
setp.eq.s32 %p35, %r119, %r120;
@%p35 bra $Lt_0_29442;
.loc 18 185 0
cvt.u64.u32 %rd111, %r94;
cvt.u64.u32 %rd122, %r95;
mul.wide.u32 %rd113, %r95, 128;
add.u64 %rd123, %rd113, %rd111;
mul.lo.u64 %rd124, %rd123, 4;
add.u64 %rd116, %rd1, %rd124;
ld.shared.u32 %r121, [%rd116+2048];
add.u32 %r122, %r94, %r110;
cvt.u64.u32 %rd125, %r122;
add.u64 %rd126, %rd125, %rd113;
mul.lo.u64 %rd127, %rd126, 4;
add.u64 %rd128, %rd1, %rd127;
ld.shared.u32 %r123, [%rd128+2048];
add.u32 %r124, %r121, %r123;
cvt.u64.u32 %rd129, %r124;
cvt.u32.u64 %r125, %rd129;
st.shared.u32 [%rd116+2048], %r125;
$Lt_0_29442:
.loc 18 179 0
// Row %r95 + 8: same update at byte offset +4096.
bar.sync 0;
and.b32 %r126, %r111, %r107;
mov.u32 %r127, 0;
setp.eq.s32 %p36, %r126, %r127;
@%p36 bra $Lt_0_29954;
.loc 18 185 0
cvt.u64.u32 %rd111, %r94;
cvt.u64.u32 %rd130, %r95;
mul.wide.u32 %rd113, %r95, 128;
add.u64 %rd131, %rd113, %rd111;
mul.lo.u64 %rd132, %rd131, 4;
add.u64 %rd116, %rd1, %rd132;
ld.shared.u32 %r128, [%rd116+4096];
add.u32 %r129, %r94, %r110;
cvt.u64.u32 %rd133, %r129;
add.u64 %rd134, %rd133, %rd113;
mul.lo.u64 %rd135, %rd134, 4;
add.u64 %rd136, %rd1, %rd135;
ld.shared.u32 %r130, [%rd136+4096];
add.u32 %r131, %r128, %r130;
cvt.u64.u32 %rd137, %r131;
cvt.u32.u64 %r132, %rd137;
st.shared.u32 [%rd116+4096], %r132;
$Lt_0_29954:
.loc 18 179 0
// Row %r95 + 12: same update at byte offset +6144.
bar.sync 0;
and.b32 %r133, %r111, %r108;
mov.u32 %r134, 0;
setp.eq.s32 %p37, %r133, %r134;
@%p37 bra $Lt_0_30466;
.loc 18 185 0
cvt.u64.u32 %rd111, %r94;
cvt.u64.u32 %rd138, %r95;
mul.wide.u32 %rd113, %r95, 128;
add.u64 %rd139, %rd113, %rd111;
mul.lo.u64 %rd140, %rd139, 4;
add.u64 %rd116, %rd1, %rd140;
ld.shared.u32 %r135, [%rd116+6144];
add.u32 %r136, %r94, %r110;
cvt.u64.u32 %rd141, %r136;
add.u64 %rd142, %rd141, %rd113;
mul.lo.u64 %rd143, %rd142, 4;
add.u64 %rd144, %rd1, %rd143;
ld.shared.u32 %r137, [%rd144+6144];
add.u32 %r138, %r135, %r137;
cvt.u64.u32 %rd145, %r138;
cvt.u32.u64 %r139, %rd145;
st.shared.u32 [%rd116+6144], %r139;
$Lt_0_30466:
.loc 18 179 0
// Row %r95 + 16: same update at byte offset +8192.
bar.sync 0;
and.b32 %r140, %r111, %r109;
mov.u32 %r141, 0;
setp.eq.s32 %p38, %r140, %r141;
@%p38 bra $Lt_0_30978;
.loc 18 185 0
cvt.u64.u32 %rd111, %r94;
cvt.u64.u32 %rd146, %r95;
mul.wide.u32 %rd113, %r95, 128;
add.u64 %rd147, %rd113, %rd111;
mul.lo.u64 %rd148, %rd147, 4;
add.u64 %rd116, %rd1, %rd148;
ld.shared.u32 %r142, [%rd116+8192];
add.u32 %r143, %r94, %r110;
cvt.u64.u32 %rd149, %r143;
add.u64 %rd150, %rd149, %rd113;
mul.lo.u64 %rd151, %rd150, 4;
add.u64 %rd152, %rd1, %rd151;
ld.shared.u32 %r144, [%rd152+8192];
add.u32 %r145, %r142, %r144;
cvt.u64.u32 %rd153, %r145;
cvt.u32.u64 %r146, %rd153;
st.shared.u32 [%rd116+8192], %r146;
$Lt_0_30978:
.loc 18 173 0
// Halve the stride; repeat the pass until it reaches 0.
shr.u32 %r110, %r110, 1;
mov.u32 %r147, 0;
setp.ne.u32 %p39, %r110, %r147;
@%p39 bra $Lt_0_28674;
.loc 18 190 0
// ---- Phase 5: write results ----------------------------------------------
// After a final barrier, threads with tid <= 19 read column 0 of row tid
// (byte offset tid * 512), zero-extend it to 64 bits, and store it to
// histograms[ctaid.x * 20 + tid] (8-byte elements).
bar.sync 0;
mov.u32 %r148, 19;
setp.gt.u32 %p40, %r1, %r148;
@%p40 bra $Lt_0_31746;
.loc 18 197 0
cvt.u64.u32 %rd154, %r1;
mul.wide.u32 %rd155, %r1, 512;
add.u64 %rd156, %rd1, %rd155;
ld.shared.u32 %r149, [%rd156+0];
cvt.u64.u32 %rd157, %r149;
ld.param.u64 %rd158, [__cudaparm__Z9gen_histsPmPfS0_S0_ii_histograms];
mul24.lo.u32 %r150, %r53, 20;
cvt.u64.u32 %rd159, %r150;
mul.wide.u32 %rd160, %r150, 8;
add.u64 %rd161, %rd158, %rd160;
mul.wide.u32 %rd162, %r1, 8;
add.u64 %rd163, %rd161, %rd162;
st.global.u64 [%rd163+0], %rd157;
$Lt_0_31746:
.loc 18 199 0
exit;
$LDWend__Z9gen_histsPmPfS0_S0_ii:
} // _Z9gen_histsPmPfS0_S0_ii