Hi, I cache part of the data in a constant buffer to speed up reads.
If the address is small, I read from the constant buffer; otherwise, I read from global memory. The big picture is below:
#412 if(i < mark) //use cbuffer!
#413 ...cBuf[i]..;
#414 else
#415 ...globalMemory[i]..;
The real .ptx content corresponding to lines #412-#415 is below. Can anyone tell me why it is so long, and how it maps to 4 lines of CUDA code? Thanks!
P.S. I used “nvcc -myKernel.cu -ptx” on the command line. The card is an 8800GTX. nvcc is the Feb_14 beta version.
Thanks!
# 412 if(entry < _CBufREntryNum)
# 413 cmp = BoxCompare(c_Roots[entry], boxQ, queryType, false);
mov.s32 $r40, 0; #
cvt.s8.s32 $rh1, $r40; #
# .loc 12 14 0
@$p3 bra $Lt_12_120; #
$LBB8_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s32 $r41, 2; #
setp.eq.s32 $p5, $r1, $r41; #
@$p5 bra $Lt_12_124; #
$LBB9_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s32 $r42, 3; #
setp.eq.s32 $p6, $r1, $r42; #
@$p6 bra $Lt_12_128; #
$LBB10_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s16 $rh6, 1; #
mov.s32 $r43, 1; #
bra.uni $Lt_12_16; #
$Lt_12_120:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
add.u32 $r45, $r44, $r26; #
mov.u32 $r46, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r48, $r45; #
mov.s16 $rh7, 1; #
mov.s32 $r43, 1; #
$Lt_12_121:
#<loop> Loop body line 14, nesting depth: 4, iterations: 2
# .loc 12 25 0
ld.const.u32 $r49, [$r48+0]; # id:265 c_Roots+0x0
ld.local.u32 $r50, [$r46+8]; # id:266 boxQ$4+0x0
setp.gt.u32 $p7, $r49, $r50; #
@$p7 bra $Lt_12_156; #
$LBB13_MyKernel:
#<loop> Part of loop body line 14, head labeled $Lt_12_121
ld.local.u32 $r51, [$r46+0]; # id:267 boxQ$4+0x0
ld.const.u32 $r52, [$r48+8]; # id:268 c_Roots+0x0
setp.gt.u32 $p8, $r51, $r52; #
@$p8 bra $L_12_82; #
$LBB14_MyKernel:
#<loop> Part of loop body line 14, head labeled $Lt_12_121
mov.s16 $rh8, 1; #
mov.s32 $r53, 1; #
bra.uni $L_12_81; #
$Lt_12_156:
$L_12_82:
#<loop> Part of loop body line 14, head labeled $Lt_12_121
mov.s16 $rh9, 0; #
mov.s32 $r53, 0; #
$L_12_81:
#<loop> Part of loop body line 14, head labeled $Lt_12_121
and.s32 $r54, $r53, $r43; #
mov.s32 $r55, 0; #
set.ne.u32.s32 $r56, $r54, $r55; #
neg.s32 $r43, $r56; #
# .loc 12 24 0
add.u32 $r48, $r48, 4; #
add.u32 $r46, $r46, 4; #
setp.ne.s32 $p9, $r46, $r47; #
@$p9 bra $Lt_12_121; #
$LBB17_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
bra.uni $Lt_12_16; #
$Lt_12_124:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
add.u32 $r45, $r44, $r26; #
mov.u32 $r57, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r58, $r45; #
mov.s16 $rh10, 1; #
mov.s32 $r43, 1; #
$Lt_12_125:
#<loop> Loop body line 24, nesting depth: 4, iterations: 2
# .loc 12 30 0
ld.local.u32 $r59, [$r57+0]; # id:270 boxQ$4+0x0
ld.const.u32 $r60, [$r58+0]; # id:269 c_Roots+0x0
setp.lt.u32 $p10, $r59, $r60; #
@$p10 bra $Lt_12_158; #
$LBB20_MyKernel:
#<loop> Part of loop body line 24, head labeled $Lt_12_125
ld.local.u32 $r61, [$r57+8]; # id:272 boxQ$4+0x0
ld.const.u32 $r62, [$r58+8]; # id:271 c_Roots+0x0
setp.gt.u32 $p11, $r61, $r62; #
@$p11 bra $L_12_85; #
$LBB21_MyKernel:
#<loop> Part of loop body line 24, head labeled $Lt_12_125
mov.s16 $rh11, 1; #
mov.s32 $r63, 1; #
bra.uni $L_12_84; #
$Lt_12_158:
$L_12_85:
#<loop> Part of loop body line 24, head labeled $Lt_12_125
mov.s16 $rh12, 0; #
mov.s32 $r63, 0; #
$L_12_84:
#<loop> Part of loop body line 24, head labeled $Lt_12_125
and.s32 $r64, $r63, $r43; #
mov.s32 $r65, 0; #
set.ne.u32.s32 $r66, $r64, $r65; #
neg.s32 $r43, $r66; #
# .loc 12 29 0
add.u32 $r58, $r58, 4; #
add.u32 $r57, $r57, 4; #
setp.ne.s32 $p12, $r57, $r47; #
@$p12 bra $Lt_12_125; #
$LBB24_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
bra.uni $Lt_12_16; #
$Lt_12_128:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
add.u32 $r45, $r44, $r26; #
mov.u32 $r67, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r68, $r45; #
mov.s16 $rh13, 1; #
mov.s32 $r43, 1; #
$Lt_12_129:
#<loop> Loop body line 29, nesting depth: 4, iterations: 2
# .loc 12 34 0
ld.const.u32 $r69, [$r68+0]; # id:273 c_Roots+0x0
ld.local.u32 $r70, [$r67+8]; # id:274 boxQ$4+0x0
setp.gt.u32 $p13, $r69, $r70; #
@$p13 bra $Lt_12_160; #
$LBB27_MyKernel:
#<loop> Part of loop body line 29, head labeled $Lt_12_129
ld.local.u32 $r71, [$r67+0]; # id:275 boxQ$4+0x0
ld.const.u32 $r72, [$r68+8]; # id:276 c_Roots+0x0
setp.gt.u32 $p14, $r71, $r72; #
@$p14 bra $L_12_88; #
$LBB28_MyKernel:
#<loop> Part of loop body line 29, head labeled $Lt_12_129
mov.s16 $rh14, 1; #
mov.s32 $r73, 1; #
bra.uni $L_12_87; #
$Lt_12_160:
$L_12_88:
#<loop> Part of loop body line 29, head labeled $Lt_12_129
mov.s16 $rh15, 0; #
mov.s32 $r73, 0; #
$L_12_87:
#<loop> Part of loop body line 29, head labeled $Lt_12_129
and.s32 $r74, $r73, $r43; #
mov.s32 $r75, 0; #
set.ne.u32.s32 $r76, $r74, $r75; #
neg.s32 $r43, $r76; #
# .loc 12 33 0
add.u32 $r68, $r68, 4; #
add.u32 $r67, $r67, 4; #
setp.ne.s32 $p15, $r67, $r47; #
@$p15 bra $Lt_12_129; #
$Lt_12_16:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
# .loc 12 413 0
mov.s32 $r77, $r43; #
cvt.s8.s32 $rh2, $r77; #
bra.uni $Lt_12_111; #
$Lt_12_112:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
# .loc 12 415 0
# 414 else
# 415 cmp = BoxCompare(d_dir[entry], boxQ, queryType, false);
mov.s32 $r78, 0; #
cvt.s8.s32 $rh3, $r78; #
# .loc 12 14 0
@$p3 bra $Lt_12_138; #
$LBB33_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s32 $r79, 2; #
setp.eq.s32 $p16, $r1, $r79; #
@$p16 bra $Lt_12_142; #
$LBB34_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s32 $r80, 3; #
setp.eq.s32 $p17, $r1, $r80; #
@$p17 bra $Lt_12_146; #
$LBB35_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mov.s16 $rh16, 1; #
mov.s32 $r43, 1; #
bra.uni $Lt_12_5; #
$Lt_12_138:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
ld.param.u32 $r81, %parm_d_dir; # id:278 %parm_d_dir+0x0
add.u32 $r82, $r81, $r44; #
mov.u32 $r46, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r83, $r82; #
mov.s16 $rh17, 1; #
mov.s32 $r43, 1; #
$Lt_12_139:
#<loop> Loop body line 14, nesting depth: 4, iterations: 2
# .loc 12 25 0
ld.global.u32 $r84, [$r83+0]; # id:279
ld.local.u32 $r85, [$r46+8]; # id:280 boxQ$4+0x0
setp.gt.u32 $p18, $r84, $r85; #
@$p18 bra $Lt_12_162; #
$LBB38_MyKernel:
#<loop> Part of loop body line 14, head labeled $Lt_12_139
ld.local.u32 $r86, [$r46+0]; # id:281 boxQ$4+0x0
ld.global.u32 $r87, [$r83+8]; # id:282
setp.gt.u32 $p19, $r86, $r87; #
@$p19 bra $L_12_94; #
$LBB39_MyKernel:
#<loop> Part of loop body line 14, head labeled $Lt_12_139
mov.s16 $rh18, 1; #
mov.s32 $r88, 1; #
bra.uni $L_12_93; #
$Lt_12_162:
$L_12_94:
#<loop> Part of loop body line 14, head labeled $Lt_12_139
mov.s16 $rh19, 0; #
mov.s32 $r88, 0; #
$L_12_93:
#<loop> Part of loop body line 14, head labeled $Lt_12_139
and.s32 $r89, $r88, $r43; #
mov.s32 $r90, 0; #
set.ne.u32.s32 $r91, $r89, $r90; #
neg.s32 $r43, $r91; #
# .loc 12 24 0
add.u32 $r83, $r83, 4; #
add.u32 $r46, $r46, 4; #
setp.ne.s32 $p20, $r46, $r47; #
@$p20 bra $Lt_12_139; #
$LBB42_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
bra.uni $Lt_12_5; #
$Lt_12_142:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
ld.param.u32 $r92, %parm_d_dir; # id:278 %parm_d_dir+0x0
add.u32 $r82, $r92, $r44; #
mov.u32 $r57, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r93, $r82; #
mov.s16 $rh20, 1; #
mov.s32 $r43, 1; #
$Lt_12_143:
#<loop> Loop body line 24, nesting depth: 4, iterations: 2
# .loc 12 30 0
ld.local.u32 $r94, [$r57+0]; # id:284 boxQ$4+0x0
ld.global.u32 $r95, [$r93+0]; # id:283
setp.lt.u32 $p21, $r94, $r95; #
@$p21 bra $Lt_12_164; #
$LBB45_MyKernel:
#<loop> Part of loop body line 24, head labeled $Lt_12_143
ld.local.u32 $r96, [$r57+8]; # id:286 boxQ$4+0x0
ld.global.u32 $r97, [$r93+8]; # id:285
setp.gt.u32 $p22, $r96, $r97; #
@$p22 bra $L_12_97; #
$LBB46_MyKernel:
#<loop> Part of loop body line 24, head labeled $Lt_12_143
mov.s16 $rh21, 1; #
mov.s32 $r98, 1; #
bra.uni $L_12_96; #
$Lt_12_164:
$L_12_97:
#<loop> Part of loop body line 24, head labeled $Lt_12_143
mov.s16 $rh22, 0; #
mov.s32 $r98, 0; #
$L_12_96:
#<loop> Part of loop body line 24, head labeled $Lt_12_143
and.s32 $r99, $r98, $r43; #
mov.s32 $r100, 0; #
set.ne.u32.s32 $r101, $r99, $r100; #
neg.s32 $r43, $r101; #
# .loc 12 29 0
add.u32 $r93, $r93, 4; #
add.u32 $r57, $r57, 4; #
setp.ne.s32 $p23, $r57, $r47; #
@$p23 bra $Lt_12_143; #
$LBB49_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
bra.uni $Lt_12_5; #
$Lt_12_146:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
mul24.lo.u32 $r44, $r36, 16; #
ld.param.u32 $r102, %parm_d_dir; # id:278 %parm_d_dir+0x0
add.u32 $r82, $r102, $r44; #
mov.u32 $r67, (&boxQ$4); #
add.u32 $r47, $r27, 8; #
mov.s32 $r103, $r82; #
mov.s16 $rh23, 1; #
mov.s32 $r43, 1; #
$Lt_12_147:
#<loop> Loop body line 29, nesting depth: 4, iterations: 2
# .loc 12 34 0
ld.global.u32 $r104, [$r103+0]; # id:287
ld.local.u32 $r105, [$r67+8]; # id:288 boxQ$4+0x0
setp.gt.u32 $p24, $r104, $r105; #
@$p24 bra $Lt_12_166; #
$LBB52_MyKernel:
#<loop> Part of loop body line 29, head labeled $Lt_12_147
ld.local.u32 $r106, [$r67+0]; # id:289 boxQ$4+0x0
ld.global.u32 $r107, [$r103+8]; # id:290
setp.gt.u32 $p25, $r106, $r107; #
@$p25 bra $L_12_100; #
$LBB53_MyKernel:
#<loop> Part of loop body line 29, head labeled $Lt_12_147
mov.s16 $rh24, 1; #
mov.s32 $r108, 1; #
bra.uni $L_12_99; #
$Lt_12_166:
$L_12_100:
#<loop> Part of loop body line 29, head labeled $Lt_12_147
mov.s16 $rh25, 0; #
mov.s32 $r108, 0; #
$L_12_99:
#<loop> Part of loop body line 29, head labeled $Lt_12_147
and.s32 $r109, $r108, $r43; #
mov.s32 $r110, 0; #
set.ne.u32.s32 $r111, $r109, $r110; #
neg.s32 $r43, $r111; #
# .loc 12 33 0
add.u32 $r103, $r103, 4; #
add.u32 $r67, $r67, 4; #
setp.ne.s32 $p26, $r67, $r47; #
@$p26 bra $Lt_12_147; #
$Lt_12_5:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
# .loc 12 415 0
mov.s32 $r112, $r43; #
cvt.s8.s32 $rh2, $r112; #
$Lt_12_111:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
cvt.s32.s8 $r113, $rh2; #
mov.s32 $r114, 0; #
setp.eq.s32 $p27, $r113, $r114; #
@$p27 bra $Lt_12_149; #
$LBB58_MyKernel:
#<loop> Part of loop body line 409, head labeled $Lt_12_110
# .loc 12 418 0
# 416 if(cmp)