.version 1.0
// Target the compute_10 architecture. Per the PTX ISA, the map_f64_to_f32
// option maps double-precision operations to single precision.
.target compute_10, map_f64_to_f32
//by ctc's be_ptx
// Externally sized, 16-byte-aligned shared-memory byte array. No instruction
// visible in this file references it.
.extern .shared .align 16 .b8 sharedbase[];
// e011e760c_tester
//
// Kernel in which only the thread with tid.x == 0 does any work. That thread
// takes a ticket-style lock built from two arrays of 32-bit words in global
// memory, adds (ctaid.x + 1) to the word at [p_ret], and releases the lock.
// The control flow (raise a flag, take ticket = max + 1, lower the flag, then
// wait on every slot) has the shape of Lamport's bakery algorithm, which is
// also what the note at the end of this file says it is.
//
// Parameters:
//   p_entering  global address of the per-slot flag array. The ticket array
//               is addressed at p_entering + 64, i.e. directly after 16
//               four-byte flag entries.
//   p_ret       global address of the 32-bit word that is incremented.
//   p_n         declared but never read by this kernel.
//
// NOTE(review): the params are declared .u32 but read with ld.param.s32; the
// values are then used as global addresses.
// NOTE(review): every poll below is a plain ld.global with no fence between
// the stores and loads; whether other blocks' stores become visible in the
// required order is not established by this code - confirm on the target.
.entry e011e760c_tester
{
.param .u32 p_entering;
.param .u32 p_ret;
.param .s32 p_n;
.reg .u32 $r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,$r10,$r11,$r12,$r13,$r14,$r15,$r16,$r17,$r18,$r19,$r20,$r21,$r22,$r23,$r24,$r25,$r26,$r27,$r28,$r29,$r30,$r31,$r32,$r33,$r34,$r35,$r36,$r37,$r38,$r39,$r40,$r41,$r42,$r43,$r44,$r45,$r46,$r47,$r48,$r49,$r50,$r51,$r52,$r53,$r54,$r55,$r56,$r57,$r58,$r59,$r60,$r61,$r62,$r63,$r64,$r65,$r66,$r67;
.reg .pred $p1,$p2,$p3,$p4,$p5,$p6,$p7,$p8;
// ---- Guard: continue only when tid.x == 0, otherwise jump straight to exit.
cvt.u32.u16 $r1,%tid.x;
mov.s32 $r2,$r1;
$L012454D0:
mov.s32 $r3,0;
setp.eq.s32 $p1,$r2,$r3;
@$p1 bra $L0123DC18;
bra.uni $L_exit;
$L0123DC18:
// ---- $r7 = p_entering + 64: base address of the ticket array.
ld.param.s32 $r5,p_entering;
mov.s32 $r6,64;
add.s32 $r4,$r5,$r6;
mov.s32 $r7,$r4;
// ---- Read %physid and derive this block's slot index from it:
//        slot = ((physid >> 16) & 1) + (physid >> 19)
//      Both shifts are arithmetic (shr.s32).
// NOTE(review): %physid is not a documented PTX special register and the
// meaning of these bit fields is not visible here - presumably a hardware
// placement id; confirm. Nothing clamps slot to 0..15, the range the loops
// below scan.
mov.u32 $r8,%physid;
mov.s32 $r9,$r8;
$L0123E050:
mov.s32 $r10,$r9;
mov.s32 $r12,16;
shr.s32 $r11,$r10,$r12;
mov.s32 $r13,1;
and.b32 $r11,$r13,$r11;
mov.s32 $r15,19;
shr.s32 $r14,$r10,$r15;
add.s32 $r11,$r11,$r14;
mov.s32 $r16,$r11;
$L0123D8B8:
// ---- $r17 = slot. Raise our flag: word at [p_entering + slot*4] = 1.
mov.s32 $r17,$r16;
mov.s32 $r19,2;
shl.b32 $r18,$r17,$r19;
ld.param.s32 $r20,p_entering;
add.s32 $r18,$r20,$r18;
mov.s32 $r22,1;
mov.s32 $r21,$r22;
st.global.s32 [$r18],$r21;
// ---- $r23 = signed max of 0 and the ticket words at [$r7 + k*4], k = 0..15
//      ($r24 is the loop counter k).
mov.s32 $r23,0;
mov.s32 $r24,0;
bra.uni $L0123E638;
$L0123E710:
mov.s32 $r26,2;
shl.b32 $r25,$r24,$r26;
add.s32 $r25,$r7,$r25;
ld.global.s32 $r27,[$r25];
mov.s32 $r28,$r23;
max.s32 $r29,$r28,$r27;
mov.s32 $r30,$r29;
$L012423D0:
mov.s32 $r23,$r30;
add.s32 $r24,$r24,1;
$L0123E638:
mov.s32 $r31,15;
setp.le.s32 $p2,$r24,$r31;
@$p2 bra $L0123E710;
// ---- Our ticket = max + 1, kept in $r23 and stored to [$r7 + slot*4].
//      ($r32 receives the pre-increment value and is not read afterwards.)
mov.s32 $r32,$r23;
add.s32 $r23,$r23,1;
mov.s32 $r34,2;
shl.b32 $r33,$r17,$r34;
add.s32 $r33,$r7,$r33;
mov.s32 $r35,$r23;
st.global.s32 [$r33],$r35;
// ---- Lower our flag: word at [p_entering + slot*4] = 0.
mov.s32 $r37,2;
shl.b32 $r36,$r17,$r37;
ld.param.s32 $r38,p_entering;
add.s32 $r36,$r38,$r36;
mov.s32 $r40,0;
mov.s32 $r39,$r40;
st.global.s32 [$r36],$r39;
// ---- Wait loop over every slot j = 0..15 ($r41 = j).
mov.s32 $r41,0;
bra.uni $L0123FEB0;
$L0123FF88:
bra.uni $L01240498;
$L012403C0:
$L01240498:
// Spin (reloading from global memory each pass) while the flag word at
// [p_entering + j*4] is non-zero.
mov.s32 $r43,2;
shl.b32 $r42,$r41,$r43;
ld.param.s32 $r44,p_entering;
add.s32 $r42,$r44,$r42;
ld.global.s32 $r45,[$r42];
mov.s32 $r46,0;
setp.ne.s32 $p3,$r45,$r46;
@$p3 bra $L012403C0;
bra.uni $L01240C30;
$L01240B58:
$L01240C30:
// Spin while slot j holds a non-zero ticket that is ahead of ours:
//   ticket[j] != 0 && (ticket[j] < ours || (ticket[j] == ours && j < slot))
// The ticket word is reloaded from global memory for each of the three tests.
// Test 1: ticket[j] == 0 -> slot j is not contending, move on to the next j.
mov.s32 $r48,2;
shl.b32 $r47,$r41,$r48;
add.s32 $r47,$r7,$r47;
ld.global.s32 $r49,[$r47];
mov.s32 $r50,0;
setp.eq.s32 $p4,$r49,$r50;
@$p4 bra $L01240D08;
// Test 2: ticket[j] < ours -> keep waiting on slot j.
mov.s32 $r52,2;
shl.b32 $r51,$r41,$r52;
add.s32 $r51,$r7,$r51;
ld.global.s32 $r53,[$r51];
setp.lt.s32 $p5,$r53,$r23;
@$p5 bra $L01240B58;
// Test 3: ticket[j] != ours (so, given test 2, it is larger) -> next j.
mov.s32 $r55,2;
shl.b32 $r54,$r41,$r55;
add.s32 $r54,$r7,$r54;
ld.global.s32 $r56,[$r54];
setp.ne.s32 $p6,$r56,$r23;
@$p6 bra $L01240D08;
// Tie on ticket value: the lower slot index goes first, so wait if j < slot.
setp.lt.s32 $p7,$r41,$r17;
@$p7 bra $L01240B58;
$L01240D08:
add.s32 $r41,$r41,1;
$L0123FEB0:
mov.s32 $r57,15;
setp.le.s32 $p8,$r41,$r57;
@$p8 bra $L0123FF88;
// ---- Guarded section: word at [p_ret] += (ctaid.x + 1), done as a plain
//      load / add / store (no atomic instruction is used).
cvt.u32.u16 $r58,%ctaid.x;
mov.s32 $r59,$r58;
$L01245D40:
add.s32 $r59,$r59,1;
ld.param.s32 $r61,p_ret;
ld.param.s32 $r63,p_ret;
ld.global.s32 $r62,[$r63];
add.s32 $r60,$r62,$r59;
st.global.s32 [$r61],$r60;
// ---- Release: clear our ticket, word at [$r7 + slot*4] = 0.
mov.s32 $r65,2;
shl.b32 $r64,$r17,$r65;
add.s32 $r64,$r7,$r64;
mov.s32 $r67,0;
mov.s32 $r66,$r67;
st.global.s32 [$r64],$r66;
$L_exit:exit;
}
// Successfully implemented a block-wise atomic operation on the 8800 using Lamport's bakery algorithm and %physid.
// My guess appears to be correct.