These are the results using clock().
For S1070 (Tesla) with nvcc 3.2/cuobjdump 4.0 (‘nvcc -keep warpsync.cu’):
A[i] = 0 for i < b0
A[i] = 244576 for b0 <= i < b1
A[i] = 244468 for b1 <= i < b2
A[i] = 0 for b2 <= i
and
B[i] = 0 for i < b0
B[i] = 244770 for b0 <= i < b2
B[i] = 0 for b2 <= i
and the cuobjdump output:
code for sm_10
Function : _Z8warpsyncPViS0_iii
/*0000*/ /*0xa000000504000780*/ I2I.U32.U16 R1, R0L;
/*0008*/ /*0x3001d3fd6c20c7c8*/ ISET.S32.C0 o [0x7f], g [0x9], R1, LE;
/*0010*/ /*0xa000d00300000000*/ SSY 0x68;
/*0018*/ /*0x1000900300000280*/ BRA C0.NE, 0x48;
/*0020*/ /*0x3001d1fd6c20c7c8*/ ISET.S32.C0 o [0x7f], g [0x8], R1, LE;
/*0028*/ /*0x3000000300000500*/ RET C0.EQU;
/*0030*/ /*0x0000000160004780*/ S2R R0, SR1;
/*0038*/ /*0x30010001c4100780*/ SHL R0, R0, 0x1;
/*0040*/ /*0x1000d00300000780*/ BRA 0x68;
/*0048*/ /*0x3001d5fd6c2107c8*/ ISET.S32.C0 o [0x7f], g [0xa], R1, GT;
/*0050*/ /*0x3000000300000500*/ RET C0.EQU;
/*0058*/ /*0x0000000160004780*/ S2R R0, SR1;
/*0060*/ /*0x30010001c4100780*/ SHL R0, R0, 0x1;
/*0068*/ /*0x30020205c4100782*/ SHL.S R1, R1, 0x2;
/*0070*/ /*0x2000c80904204780*/ IADD R2, g [0x4], R1;
/*0078*/ /*0xd00e0401a0c00780*/ GST.U32 global14 [R2], R0;
/*0080*/ /*0x0000000160004780*/ S2R R0, SR1;
/*0088*/ /*0x30010001c4100780*/ SHL R0, R0, 0x1;
/*0090*/ /*0x2000cc0504204780*/ IADD R1, g [0x6], R1;
/*0098*/ /*0xd00e0201a0c00781*/ GST.U32 global14 [R1], R0;
For GTX 480 (Fermi) with nvcc 4.0 (‘nvcc -keep -arch sm_20 warpsync.cu’):
A[i] = 0 for i < b0
A[i] = 83798724 for b0 <= i < b1
A[i] = 83798444 for b1 <= i < b2
A[i] = 0 for b2 <= i
and
B[i] = 0 for i < b0
B[i] = 83798874 for b0 <= i < b1
B[i] = 83798562 for b1 <= i < b2
B[i] = 0 for b2 <= i
and the cuobjdump output:
code for sm_20
Function : _Z8warpsyncPViS0_iii
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x84009c042c000000*/ S2R R2, SR_Tid_X;
/*0010*/ /*0xd021dc231b0e4000*/ ISETP.GE.AND P0, pt, R2, c [0x0] [0x34], pt;
/*0018*/ /*0xa00001e740000000*/ @P0 BRA 0x48;
/*0020*/ /*0xc021dc231b0e4000*/ ISETP.GE.AND P0, pt, R2, c [0x0] [0x30], pt;
/*0028*/ /*0x000021e780000000*/ @!P0 EXIT;
/*0030*/ /*0x40001c042c000001*/ S2R R0, SR_ClockLo;
/*0038*/ /*0x04001e036000c000*/ SHL R0, R0, 0x1;
/*0040*/ /*0x80001de740000000*/ BRA 0x68;
/*0048*/ /*0xe021dc23188e4000*/ ISETP.LT.AND P0, pt, R2, c [0x0] [0x38], pt;
/*0050*/ /*0x000021e780000000*/ @!P0 EXIT;
/*0058*/ /*0x40001c042c000001*/ S2R R0, SR_ClockLo;
/*0060*/ /*0x04001e036000c000*/ SHL R0, R0, 0x1;
/*0068*/ /*0x0820de036000c000*/ SHL R3, R2, 0x2;
/*0070*/ /*0x10209ce35000c000*/ IMUL.HI R2, R2, 0x4;
/*0078*/ /*0x80311c0348014000*/ IADD R4.CC, R3, c [0x0] [0x20];
/*0080*/ /*0x90215c4348004000*/ IADD.X R5, R2, c [0x0] [0x24];
/*0088*/ /*0x00401f8594000000*/ ST.E.WT [R4], R0;
/*0090*/ /*0x40001c042c000001*/ S2R R0, SR_ClockLo;
/*0098*/ /*0x04001e036000c000*/ SHL R0, R0, 0x1;
/*00a0*/ /*0xa0311c0348014000*/ IADD R4.CC, R3, c [0x0] [0x28];
/*00a8*/ /*0xb0215c4348004000*/ IADD.X R5, R2, c [0x0] [0x2c];
/*00b0*/ /*0x00401f8594000000*/ ST.E.WT [R4], R0;
/*00b8*/ /*0x00001de780000000*/ EXIT;
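The threads don’t seem to reconverge on Fermi: there B still holds two different values (one for b0 <= i < b1 and another for b1 <= i < b2), whereas on Tesla B holds a single value across b0 <= i < b2. This is expected given the SASS dump: the sm_10 code contains an SSY 0x68 instruction ahead of the divergent branch, while the sm_20 code contains no SSY at all. However, it’s still intriguing why it doesn’t fail with the tids.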