That is good to know.
The SASS makes me think that the bad blockIdx.x values I see are real.
I used this method to see the SASS with interleaved C++ source code in Visual Studio:
PTX_SASS_Assembly_Debugging.htm
The PrintIfBadBlockIdx() excerpt is appended below. It looks like gridDim.x is obtained from constant memory and blockIdx.x from SR_CTAID.X. The printf() call even refetches the values rather than use the values stored earlier in R21 and R22. I can single-step the SASS in Visual Studio, but I am unable to see GPU register values.
Based on the SASS information, and much more debugging, it now appears that the virtual function call had nothing to do with the issue. Further it appears that CDP fire-and-forget launches may be at the root of the problem.
I have managed to make a standalone program that exhibits the same bad behavior as my full program, but only when I run the program in nvvp. In contrast, my full program shows the bad behavior regardless of whether I use nvvp. I will keep working on it and post the program once I have more reliable results or a better idea of what is going on.
Thanks for your help njuffa,
__device__ __noinline__ bool PrintIfBadBlockIdx( const char* AppendStr = "", bool AlwaysPrint = false ) {
0x00402588 IADD32I R1, R1, -0x60
0x00402590 S2R R0, SR_LMEMHIOFF
0x00402598 ISETP.GE.U32.AND P0, PT, R1, R0, PT
0x004025a8 @P0 BRA 0x4025b8
0x004025b0 BPT.TRAP 0x1
0x004025b8 STL [R1+0x58], R29
0x004025c8 STL [R1+0x54], R28
0x004025d0 STL [R1+0x50], R27
0x004025d8 STL [R1+0x4c], R26
0x004025e8 STL [R1+0x48], R25
0x004025f0 STL [R1+0x44], R24
0x004025f8 STL [R1+0x40], R23
0x00402608 STL [R1+0x3c], R22
0x00402610 STL [R1+0x38], R21
0x00402618 STL [R1+0x34], R20
0x00402628 STL [R1+0x30], R19
0x00402630 STL [R1+0x2c], R18
0x00402638 STL [R1+0x28], R17
0x00402648 STL [R1+0x24], R16
0x00402650 STL [R1+0x20], R2
0x00402658 IADD R0, R1, RZ
0x00402668 I2I.U32.U32 R0, R0
0x00402670 MOV R8, R0
0x00402678 MOV R9, RZ
0x00402688 MOV R7, R8
0x00402690 MOV R8, R9
0x00402698 MOV R7, R7
0x004026a8 MOV R8, R8
0x004026b0 MOV R0, c[0x0][0x4]
0x004026b8 MOV R3, c[0x0][0x104]
0x004026c8 IADD R0.CC, R7, R0
0x004026d0 IADD.X R3, R8, R3
0x004026d8 BFE R6, R6, 0x800
0x004026e8 I2I.S16.S8 R6, R6
0x004026f0 MOV R4, R4
0x004026f8 MOV R5, R5
0x00402708 MOV R16, R0
0x00402710 MOV R2, R3
0x00402718 XMAD.PSL.CLO R25, R25.H1, 0x1, R6
0x00402728 MOV R28, R4
0x00402730 MOV R29, R5
const uint32_t N_Blocks = gridDim.x;
0x00402738 MOV R0, c[0x0][0x14]
0x00402748 MOV R0, R0
0x00402750 MOV R21, R0
const uint32_t BlkIndex = blockIdx.x;
0x00402758 S2R R0, SR_CTAID.X
0x00402768 MOV R0, R0
0x00402770 MOV R0, R0
0x00402778 MOV R22, R0
const uint32_t ThrIndex = threadIdx.x;
0x00402788 S2R R0, SR_TID.X
0x00402790 MOV R0, R0
0x00402798 MOV R0, R0
0x004027a8 MOV R23, R0
const bool LeadThr = ThrIndex == 0;
0x004027b0 ISETP.EQ.AND P0, PT, R23, RZ, PT
0x004027b8 SEL R0, RZ, 0x1, !P0
0x004027c8 XMAD.PSL.CLO R24, R24.H1, 0x1, R0
bool BadIdx = false; // all threads
0x004027d0 XMAD.PSL.CLO R0, R0.H1, 0x1, RZ
0x004027d8 MOV R0, R0
0x004027e8 MOV R0, R0
if ( LeadThr ) {
0x004027f0 I2I.S32.S16 R3, RZ
0x004027f8 I2I.S32.S16 R4, R24
0x00402808 ISETP.NE.AND P0, PT, R4, R3, PT
0x00402810 PSETP.AND.AND P0, PT, !P0, PT, PT
0x00402818 MOV R0, R0
0x00402828 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402830 XMAD.PSL.CLO R0, R0, 0x1, R0
0x00402838 SSY 0x402cf8
0x00402848 @P0 SYNC
0x00402850 BRA 0x402858
BadIdx = BlkIndex >= N_Blocks; // LeadThr only
0x00402858 ISETP.GE.U32.AND P0, PT, R22, R21, PT
0x00402868 SEL R3, RZ, 0x1, !P0
0x00402870 SEL R4, RZ, 0x1, !P0
0x00402878 XMAD.PSL.CLO R20, R20.H1, 0x1, R3
0x00402888 XMAD.PSL.CLO R19, R19.H1, 0x1, R4
const char* BadStr = BadIdx ? " <<< !!!" : "";
0x00402890 I2I.S32.S16 R0, RZ
0x00402898 I2I.S32.S16 R3, R19
0x004028a8 ISETP.NE.AND P0, PT, R3, R0, PT
0x004028b0 PSETP.AND.AND P0, PT, !P0, PT, PT
0x004028b8 @P0 BRA 0x402940
0x004028c8 BRA 0x4028d0
0x004028d0 MOV32I R4, 0x0
0x004028d8 MOV32I R5, 0x0
0x004028e8 MOV R4, R4
0x004028f0 MOV R5, R5
0x004028f8 MOV R4, R4
0x00402908 MOV R5, R5
0x00402910 MOV R4, R4
0x00402918 MOV R5, R5
0x00402928 MOV R4, R4
0x00402930 MOV R5, R5
0x00402938 BRA 0x4029b8
0x00402948 MOV32I R4, 0x0
0x00402950 MOV32I R5, 0x0
0x00402958 MOV R4, R4
0x00402968 MOV R5, R5
0x00402970 MOV R4, R4
0x00402978 MOV R5, R5
0x00402988 MOV R4, R4
0x00402990 MOV R5, R5
0x00402998 MOV R4, R4
0x004029a8 MOV R5, R5
0x004029b0 BRA 0x4029b8
0x004029b8 MOV R4, R4
0x004029c8 MOV R5, R5
0x004029d0 MOV R26, R4
0x004029d8 MOV R27, R5
if ( BadIdx || AlwaysPrint ) {
0x004029e8 LOP32I.AND R0, R20, 0xff
0x004029f0 I2I.S32.S16 R3, RZ
0x004029f8 I2I.S32.S16 R0, R0
0x00402a08 ISETP.NE.AND P1, PT, R0, R3, PT
0x00402a10 PSETP.AND.AND P0, PT, PT, PT, PT
0x00402a18 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402a28 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402a30 @P1 BRA 0x402a90
0x00402a38 BRA 0x402a40
0x00402a48 LOP32I.AND R0, R25, 0xff
0x00402a50 I2I.S32.S16 R3, RZ
0x00402a58 I2I.S32.S16 R0, R0
0x00402a68 ISETP.NE.AND P0, PT, R0, R3, PT
0x00402a70 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402a78 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402a88 BRA 0x402a90
0x00402a90 PSETP.AND.AND P0, PT, P0, PT, PT
0x00402a98 PSETP.AND.AND P0, PT, !P0, PT, PT
0x00402aa8 SSY 0x402cd0
0x00402ab0 @P0 SYNC
0x00402ab8 BRA 0x402ac0
printf( "\n ***** Block %3u of %3u, SM = %3u ***** %s %s", blockIdx.x, gridDim.x, GetSM_ID(), AppendStr, BadStr );
0x00402ac8 S2R R0, SR_CTAID.X
0x00402ad0 MOV R0, R0
0x00402ad8 MOV R3, c[0x0][0x14]
0x00402ae8 MOV R18, R0
0x00402af0 MOV R17, R3
0x00402af8 JCAL 0x0
0x00402b08 MOV R4, R4
0x00402b10 IADD R7.CC, R16, RZ
0x00402b18 IADD.X R0, R2, RZ
0x00402b28 LEA R6.CC, R7, RZ
0x00402b30 LEA.HI.X P0, R7, R7, RZ, R0
0x00402b38 ST.E [R6], R18, P0
0x00402b48 IADD32I R7.CC, R16, 0x4
0x00402b50 IADD.X R0, R2, RZ
0x00402b58 LEA R6.CC, R7, RZ
0x00402b68 LEA.HI.X P0, R7, R7, RZ, R0
0x00402b70 ST.E [R6], R17, P0
0x00402b78 IADD32I R7.CC, R16, 0x8
0x00402b88 IADD.X R0, R2, RZ
0x00402b90 LEA R6.CC, R7, RZ
0x00402b98 LEA.HI.X P0, R7, R7, RZ, R0
0x00402ba8 ST.E [R6], R4, P0
0x00402bb0 IADD32I R5.CC, R16, 0x10
0x00402bb8 IADD.X R0, R2, RZ
0x00402bc8 LEA R4.CC, R5, RZ
0x00402bd0 LEA.HI.X P0, R5, R5, RZ, R0
0x00402bd8 ST.E.64 [R4], R28, P0
0x00402be8 IADD32I R5.CC, R16, 0x18
0x00402bf0 IADD.X R0, R2, RZ
0x00402bf8 LEA R4.CC, R5, RZ
0x00402c08 LEA.HI.X P0, R5, R5, RZ, R0
0x00402c10 ST.E.64 [R4], R26, P0
0x00402c18 MOV32I R4, 0x0
0x00402c28 MOV32I R5, 0x0
0x00402c30 MOV R4, R4
0x00402c38 MOV R5, R5
0x00402c48 MOV R0, R4
0x00402c50 MOV R5, R5
0x00402c58 MOV R4, R0
0x00402c68 MOV R5, R5
0x00402c70 IADD R6.CC, R16, RZ
0x00402c78 IADD.X R7, R2, RZ
0x00402c88 MOV R4, R4
0x00402c90 MOV R5, R5
0x00402c98 MOV R6, R6
0x00402ca8 MOV R7, R7
0x00402cb0 JCAL 0x0
0x00402cb8 SYNC
0x00402cc8 SYNC
0x00402cd0 XMAD.PSL.CLO R0, R20, 0x1, R0
0x00402cd8 MOV R0, R0
0x00402ce8 SYNC
0x00402cf0 SYNC
return BadIdx; // threads other than thread 0 always return false
0x00402cf8 MOV R0, R0
0x00402d08 MOV R0, R0
0x00402d10 XMAD.PSL.CHI R0, R0.H1, 0x1, R0
0x00402d18 I2I.S16.S8 R3, R0
0x00402d28 MOV R3, R3
0x00402d30 MOV R3, R3
0x00402d38 I2I.U32.U16 R3, R3
0x00402d48 MOV R3, R3
0x00402d50 I2I.S32.S8 R3, R3
0x00402d58 MOV R3, R3
0x00402d68 MOV R4, R3
0x00402d70 BRA 0x402d78
0x00402d78 LDL R2, [R1+0x20]
0x00402d88 LDL R16, [R1+0x24]
0x00402d90 LDL R17, [R1+0x28]
0x00402d98 LDL R18, [R1+0x2c]
0x00402da8 LDL R19, [R1+0x30]
0x00402db0 LDL R20, [R1+0x34]
0x00402db8 LDL R21, [R1+0x38]
0x00402dc8 LDL R22, [R1+0x3c]
0x00402dd0 LDL R23, [R1+0x40]
0x00402dd8 LDL R24, [R1+0x44]
0x00402de8 LDL R25, [R1+0x48]
0x00402df0 LDL R26, [R1+0x4c]
0x00402df8 LDL R27, [R1+0x50]
0x00402e08 LDL R28, [R1+0x54]
0x00402e10 LDL R29, [R1+0x58]
0x00402e18 IADD32I R1, R1, 0x60
0x00402e28 RET
0x00402e30 BRA 0x402e30
0x00402e38 NOP