Hello experts,
I have a question about the synchronization between a load instruction and the instruction that consumes the loaded register, within a single thread. I found that in some cases there is no dependency barrier, while in other cases the compiler inserts a DEPBAR.LE instruction. Could someone help explain this? Thank you.
Does the hardware check register dependencies and guarantee correctness on its own?
When does the compiler need to insert an explicit DEPBAR instruction?
case_1: no DEPBAR between LDG and IADD instructions:
code for sm_52
Function : _Z6vecAddPiS_S_i
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001cfc00e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0xf0c8000002570000 */
/*0018*/ S2R R2, SR_TID.X ; /* 0xf0c8000002170002 */
/* 0x001fd842fec20ff1 */
/*0028*/ XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ; /* 0x4f107f8000270003 */
/*0030*/ XMAD R2, R0.reuse, c[0x0] [0x8], R2 ; /* 0x4e00010000270002 */
/*0038*/ XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ; /* 0x5b30011800370000 */
/* 0x001ff400fd4007ed */
/*0048*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT ; /* 0x4b6d038005670007 */
/*0050*/ NOP ; /* 0x50b0000000070f00 */
/*0058*/ @P0 EXIT ; /* 0xe30000000000000f */
/* 0x081fd800fea207f1 */
/*0068*/ SHL R6, R0.reuse, 0x2 ; /* 0x3848000000270006 */
/*0070*/ SHR R0, R0, 0x1e ; /* 0x3829000001e70000 */
/*0078*/ IADD R4.CC, R6.reuse, c[0x0][0x140] ; /* 0x4c10800005070604 */
/* 0x001fd800fe0207f2 */
/*0088*/ IADD.X R5, R0.reuse, c[0x0][0x144] ; /* 0x4c10080005170005 */
/*0090*/ { IADD R2.CC, R6, c[0x0][0x148] ; /* 0x4c10800005270602 */
/*0098*/ LDG.E R4, [R4] }
/* 0xeed4200000070404 */
/* 0x001fd800f62007e2 */
/*00a8*/ IADD.X R3, R0, c[0x0][0x14c] ; /* 0x4c10080005370003 */
/*00b0*/ LDG.E R2, [R2] ; /* 0xeed4200000070202 */
/*00b8*/ IADD R6.CC, R6, c[0x0][0x150] ; /* 0x4c10800005470606 */
/* 0x001fc420fe4007f7 */
/*00c8*/ IADD.X R7, R0, c[0x0][0x154] ; /* 0x4c10080005570007 */
/*00d0*/ IADD R0, R2, R4 ; /* 0x5c10000000470200 */
/*00d8*/ STG.E [R6], R0 ; /* 0xeedc200000070600 */
/* 0x001ffc00ffe007ea */
/*00e8*/ NOP ; /* 0x50b0000000070f00 */
/*00f0*/ EXIT ; /* 0xe30000000007000f */
/*00f8*/ BRA 0xf8 ; /* 0xe2400fffff87000f */
case_2: DEPBAR.LE is inserted into the SASS instructions:
code for sm_52
Function : _Z8vadd_intPiPKiS_ii
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0018*/ S2R R19, SR_CTAID.X ; /* 0xf0c8000002570013 */
/* 0x001fc400fe2007f2 */
/*0028*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/*0030*/ LDG.E.CI R0, [R2] ; /* 0xeed4a00000070200 */
/*0038*/ LDG.E.CI R5, [R2+0x4] ; /* 0xeed4a00000470205 */
/* 0x001ec400fe2007b1 */
/*0048*/ LDG.E.CI R6, [R2+0x8] ; /* 0xeed4a00000870206 */
/*0050*/ LDG.E.CI R4, [R2+0xc] ; /* 0xeed4a00000c70204 */
/*0058*/ LDG.E.CI R7, [R2+0x10] ; /* 0xeed4a00001070207 */
/* 0x001fc400f62007f1 */
/*0068*/ LDG.E.CI R8, [R2+0x14] ; /* 0xeed4a00001470208 */
/*0070*/ LDG.E.CI R9, [R2+0x18] ; /* 0xeed4a00001870209 */
/*0078*/ LDG.E.CI R10, [R2+0x1c] ; /* 0xeed4a00001c7020a */
/* 0x001ec400fe2007b1 */
/*0088*/ LDG.E.CI R11, [R2+0x20] ; /* 0xeed4a0000207020b */
/*0090*/ LDG.E.CI R12, [R2+0x24] ; /* 0xeed4a0000247020c */
/*0098*/ LDG.E.CI R13, [R2+0x28] ; /* 0xeed4a0000287020d */
/* 0x001fc400f62007f1 */
/*00a8*/ LDG.E.CI R14, [R2+0x2c] ; /* 0xeed4a00002c7020e */
/*00b0*/ LDG.E.CI R15, [R2+0x30] ; /* 0xeed4a0000307020f */
/*00b8*/ LDG.E.CI R16, [R2+0x34] ; /* 0xeed4a00003470210 */
/* 0x001fcc00362007b1 */
/*00c8*/ LDG.E.CI R17, [R2+0x38] ; /* 0xeed4a00003870211 */
/*00d0*/ LDG.E.CI R18, [R2+0x3c] ; /* 0xeed4a00003c70212 */
/*00d8*/ DEPBAR.LE SB5, 0x6 ; /* 0xf0f0000034670000 */
/* 0x001f8400eac007f0 */
/*00e8*/ IADD3 R5, R6, R5, R0 ; /* 0x5cc0000000570605 */
/*00f0*/ S2R R0, SR_TID.X ; /* 0xf0c8000002170000 */
/*00f8*/ IADD3 R7, R7, R4, R5 ; /* 0x5cc0028000470707 */
/* 0x001fd800fea20ff0 */
/*0108*/ XMAD.MRG R5, R19.reuse, c[0x0] [0x8].H1, RZ ; /* 0x4f107f8000271305 */
/*0110*/ DEPBAR.LE SB5, 0x4 ; /* 0xf0f0000034470000 */
/*0118*/ IADD3 R8, R9, R8, R7 ; /* 0x5cc0038000870908 */
/* 0x001fc400fec007f0 */
/*0128*/ IADD3 R10, R11, R10, R8 ; /* 0x5cc0040000a70b0a */
/*0130*/ DEPBAR.LE SB5, 0x2 ; /* 0xf0f0000034270000 */
/*0138*/ IADD3 R12, R13, R12, R10 ; /* 0x5cc0050000c70d0c */
/* 0x001fd400fc2027f5 */