hi.
ptxas seems to always generate a branch when compiling comditional atomicAdd().
__global__ void f(int *p)
{
__shared__ int x;
int idx = threadIdx.x;
int r = 0;
for (int i = 0; i < 4; i++) {
int n = 42;
if (idx % 2 == 1) {
n = atomicAdd(&x, idx); // conditionaly execute atomicAdd()
}
r ^= n;
}
*p = r;
}
SASS
code for sm_61
Function : _Z1fPi
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001c7c00fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ { MOV32I R2, 0x2a; /* 0x0100000002a7f002 */
/*0018*/ S2R R5, SR_TID.X; } /* 0xf0c8000002170005 */
/* 0x001f9800fec00ff0 */
/*0028*/ { LEA.HI R0, R5, R5, RZ, 0x1; /* 0x5bdf7f8010570500 */
/*0030*/ SSY 0x80; } /* 0xe290000004800000 */
/*0038*/ LOP32I.AND R0, R0, 0xfffffffe; /* 0x040fffffffe70000 */
/* 0x001fc000fda007f6 */
/*0048*/ IADD R0, -R0, R5; /* 0x5c12000000570000 */
/*0050*/ ISETP.NE.AND P0, PT, R0, 0x1, PT; /* 0x366b038000170007 */
/*0058*/ { MOV32I R0, 0x2a; /* 0x0100000002a7f000 */
/*0068*/ @P0 SYNC; } /* 0x001ff400022007fd */
/* 0xf0f800000000000f */
/*0070*/ ATOMS.ADD R2, [RZ], R5; /* 0xec0000000057ff02 */
/*0078*/ SYNC; /* 0xf0f800000007000f */
/* 0x001ff400fe200ff0 */
/*0088*/ { MOV32I R3, 0x2a; /* 0x0100000002a7f003 */
/*0090*/ SSY 0xb8; } /* 0xe290000002000000 */
/*0098*/ @P0 SYNC; /* 0xf0f800000000000f */
/* 0x003fc000ffa00011 */
/*00a8*/ ATOMS.ADD R3, [RZ], R5; /* 0xec0000000057ff03 */
/*00b0*/ SYNC; /* 0xf0f800000007000f */
/*00b8*/ { LOP.XOR R3, R3, R2; /* 0x5c47040000270303 */
/*00c8*/ SSY 0xf8; } /* 0x001ff400fe0007f1 */
/* 0xe290000002800000 */
/*00d0*/ { MOV32I R2, 0x2a; /* 0x0100000002a7f002 */
/*00d8*/ @P0 SYNC; } /* 0xf0f800000000000f */
/* 0x003fc000ffa00011 */
/*00e8*/ ATOMS.ADD R2, [RZ], R5; /* 0xec0000000057ff02 */
/*00f0*/ SYNC; /* 0xf0f800000007000f */
/*00f8*/ { LOP.XOR R2, R2, R3; /* 0x5c47040000370202 */
/*0108*/ SSY 0x130; } /* 0x00004400ffa007f1 */
/* 0xe290000002000000 */
/*0110*/ @P0 SYNC; /* 0xf0f800000000000f */
/*0118*/ ATOMS.ADD R0, [RZ], R5; /* 0xec0000000057ff00 */
/* 0x001fc401fe2007fd */
/*0128*/ SYNC; /* 0xf0f800000007000f */
/*0130*/ LOP.XOR R0, R0, R2; /* 0x5c47040000270000 */
/*0138*/ MOV R2, c[0x0][0x140]; /* 0x4c98078005070002 */
/* 0x001ffc00fe2007f2 */
/*0148*/ MOV R3, c[0x0][0x144]; /* 0x4c98078005170003 */
/*0150*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0158*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0168*/ BRA 0x160; /* 0xe2400fffff07000f */
/*0170*/ NOP; /* 0x50b0000000070f00 */
/*0178*/ NOP; /* 0x50b0000000070f00 */
in this example, four branches were generated and each branch skips only one instruction.
I tried similar codes many times to get predicated instruction, but ptxas always generates branches for condtional atomicAdd.
Are these branches optimal? Or, is ptxas generating a bad code?