predicated ATOMS.ADD instruction never generated by ptxas

hi.

ptxas seems to always generate a branch when compiling comditional atomicAdd().

__global__ void f(int *p)
{
    __shared__ int x;
    int idx = threadIdx.x;
    int r = 0;
    for (int i = 0; i < 4; i++) {
        int n = 42;
        if (idx % 2 == 1) {
            n = atomicAdd(&x, idx); // conditionaly execute atomicAdd()
        }
        r ^= n;
    }
    *p = r;
}

SASS

code for sm_61
		Function : _Z1fPi
	.headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                               /* 0x001c7c00fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                       /* 0x4c98078000870001 */
        /*0010*/         {         MOV32I R2, 0x2a;                            /* 0x0100000002a7f002 */
        /*0018*/                   S2R R5, SR_TID.X;        }                  /* 0xf0c8000002170005 */
                                                                               /* 0x001f9800fec00ff0 */
        /*0028*/         {         LEA.HI R0, R5, R5, RZ, 0x1;                 /* 0x5bdf7f8010570500 */
        /*0030*/                   SSY 0x80;        }                          /* 0xe290000004800000 */
        /*0038*/                   LOP32I.AND R0, R0, 0xfffffffe;              /* 0x040fffffffe70000 */
                                                                               /* 0x001fc000fda007f6 */
        /*0048*/                   IADD R0, -R0, R5;                           /* 0x5c12000000570000 */
        /*0050*/                   ISETP.NE.AND P0, PT, R0, 0x1, PT;           /* 0x366b038000170007 */
        /*0058*/         {         MOV32I R0, 0x2a;                            /* 0x0100000002a7f000 */
        /*0068*/               @P0 SYNC;        }                              /* 0x001ff400022007fd */
                                                                               /* 0xf0f800000000000f */
        /*0070*/                   ATOMS.ADD R2, [RZ], R5;                     /* 0xec0000000057ff02 */
        /*0078*/                   SYNC;                                       /* 0xf0f800000007000f */
                                                                               /* 0x001ff400fe200ff0 */
        /*0088*/         {         MOV32I R3, 0x2a;                            /* 0x0100000002a7f003 */
        /*0090*/                   SSY 0xb8;        }                          /* 0xe290000002000000 */
        /*0098*/               @P0 SYNC;                                       /* 0xf0f800000000000f */
                                                                               /* 0x003fc000ffa00011 */
        /*00a8*/                   ATOMS.ADD R3, [RZ], R5;                     /* 0xec0000000057ff03 */
        /*00b0*/                   SYNC;                                       /* 0xf0f800000007000f */
        /*00b8*/         {         LOP.XOR R3, R3, R2;                         /* 0x5c47040000270303 */
        /*00c8*/                   SSY 0xf8;        }                          /* 0x001ff400fe0007f1 */
                                                                               /* 0xe290000002800000 */
        /*00d0*/         {         MOV32I R2, 0x2a;                            /* 0x0100000002a7f002 */
        /*00d8*/               @P0 SYNC;        }                              /* 0xf0f800000000000f */
                                                                               /* 0x003fc000ffa00011 */
        /*00e8*/                   ATOMS.ADD R2, [RZ], R5;                     /* 0xec0000000057ff02 */
        /*00f0*/                   SYNC;                                       /* 0xf0f800000007000f */
        /*00f8*/         {         LOP.XOR R2, R2, R3;                         /* 0x5c47040000370202 */
        /*0108*/                   SSY 0x130;        }                         /* 0x00004400ffa007f1 */
                                                                               /* 0xe290000002000000 */
        /*0110*/               @P0 SYNC;                                       /* 0xf0f800000000000f */
        /*0118*/                   ATOMS.ADD R0, [RZ], R5;                     /* 0xec0000000057ff00 */
                                                                               /* 0x001fc401fe2007fd */
        /*0128*/                   SYNC;                                       /* 0xf0f800000007000f */
        /*0130*/                   LOP.XOR R0, R0, R2;                         /* 0x5c47040000270000 */
        /*0138*/                   MOV R2, c[0x0][0x140];                      /* 0x4c98078005070002 */
                                                                               /* 0x001ffc00fe2007f2 */
        /*0148*/                   MOV R3, c[0x0][0x144];                      /* 0x4c98078005170003 */
        /*0150*/                   STG.E [R2], R0;                             /* 0xeedc200000070200 */
        /*0158*/                   EXIT;                                       /* 0xe30000000007000f */
                                                                               /* 0x001f8000fc0007ff */
        /*0168*/                   BRA 0x160;                                  /* 0xe2400fffff07000f */
        /*0170*/                   NOP;                                        /* 0x50b0000000070f00 */
        /*0178*/                   NOP;                                        /* 0x50b0000000070f00 */

in this example, four branches were generated and each branch skips only one instruction.
I tried similar codes many times to get predicated instruction, but ptxas always generates branches for condtional atomicAdd.

Are these branches optimal? Or, is ptxas generating a bad code?