Hello,
I have written this trivial kernel:
__global__ void ktestRecipr(float *buffer, int terms){
int term_m_1 = blockIdx.x * blockDim.x + threadIdx.x;
float term_dbl = 2.0f*(float)term_m_1;
buffer[term_m_1] = 1.0f/(1.0f+term_dbl);
}
The problem is that for CC 2.0 it translates to a huge stream of instructions, where for CC1.1 the instruction stream is normal (short). Thus code for CC 2.0 is not efficient and I don’t know what causes it. It seems as a fault of the reciprocal operation as removing it turns the instruction stream to a very short one (i.e. buffer[term_m_1] = 1.0f+term_dbl; )
The instruction stream as of cuobjbin output is:
Function : _Z11ktestReciprPfi
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x94001c042c000000*/ S2R R0, SR_CTAid_X;
/*0010*/ /*0x84009c042c000000*/ S2R R2, SR_Tid_X;
/*0018*/ /*0x20001c0320044000*/ IMAD.U32.U32 R0, R0, c [0x0] [0x8], R2;
/*0020*/ /*0x01209e0418000000*/ I2F.F32.S32 R2, R0;
/*0028*/ /*0x08209c0050000000*/ FADD R2, R2, R2;
/*0030*/ /*0x00211c005000cfe0*/ FADD R4, R2, 0x3f800;
/*0038*/ /*0x6001000750000000*/ CAL 0x58;
/*0040*/ /*0x80001c4340004000*/ ISCADD R0, R0, c [0x0] [0x20], 0x2;
/*0048*/ /*0x00011c8590000000*/ ST [R0], R4;
/*0050*/ /*0x00001de780000000*/ EXIT;
/*0058*/ /*0x04409c036000c000*/ SHL.W R2, R4, 0x1;
/*0060*/ /*0x60209c035800c000*/ SHR.U32.W R2, R2, 0x18;
/*0068*/ /*0x0c20dc034800c000*/ IADD R3, R2, 0x3;
/*0070*/ /*0x0c20dc041c000000*/ I2I.U32.U8 R3, R3;
/*0078*/ /*0x0c31dc03198ec000*/ ISETP.LE.U32.AND P0, pt, R3, 0x3, pt;
/*0080*/ /*0x800001e740000000*/ @P0 BRA 0xa8;
/*0088*/ /*0x10409c00c8000000*/ MUFU.RCP R2, R4;
/*0090*/ /*0x0440dc0030048000*/ FFMA R3, R4, R2, c [0x10] [0x0];
/*0098*/ /*0x0c211e0030040000*/ FFMA R4, R2, -R3, R2;
/*00a0*/ /*0x60001de740000007*/ BRA 0x280;
/*00a8*/ /*0xfc21dc031a8e0000*/ ISETP.NE.U32.AND P0, pt, R2, RZ, pt;
/*00b0*/ /*0x000001e740000003*/ @P0 BRA 0x178;
/*00b8*/ /*0xfc409c023801ffff*/ LOP32I.AND R2, R4, 0x7fffff;
/*00c0*/ /*0x0800dc0378000000*/ FLO.U32 R3, R2;
/*00c8*/ /*0x7c30de034800c000*/ IADD R3, -R3, 0x1f;
/*00d0*/ /*0xdc315c034800ffff*/ IADD R5, R3, 0xffff7;
/*00d8*/ /*0x0451dc03198ec000*/ ISETP.LE.U32.AND P0, pt, R5, 0x1, pt;
/*00e0*/ /*0x400001e740000000*/ @P0 BRA 0xf8;
/*00e8*/ /*0x10411c00c8000000*/ MUFU.RCP R4, R4;
/*00f0*/ /*0x20001de740000006*/ BRA 0x280;
/*00f8*/ /*0xe0315c034800ffff*/ IADD R5, R3, 0xffff8;
/*0100*/ /*0x14209c0360000000*/ SHL.W R2, R2, R5;
/*0108*/ /*0xfc209c023801ffff*/ LOP32I.AND R2, R2, 0x7fffff;
/*0110*/ /*0x00209c4238fe0000*/ LOP32I.OR R2, R2, 0x3f800000;
/*0118*/ /*0x10215c00c8000000*/ MUFU.RCP R5, R2;
/*0120*/ /*0x04209c00300a8000*/ FFMA R2, R2, R5, c [0x10] [0x0];
/*0128*/ /*0x08509e00300a0000*/ FFMA R2, R5, -R2, R5;
/*0130*/ /*0xfc209c023801ffff*/ LOP32I.AND R2, R2, 0x7fffff;
/*0138*/ /*0xfc215c03110e0000*/ ISET.EQ.U32.AND R5, R2, RZ, pt;
/*0140*/ /*0x1430dd0348000000*/ IADD R3, R3, -R5;
/*0148*/ /*0xd0315c034800c003*/ IADD R5, R3, 0xf4;
/*0150*/ /*0x0040dc023a000000*/ LOP32I.AND R3, R4, -0x80000000;
/*0158*/ /*0x5c511c036000c000*/ SHL.W R4, R5, 0x17;
/*0160*/ /*0x1030dc4368000000*/ LOP.OR R3, R3, R4;
/*0168*/ /*0x0c211c4368000000*/ LOP.OR R4, R2, R3;
/*0170*/ /*0x20001de740000004*/ BRA 0x280;
/*0178*/ /*0x0c215c034800fffc*/ IADD R5, R2, 0xfff03;
/*0180*/ /*0x0451dc031a0ec000*/ ISETP.GT.U32.AND P0, pt, R5, 0x1, pt;
/*0188*/ /*0xa00001e740000003*/ @P0 BRA 0x278;
/*0190*/ /*0xfc40dc023801ffff*/ LOP32I.AND R3, R4, 0x7fffff;
/*0198*/ /*0x0c029de218000000*/ MOV32I R10, 0x3;
/*01a0*/ /*0x00411c023a000000*/ LOP32I.AND R4, R4, -0x80000000;
/*01a8*/ /*0x00319c4238fe0000*/ LOP32I.OR R6, R3, 0x3f800000;
/*01b0*/ /*0x14a29c0360000000*/ SHL.W R10, R10, R5;
/*01b8*/ /*0x1061dc00c8000000*/ MUFU.RCP R7, R6;
/*01c0*/ /*0x04619c00300e8000*/ FFMA R6, R6, R7, c [0x10] [0x0];
/*01c8*/ /*0x18721e00308e0000*/ FFMA.RM R8, R7, -R6, R7;
/*01d0*/ /*0x18719e00310e0000*/ FFMA.RP R6, R7, -R6, R7;
/*01d8*/ /*0xfc825c023801ffff*/ LOP32I.AND R9, R8, 0x7fffff;
/*01e0*/ /*0x1881dc002e8e0000*/ FSETP.NEU.FTZ.AND P0, pt, R8, R6, pt;
/*01e8*/ /*0x00925c4238020000*/ LOP32I.OR R9, R9, 0x800000;
/*01f0*/ /*0x07f1dc042010c000*/ SEL R7, RZ, 0x1, !P0;
/*01f8*/ /*0x24a29c0368000000*/ LOP.AND R10, R10, R9;
/*0200*/ /*0x14a19c0358000000*/ SHR.U32.W R6, R10, R5;
/*0208*/ /*0x24515c0368000000*/ LOP.AND R5, R5, R9;
/*0210*/ /*0x08621c036800c000*/ LOP.AND R8, R6, 0x2;
/*0218*/ /*0x14715c4368000000*/ LOP.OR R5, R7, R5;
/*0220*/ /*0x1021dc034800fffc*/ IADD R7, R2, 0xfff04;
/*0228*/ /*0xfc81dc031a8e0000*/ ISETP.NE.U32.AND P0, pt, R8, RZ, pt;
/*0230*/ /*0x04609c036800c000*/ LOP.AND R2, R6, 0x1;
/*0238*/ /*0xfc51dc031aa00000*/ ISETP.NE.U32.OR P0, pt, R5, RZ, P0;
/*0240*/ /*0x1c915c0358000000*/ SHR.U32.W R5, R9, R7;
/*0248*/ /*0xfc21dc231a800000*/ ISETP.NE.AND P0, pt, R2, RZ, P0;
/*0250*/ /*0x045140034800c000*/ @P0 IADD R5, R5, 0x1;
/*0258*/ /*0x04509c036000c000*/ SHL.W R2, R5, 0x1;
/*0260*/ /*0x14209c0331060000*/ ICMP.EQ.U32 R2, R2, R5, R3;
/*0268*/ /*0x10211c4368000000*/ LOP.OR R4, R2, R4;
/*0270*/ /*0x20001de740000000*/ BRA 0x280;
/*0278*/ /*0x10411c00c8000000*/ MUFU.RCP R4, R4;
/*0280*/ /*0x00001de790000000*/ RET;
The CC 1.1 instruction stream is:
Function : _Z11ktestReciprPfi
/*0000*/ /*0x1000020540004780*/ G2R.U16 R0H, g [0x1].U16;
/*0008*/ /*0xa000000504000780*/ I2I.U32.U16 R1, R0L;
/*0010*/ /*0x60014c0100204780*/ IMAD.U16 R0, g [0x6].U16, R0H, R1;
/*0018*/ /*0xa000000544014780*/ I2F.F32.S32 R1, R0;
/*0020*/ /*0xb000020500004780*/ FADD R1, R1, R1;
/*0028*/ /*0x30020001c4100780*/ SHL R0, R0, 0x2;
/*0030*/ /*0xb000020503f80003*/ FADD32I R1, R1, 0x3f800000;
/*0038*/ /*0x2100e800 */ IADD32 R0, g [0x4], R0;
/*003c*/ /*0x90000204 */ RCP32 R1, R1;
/*0040*/ /*0xd00e0005a0c00781*/ GST.U32 global14 [R0], R1;
Seems weird…