The basic problem is that the compiler generates insane ISA for bitfields, while using bit-field extract manually results in MUCH more compact (and faster) code.
Take the following two simple kernels, one that uses bitfields and the other that uses bit-field extract:
struct bitField {
int a : 3;
int b : 7;
int c : 22;
};
__global__
void separateBitFields(bitField *in, int *out1, int *out2, int *out3)
{
const int tid = threadIdx.x;
bitField foo = in[tid];
out1[tid] = foo.a;
out2[tid] = foo.b;
out3[tid] = foo.c;
}
__device__ __forceinline__
unsigned int bfe(unsigned int x, unsigned int bit, unsigned int numBits) {
unsigned int ret;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(ret) : "r"(x), "r"(bit), "r"(numBits));
return ret;
}
__global__
void separateBitFields(unsigned int *in, unsigned int *out1, unsigned int *out2, unsigned int *out3)
{
const int tid = threadIdx.x;
unsigned int foo = in[tid];
out1[tid] = bfe(foo, 0, 3);
out2[tid] = bfe(foo, 3, 7);
out3[tid] = bfe(foo, 10, 22);
}
And then look at the output of the generated ISA for the two:
code for sm_20
Function : _Z17separateBitFieldsPjS_S_S_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x84019c042c000000*/ S2R R6, SR_Tid_X;
/*0010*/ /*0x10029de218000000*/ MOV32I R10, 0x4;
/*0018*/ /*0x1061dce35000c000*/ IMUL.HI R7, R6, 0x4;
/*0020*/ /*0x80621ca320158000*/ IMAD R8.CC, R6, R10, c [0x0] [0x20];
/*0028*/ /*0x90725c4348004000*/ IADD.X R9, R7, c [0x0] [0x24];
/*0030*/ /*0xa0611ca320158000*/ IMAD R4.CC, R6, R10, c [0x0] [0x28];
/*0038*/ /*0x00801c8584000000*/ LD.E R0, [R8];
/*0040*/ /*0xb0715c4348004000*/ IADD.X R5, R7, c [0x0] [0x2c];
/*0048*/ /*0xc0609ca320158000*/ IMAD R2.CC, R6, R10, c [0x0] [0x30];
/*0050*/ /*0xd070dc4348004000*/ IADD.X R3, R7, c [0x0] [0x34];
/*0058*/ /*0xe0621ca320158000*/ IMAD R8.CC, R6, R10, c [0x0] [0x38];
/*0060*/ /*0xf0725c4348004000*/ IADD.X R9, R7, c [0x0] [0x3c];
/*0068*/ /*0x00019c037000c00c*/ BFE.U32 R6, R0, 0x300;
/*0070*/ /*0x0c01dc037000c01c*/ BFE.U32 R7, R0, 0x703;
/*0078*/ /*0x28001c037000c058*/ BFE.U32 R0, R0, 0x160a;
/*0080*/ /*0x00419c8594000000*/ ST.E [R4], R6;
/*0088*/ /*0x0021dc8594000000*/ ST.E [R2], R7;
/*0090*/ /*0x00801c8594000000*/ ST.E [R8], R0;
/*0098*/ /*0x00001de780000000*/ EXIT;
..............................................
Function : _Z17separateBitFieldsP8bitFieldPiS1_S1_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x84021c042c000000*/ S2R R8, SR_Tid_X;
/*0010*/ /*0x10031de218000000*/ MOV32I R12, 0x4;
/*0018*/ /*0x1081dce35000c000*/ IMUL.HI R7, R8, 0x4;
/*0020*/ /*0x80829ca320198000*/ IMAD R10.CC, R8, R12, c [0x0] [0x20];
/*0028*/ /*0x9072dc4348004000*/ IADD.X R11, R7, c [0x0] [0x24];
/*0030*/ /*0xa0809ca320198000*/ IMAD R2.CC, R8, R12, c [0x0] [0x28];
/*0038*/ /*0x00a19c8584000000*/ LD.E R6, [R10];
/*0040*/ /*0xb070dc4348004000*/ IADD.X R3, R7, c [0x0] [0x2c];
/*0048*/ /*0xc0811ca320198000*/ IMAD R4.CC, R8, R12, c [0x0] [0x30];
/*0050*/ /*0xd0715c4348004000*/ IADD.X R5, R7, c [0x0] [0x34];
/*0058*/ /*0x18229c041d800000*/ I2I.U32.U8 R10, R6.B3;
/*0060*/ /*0x18225c041d000000*/ I2I.U32.U8 R9, R6.B2;
/*0068*/ /*0x18201c041c000000*/ I2I.U32.U8 R0, R6;
/*0070*/ /*0x18219c041c800000*/ I2I.U32.U8 R6, R6.B1;
/*0078*/ /*0x2822dc041c000000*/ I2I.U32.U8 R11, R10;
/*0080*/ /*0x24229c041c000000*/ I2I.U32.U8 R10, R9;
/*0088*/ /*0x18225c041c000000*/ I2I.U32.U8 R9, R6;
/*0090*/ /*0xe0819ca320198000*/ IMAD R6.CC, R8, R12, c [0x0] [0x38];
/*0098*/ /*0x60b2dc036000c000*/ SHL R11, R11, 0x18;
/*00a0*/ /*0x20921c036000c000*/ SHL R8, R9, 0x8;
/*00a8*/ /*0x40a29c036000c000*/ SHL R10, R10, 0x10;
/*00b0*/ /*0xf071dc4348004000*/ IADD.X R7, R7, c [0x0] [0x3c];
/*00b8*/ /*0x00825c4368000000*/ LOP.OR R9, R8, R0;
/*00c0*/ /*0x2ca29c4368000000*/ LOP.OR R10, R10, R11;
/*00c8*/ /*0x74001c036000c000*/ SHL R0, R0, 0x1d;
/*00d0*/ /*0x58925c036000c000*/ SHL R9, R9, 0x16;
/*00d8*/ /*0x20a29c4368000000*/ LOP.OR R10, R10, R8;
/*00e0*/ /*0x74001c235800c000*/ SHR R0, R0, 0x1d;
/*00e8*/ /*0x64921c235800c000*/ SHR R8, R9, 0x19;
/*00f0*/ /*0x28a25c235800c000*/ SHR R9, R10, 0xa;
/*00f8*/ /*0x00201c8594000000*/ ST.E [R2], R0;
/*0100*/ /*0x00421c8594000000*/ ST.E [R4], R8;
/*0108*/ /*0x00625c8594000000*/ ST.E [R6], R9;
/*0110*/ /*0x00001de780000000*/ EXIT;
........................................................
Is there any reason the compiler doesn’t just use bfe automatically when using bit-fields?