I created a simper example to see the difference without a complex ray intersection function. I also set the grid size at compile time.
- It does look exactly like this.
I am using CUDA 5.5 with compiler flags:
NVCXXFLAGS= -Xcompiler `echo "$(CXXFLAGS)" | tr ' ' ','` --ptxas-options=-v -lineinfo -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_35,code=compute_35
Here is a simple functor to set all elements of an array to a value:
struct IntMemsetFunctor
{
const int value;
IntMemsetFunctor(const int v)
: value(v)
{}
__device__ inline tuple<int> operator()(int dummy){
return tuple<int>(value);
}
The tuple<> is used so that the generic map parallel primitive being implemented here can take a variable number of inputs. The collect method just takes the array index of the input and places the result in the output.
Some sass:
myKernel
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R4, R0, c[0x0][0x8], R2; /* 0x2004400020011ca3 */
/*0020*/ MOV R0, c[0x0][0x14]; /* 0x2800400050001de4 */
/*0028*/ ISETP.GE.AND P0, PT, R4, c[0x0][0x20], PT; /* 0x1b0e40008041dc23 */
/*0030*/ IMUL R0, R0, c[0x0][0x8]; /* 0x5000400020001ca3 */
/*0038*/ @P0 EXIT ; /* 0x80000000000001e7 */
/*0040*/ I2I.S32.S32 R5, |c[0x0] [0x68]|; /* 0x1c004001a1215ec4 */
/*0048*/ I2I.S32.S32 R7, |c[0x0] [0x6c]|; /* 0x1c004001b121dec4 */
/*0050*/ LOP.PASS_B R9, RZ, ~c[0x0][0x68]; /* 0x68004001a3f25dc3 */
/*0058*/ I2F.F32.U32.RP R2, R5; /* 0x1804000015209c04 */
/*0060*/ I2F.F32.U32.RP R3, R7; /* 0x180400001d20dc04 */
/*0068*/ LOP.PASS_B R11, RZ, ~c[0x0][0x6c]; /* 0x68004001b3f2ddc3 */
/*0070*/ MUFU.RCP R2, R2; /* 0xc800000010209c00 */
/*0078*/ MOV R13, c[0x0][0x74]; /* 0x28004001d0035de4 */
/*0080*/ MUFU.RCP R3, R3; /* 0xc80000001030dc00 */
/*0088*/ IADD32I R2, R2, 0xffffffe; /* 0x083ffffff8209c02 */
/*0090*/ MOV32I R10, 0x4; /* 0x1800000010029de2 */
/*0098*/ IADD32I R3, R3, 0xffffffe; /* 0x083ffffff830dc02 */
/*00a0*/ F2I.FTZ.U32.F32.TRUNC R2, R2; /* 0x1486000009209c04 */
/*00a8*/ MOV R12, c[0x0][0x78]; /* 0x28004001e0031de4 */
/*00b0*/ F2I.FTZ.U32.F32.TRUNC R3, R3; /* 0x148600000d20dc04 */
/*00b8*/ IMUL.U32.U32 R6, R5, R2; /* 0x5000000008519c03 */
/*00c0*/ IMUL.U32.U32 R8, R7, R3; /* 0x500000000c721c03 */
/*00c8*/ I2I.S32.S32 R6, -R6; /* 0x1c00000019219f84 */
/*00d0*/ I2I.S32.S32 R8, -R8; /* 0x1c00000021221f84 */
/*00d8*/ IMAD.U32.U32.HI R6, R2, R6, R2; /* 0x2004000018219c43 */
/*00e0*/ IMAD.U32.U32.HI R8, R3, R8, R3; /* 0x2006000020321c43 */
/*00e8*/ I2I.S32.S32 R3, |R4|; /* 0x1c0000001120dec4 */
/*00f0*/ LOP.XOR R14, R4, c[0x0][0x68]; /* 0x68004001a0439c83 */
/*00f8*/ IADD R4, R4, R0; /* 0x4800000000411c03 */
/*0100*/ IMUL.U32.U32.HI R2, R6, R3; /* 0x500000000c609c43 */
/*0108*/ ISETP.GE.AND P1, PT, R14, RZ, PT; /* 0x1b0e0000fce3dc23 */
/*0110*/ IMAD.U32.U32 R3, -R5, R2, R3; /* 0x200600000850de03 */
/*0118*/ ISETP.LE.U32.AND P0, PT, R5, R3, PT; /* 0x198e00000c51dc03 */
/*0120*/ @P0 ISUB R3, R3, R5; /* 0x480000001430c103 */
/*0128*/ @P0 IADD R2, R2, 0x1; /* 0x4800c00004208003 */
/*0130*/ ISETP.GE.U32.AND P0, PT, R3, R5, PT; /* 0x1b0e00001431dc03 */
/*0138*/ @P0 IADD R2, R2, 0x1; /* 0x4800c00004208003 */
/*0140*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x68], PT; /* 0x1a8e4001a3f1dc23 */
/*0148*/ @!P1 I2I.S32.S32 R2, -R2; /* 0x1c0000000920a784 */
/*0150*/ SEL R14, R9, R2, !P0; /* 0x2010000008939c04 */
/*0158*/ I2I.S32.S32 R2, |R14|; /* 0x1c00000039209ec4 */
/*0160*/ ISETP.GE.AND P1, PT, R14, RZ, PT; /* 0x1b0e0000fce3dc23 */
/*0168*/ IMUL.U32.U32.HI R3, R8, R2; /* 0x500000000880dc43 */
/*0170*/ IMAD.U32.U32 R2, -R7, R3, R2; /* 0x200400000c709e03 */
/*0178*/ ISETP.LE.U32.AND P0, PT, R7, R2, PT; /* 0x198e00000871dc03 */
/*0180*/ @P0 ISUB R2, R2, R7; /* 0x480000001c208103 */
/*0188*/ ISETP.LE.U32.AND P0, PT, R7, R2, PT; /* 0x198e00000871dc03 */
/*0190*/ @P0 ISUB R2, R2, R7; /* 0x480000001c208103 */
/*0198*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x6c], PT; /* 0x1a8e4001b3f1dc23 */
/*01a0*/ @!P1 I2I.S32.S32 R2, -R2; /* 0x1c0000000920a784 */
/*01a8*/ SEL R2, R2, R11, P0; /* 0x200000002c209c04 */
/*01b0*/ ISETP.LT.AND P0, PT, R4, c[0x0][0x20], PT; /* 0x188e40008041dc23 */
/*01b8*/ IMAD R3, R2, c[0x0][0x70], R13; /* 0x201a4001c020dca3 */
/*01c0*/ IMAD R2.CC, R3, R10, c[0x0][0x58]; /* 0x2015800160309ca3 */
/*01c8*/ IMAD.HI.X R3, R3, R10, c[0x0][0x5c]; /* 0x209480017030dce3 */
/*01d0*/ ST.E [R2], R12; /* 0x9400000000231c85 */
/*01d8*/ @P0 BRA 0xe8; /* 0x4003fffc200001e7 */
/*01e0*/ EXIT ; /* 0x8000000000001de7 */
and myKernel2:
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R2, R0, c[0x0][0x8], R2; /* 0x2004400020009ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R2, c[0x0][0x20], PT; /* 0x1b0e40008021dc23 */
/*0028*/ @P0 EXIT ; /* 0x80000000000001e7 */
/*0030*/ I2I.S32.S32 R3, |c[0x0] [0x68]|; /* 0x1c004001a120dec4 */
/*0038*/ ISETP.NE.AND P1, PT, RZ, c[0x0][0x68], PT; /* 0x1a8e4001a3f3dc23 */
/*0040*/ I2F.F32.U32.RP R0, R3; /* 0x180400000d201c04 */
/*0048*/ MUFU.RCP R0, R0; /* 0xc800000010001c00 */
/*0050*/ IADD32I R0, R0, 0xffffffe; /* 0x083ffffff8001c02 */
/*0058*/ F2I.FTZ.U32.F32.TRUNC R0, R0; /* 0x1486000001201c04 */
/*0060*/ IMUL.U32.U32 R4, R3, R0; /* 0x5000000000311c03 */
/*0068*/ I2I.S32.S32 R5, -R4; /* 0x1c00000011215f84 */
/*0070*/ I2I.S32.S32 R4, |R2|; /* 0x1c00000009211ec4 */
/*0078*/ LOP.XOR R2, R2, c[0x0][0x68]; /* 0x68004001a0209c83 */
/*0080*/ IMAD.U32.U32.HI R0, R0, R5, R0; /* 0x2000000014001c43 */
/*0088*/ I2I.S32.S32 R5, |c[0x0] [0x6c]|; /* 0x1c004001b1215ec4 */
/*0090*/ ISETP.GE.AND P2, PT, R2, RZ, PT; /* 0x1b0e0000fc25dc23 */
/*0098*/ IMUL.U32.U32.HI R0, R0, R4; /* 0x5000000010001c43 */
/*00a0*/ I2F.F32.U32.RP R6, R5; /* 0x1804000015219c04 */
/*00a8*/ IMAD.U32.U32 R4, -R3, R0, R4; /* 0x2008000000311e03 */
/*00b0*/ MUFU.RCP R6, R6; /* 0xc800000010619c00 */
/*00b8*/ ISETP.LE.U32.AND P0, PT, R3, R4, PT; /* 0x198e00001031dc03 */
/*00c0*/ IADD32I R6, R6, 0xffffffe; /* 0x083ffffff8619c02 */
/*00c8*/ @P0 ISUB R4, R4, R3; /* 0x480000000c410103 */
/*00d0*/ @P0 IADD R0, R0, 0x1; /* 0x4800c00004000003 */
/*00d8*/ F2I.FTZ.U32.F32.TRUNC R6, R6; /* 0x1486000019219c04 */
/*00e0*/ ISETP.GE.U32.AND P0, PT, R4, R3, PT; /* 0x1b0e00000c41dc03 */
/*00e8*/ LOP.PASS_B R3, RZ, ~c[0x0][0x68]; /* 0x68004001a3f0ddc3 */
/*00f0*/ IMUL.U32.U32 R2, R5, R6; /* 0x5000000018509c03 */
/*00f8*/ @P0 IADD R0, R0, 0x1; /* 0x4800c00004000003 */
/*0100*/ MOV R4, c[0x0][0x74]; /* 0x28004001d0011de4 */
/*0108*/ I2I.S32.S32 R2, -R2; /* 0x1c00000009209f84 */
/*0110*/ @!P2 I2I.S32.S32 R0, -R0; /* 0x1c00000001202b84 */
/*0118*/ SEL R0, R3, R0, !P1; /* 0x2012000000301c04 */
/*0120*/ IMAD.U32.U32.HI R3, R6, R2, R6; /* 0x200c00000860dc43 */
/*0128*/ I2I.S32.S32 R2, |R0|; /* 0x1c00000001209ec4 */
/*0130*/ ISETP.GE.AND P1, PT, R0, RZ, PT; /* 0x1b0e0000fc03dc23 */
/*0138*/ LOP.PASS_B R0, RZ, ~c[0x0][0x6c]; /* 0x68004001b3f01dc3 */
/*0140*/ IMUL.U32.U32.HI R3, R3, R2; /* 0x500000000830dc43 */
/*0148*/ IMAD.U32.U32 R2, -R5, R3, R2; /* 0x200400000c509e03 */
/*0150*/ MOV32I R3, 0x4; /* 0x180000001000dde2 */
/*0158*/ ISETP.LE.U32.AND P0, PT, R5, R2, PT; /* 0x198e00000851dc03 */
/*0160*/ @P0 ISUB R2, R2, R5; /* 0x4800000014208103 */
/*0168*/ ISETP.LE.U32.AND P0, PT, R5, R2, PT; /* 0x198e00000851dc03 */
/*0170*/ @P0 ISUB R2, R2, R5; /* 0x4800000014208103 */
/*0178*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x6c], PT; /* 0x1a8e4001b3f1dc23 */
/*0180*/ @!P1 I2I.S32.S32 R2, -R2; /* 0x1c0000000920a784 */
/*0188*/ SEL R0, R2, R0, P0; /* 0x2000000000201c04 */
/*0190*/ IMAD R0, R0, c[0x0][0x70], R4; /* 0x20084001c0001ca3 */
/*0198*/ MOV R4, c[0x0][0x78]; /* 0x28004001e0011de4 */
/*01a0*/ IMAD R2.CC, R0, R3, c[0x0][0x58]; /* 0x2007800160009ca3 */
/*01a8*/ IMAD.HI.X R3, R0, R3, c[0x0][0x5c]; /* 0x208680017000dce3 */
/*01b0*/ ST.E [R2], R4; /* 0x9400000000211c85 */
/*01b8*/ EXIT ;
So in the case of the simple kernel, myKernel2 uses 6 registers to the 14 from the first. That lead me to think that there is some more overhead associated with myKernel. When I do the same test with the ray intersect functor, myKernel uses one less than the 49 registers of myKernel2. Maybe the longer the function is, the more likely that the early registers values get sent to local memory.
Also, I may just be looking in the wrong place for reducing the number of registers in my intersect function.