Thank you, Njuffa.
I tried using the __ldg intrinsic in the “global” memory kernel, and indeed it speeds it up a lot on my K20. Here are some results:
Global kernel results without __ldg
(10242 x 5122) 0.0161ms (*16) 0.0530ms (*64) 0.181ms (*128) 0.369ms
Global kernel results with __ldg
(10242 x 5122) 0.0122ms (*16) 0.0192ms (*64) 0.0396ms (*128) 0.0696ms
I must say that I do not understand the mechanism. If I understand your previous posts correctly, the __ldg intrinsic tries to exploit the texture even though the array has not been bound to the texture. It is then not really clear to me why it is much faster than the case in which I explicitly bind the array to the texture. I recall that the results with the array bound to the texture were:
Texture kernel results
(10242 x 5122) 0.0525ms (*16) 0.191ms (*64) 0.683ms (*128) 1.33ms
From the prototype of the __ldg intrinsic, I understand that it does not apply to texture variables, but only to global ones; am I right?
For completeness, here is the result of the cuobjdump command you recommended (regarding the “global” kernel):
Function : _Z40linear_interpolation_kernel_function_GPUPfPKfS1_ii
/*0008*/ /*0x089c000664c03c00*/ MOV R1, c [0x0] [0x44];
/*0010*/ /*0xa01fc00274000000*/ MOV32I R0, 0x140;
/*0018*/ /*0x001c000a7ca00000*/ LDC R2, c [0x0] [R0];
/*0020*/ /*0xa21fc00274000000*/ MOV32I R0, 0x144;
/*0028*/ /*0x001c00427ca00000*/ LDC R16, c [0x0] [R0];
/*0030*/ /*0xa41fc00274000000*/ MOV32I R0, 0x148;
/*0038*/ /*0x001c00027ca00000*/ LDC R0, c [0x0] [R0];
/*0048*/ /*0xa61fc00e74000000*/ MOV32I R3, 0x14c;
/*0050*/ /*0x001c0c0e7ca00000*/ LDC R3, c [0x0] [R3];
/*0058*/ /*0xa81fc01274000000*/ MOV32I R4, 0x150;
/*0060*/ /*0x001c10127ca00000*/ LDC R4, c [0x0] [R4];
/*0068*/ /*0x109c001686400000*/ S2R R5, SR33;
/*0070*/ /*0x029c001ee4c03c00*/ MOV R7, R5;
/*0078*/ /*0x051c001664c03c00*/ MOV R5, c [0x0] [0x28];
/*0088*/ /*0x129c001a86400000*/ S2R R6, SR37;
/*0090*/ /*0x031c001ae4c03c00*/ MOV R6, R6;
/*0098*/ /*0x031c1446d1081c00*/ IMAD R17, R5, R6, R7;
/*00a0*/ /*0x021c441edb181c00*/ ISETP.LT.AND P0, PT, R17, R4, PT;
/*00a8*/ /*0x001e001e84801c07*/ PSETP.AND.AND P0, PT, !P0, PT, PT;
/*00b0*/ /*0x001c3c0285800000*/ NOP;
/*00b8*/ /*0x7c00000014800002*/ SSY 0x5b8;
/*00c8*/ /*0x7000003c12000002*/ @P0 BRA 0x5b0;
/*00d0*/ /*0x001c003c12000000*/ BRA 0xd8;
/*00d8*/ /*0x089c0012e4c03c00*/ MOV R4, R17;
/*00e0*/ /*0x011fc01674000000*/ MOV32I R5, 0x2;
/*00e8*/ /*0x7f9c001ee4c03c00*/ MOV R7, RZ;
/*00f0*/ /*0x029c0016e4c03c00*/ MOV R5, R5;
/*00f8*/ /*0x029c0016e4c03c00*/ MOV R5, R5;
/*0108*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*0110*/ /*0x021c001ae4c03c00*/ MOV R6, R4;
/*0118*/ /*0x029c1812e2002000*/ LOP.XOR R4, R6, R5;
/*0120*/ /*0x039c103edb681c00*/ ISETP.GE.AND P1, PT, R4, R7, PT;
/*0128*/ /*0x029c0012e4c03c00*/ MOV R4, R5;
/*0130*/ /*0x031ce81ae6100000*/ I2I R6, |R6|;
/*0138*/ /*0x029ce816e6100000*/ I2I R5, |R5|;
/*0148*/ /*0x029c281ee5c00800*/ I2F.F32.U32.RP R7, R5;
/*0150*/ /*0x021c1c1e84000000*/ MUFU.RCP R7, R7;
/*0158*/ /*0x039c001ee4c03c00*/ MOV R7, R7;
/*0160*/ /*0xff1c1c1d4007ffff*/ IADD32I R7, R7, 0xffffffe;
/*0168*/ /*0x039c001ee4c03c00*/ MOV R7, R7;
/*0170*/ /*0x039c281ee5808c00*/ F2I.FTZ.U32.F32.TRUNC R7, R7;
/*0178*/ /*0x039c1422e1c00000*/ IMUL.U32.U32 R8, R5, R7;
/*0188*/ /*0x041ce822e6010000*/ I2I R8, -R8;
/*0190*/ /*0x041c1c22e1c00400*/ IMUL.U32.U32.HI R8, R7, R8;
/*0198*/ /*0x041c1c1ee0800000*/ IADD R7, R7, R8;
/*01a0*/ /*0x031c1c1ee1c00400*/ IMUL.U32.U32.HI R7, R7, R6;
/*01a8*/ /*0x009c1c25c0800000*/ IADD R9, R7, 0x1;
/*01b0*/ /*0x039c1422e1c00000*/ IMUL.U32.U32 R8, R5, R7;
/*01b8*/ /*0x041c181ae0880000*/ IADD R6, R6, -R8;
/*01c8*/ /*0x031c141edb301c00*/ ISETP.LE.U32.AND P0, PT, R5, R6, PT;
/*01d0*/ /*0x039c2422e5000000*/ SEL R8, R9, R7, P0;
/*01d8*/ /*0x009c2025c0800000*/ IADD R9, R8, 0x1;
/*01e0*/ /*0x029c181ee0880000*/ IADD R7, R6, -R5;
/*01e8*/ /*0x031c1c1ae5000000*/ SEL R6, R7, R6, P0;
/*01f0*/ /*0x029c181edb601c00*/ ISETP.GE.U32.AND P0, PT, R6, R5, PT;
/*01f8*/ /*0x041c2416e5000000*/ SEL R5, R9, R8, P0;
/*0208*/ /*0x001c3c0285800000*/ NOP;
/*0210*/ /*0x0c00000014800000*/ SSY 0x230;
/*0218*/ /*0x0404003c12000000*/ @P1 BRA 0x228;
/*0220*/ /*0x029ce816e6010000*/ I2I R5, -R5;
/*0228*/ /*0x025ffc1ee2003800*/ LOP.PASS_B.S R7, RZ, ~R4;
/*0230*/ /*0x7f9c001ae4c03c00*/ MOV R6, RZ;
/*0238*/ /*0x031c101edb581c00*/ ISETP.NE.AND P0, PT, R4, R6, PT;
/*0248*/ /*0x001e001e84801c07*/ PSETP.AND.AND P0, PT, !P0, PT, PT;
/*0250*/ /*0x029c1c12e5000000*/ SEL R4, R7, R5, P0;
/*0258*/ /*0x021c0016e4c03c00*/ MOV R5, R4;
/*0260*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*0268*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*0270*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*0278*/ /*0x011c1011c2400000*/ SHL R4, R4, 0x2;
/*0288*/ /*0x021c0002e0800000*/ IADD R0, R0, R4;
/*0290*/ /*0x001c0002e4c03c00*/ MOV R0, R0;
/*0298*/ /*0x001c0000c4000000*/ LD R0, [R0];
/*02a0*/ /*0x001c0002e4c03c00*/ MOV R0, R0;
/*02a8*/ /*0x019c000ee4c03c00*/ MOV R3, R3;
/*02b0*/ /*0x011fc01274000000*/ MOV32I R4, 0x2;
/*02b8*/ /*0x7f9c001ae4c03c00*/ MOV R6, RZ;
/*02c8*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*02d0*/ /*0x021c0012e4c03c00*/ MOV R4, R4;
/*02d8*/ /*0x019c000ee4c03c00*/ MOV R3, R3;
/*02e0*/ /*0x019c0016e4c03c00*/ MOV R5, R3;
/*02e8*/ /*0x021c140ee2002000*/ LOP.XOR R3, R5, R4;
/*02f0*/ /*0x031c0c3edb681c00*/ ISETP.GE.AND P1, PT, R3, R6, PT;
/*02f8*/ /*0x021c000ee4c03c00*/ MOV R3, R4;
/*0308*/ /*0x029ce816e6100000*/ I2I R5, |R5|;
/*0310*/ /*0x021ce812e6100000*/ I2I R4, |R4|;
/*0318*/ /*0x021c281ae5c00800*/ I2F.F32.U32.RP R6, R4;
/*0320*/ /*0x021c181a84000000*/ MUFU.RCP R6, R6;
/*0328*/ /*0x031c001ae4c03c00*/ MOV R6, R6;
/*0330*/ /*0xff1c18194007ffff*/ IADD32I R6, R6, 0xffffffe;
/*0338*/ /*0x031c001ae4c03c00*/ MOV R6, R6;
/*0348*/ /*0x031c281ae5808c00*/ F2I.FTZ.U32.F32.TRUNC R6, R6;
/*0350*/ /*0x031c101ee1c00000*/ IMUL.U32.U32 R7, R4, R6;
/*0358*/ /*0x039ce81ee6010000*/ I2I R7, -R7;
/*0360*/ /*0x039c181ee1c00400*/ IMUL.U32.U32.HI R7, R6, R7;
/*0368*/ /*0x039c181ae0800000*/ IADD R6, R6, R7;
/*0370*/ /*0x029c181ae1c00400*/ IMUL.U32.U32.HI R6, R6, R5;
/*0378*/ /*0x009c1821c0800000*/ IADD R8, R6, 0x1;
/*0388*/ /*0x031c101ee1c00000*/ IMUL.U32.U32 R7, R4, R6;
/*0390*/ /*0x039c1416e0880000*/ IADD R5, R5, -R7;
/*0398*/ /*0x029c101edb301c00*/ ISETP.LE.U32.AND P0, PT, R4, R5, PT;
/*03a0*/ /*0x031c201ee5000000*/ SEL R7, R8, R6, P0;
/*03a8*/ /*0x009c1c21c0800000*/ IADD R8, R7, 0x1;
/*03b0*/ /*0x021c141ae0880000*/ IADD R6, R5, -R4;
/*03b8*/ /*0x029c1816e5000000*/ SEL R5, R6, R5, P0;
/*03c8*/ /*0x021c141edb601c00*/ ISETP.GE.U32.AND P0, PT, R5, R4, PT;
/*03d0*/ /*0x039c2012e5000000*/ SEL R4, R8, R7, P0;
/*03d8*/ /*0x001c3c0285800000*/ NOP;
/*03e0*/ /*0x0c00000014800000*/ SSY 0x400;
/*03e8*/ /*0x0404003c12000000*/ @P1 BRA 0x3f8;
/*03f0*/ /*0x021ce812e6010000*/ I2I R4, -R4;
/*03f8*/ /*0x01dffc1ae2003800*/ LOP.PASS_B.S R6, RZ, ~R3;
/*0408*/ /*0x7f9c0016e4c03c00*/ MOV R5, RZ;
/*0410*/ /*0x029c0c1edb581c00*/ ISETP.NE.AND P0, PT, R3, R5, PT;
/*0418*/ /*0x001e001e84801c07*/ PSETP.AND.AND P0, PT, !P0, PT, PT;
/*0420*/ /*0x021c180ee5000000*/ SEL R3, R6, R4, P0;
/*0428*/ /*0x019c0012e4c03c00*/ MOV R4, R3;
/*0430*/ /*0x019c000ee4c03c00*/ MOV R3, R3;
/*0438*/ /*0x019c000ee4c03c00*/ MOV R3, R3;
/*0448*/ /*0x019c000ee4c03c00*/ MOV R3, R3;
/*0450*/ /*0x019ca80ee5c00000*/ I2F R3, R3;
/*0458*/ /*0x019c004ae2c00000*/ FADD R18, R0, R3;
/*0460*/ /*0x091c0012e4c03c00*/ MOV R4, R18;
/*0468*/ /*0x0000010011000000*/ JCAL 0x0;
/*0470*/ /*0x021c0002e4c03c00*/ MOV R0, R4;
/*0478*/ /*0x001c004ee4c03c00*/ MOV R19, R0;
/*0488*/ /*0x099c2c12e5400000*/ F2F.F64.F32 R4, R19;
/*0490*/ /*0x021c3802e5400000*/ F2F.F32.F64 R0, R4;
/*0498*/ /*0x001c0012e4c03c00*/ MOV R4, R0;
/*04a0*/ /*0x0000010011000000*/ JCAL 0x0;
/*04a8*/ /*0x021c0002e4c03c00*/ MOV R0, R4;
/*04b0*/ /*0x001c0052e4c03c00*/ MOV R20, R0;
/*04b8*/ /*0x099c484ae2c10000*/ FADD R18, R18, -R19;
/*04c8*/ /*0x009c4401c2000000*/ LOP.AND R0, R17, 0x1;
/*04d0*/ /*0x011c5001a1080000*/ IMAD R0, R20, 0x2, R0;
/*04d8*/ /*0x011c0001c2400000*/ SHL R0, R0, 0x2;
/*04e0*/ /*0x001c4002e0800000*/ IADD R0, R16, R0;
/*04e8*/ /*0x001c0012e4c03c00*/ MOV R4, R0;
/*04f0*/ /*0x0000010011000000*/ JCAL 0x0;
/*04f8*/ /*0x021c0002e4c03c00*/ MOV R0, R4;
/*0508*/ /*0x001c004ee4c03c00*/ MOV R19, R0;
/*0510*/ /*0x009c4401c2000000*/ LOP.AND R0, R17, 0x1;
/*0518*/ /*0x011c5001a1080000*/ IMAD R0, R20, 0x2, R0;
/*0520*/ /*0x011c0001c2400000*/ SHL R0, R0, 0x2;
/*0528*/ /*0x041c0001c0800000*/ IADD R0, R0, 0x8;
/*0530*/ /*0x001c4002e0800000*/ IADD R0, R16, R0;
/*0538*/ /*0x001c0012e4c03c00*/ MOV R4, R0;
/*0548*/ /*0x0000010011000000*/ JCAL 0x0;
/*0550*/ /*0x021c0002e4c03c00*/ MOV R0, R4;
/*0558*/ /*0x001c0002e4c03c00*/ MOV R0, R0;
/*0560*/ /*0x001c4802e3400000*/ FMUL R0, R18, R0;
/*0568*/ /*0x099c280ee5410000*/ F2F R3, -R19;
/*0570*/ /*0x091c0c0ee3400000*/ FMUL R3, R3, R18;
/*0578*/ /*0x099c0c0ee2c00000*/ FADD R3, R3, R19;
/*0588*/ /*0x019c000ee2c00000*/ FADD R3, R0, R3;
/*0590*/ /*0x011c4401c2400000*/ SHL R0, R17, 0x2;
/*0598*/ /*0x001c0802e0800000*/ IADD R0, R2, R0;
/*05a0*/ /*0x001c0002e4c03c00*/ MOV R0, R0;
/*05a8*/ /*0x001c000ce4000000*/ ST [R0], R3;
/*05b0*/ /*0x005c3c0285800000*/ NOP.S;
/*05b8*/ /*0x141c003c12000000*/ BRA 0x5e8;
/*05c8*/ /*0x7f9c03fee4c03c00*/ MOV RZ, RZ;
/*05d0*/ /*0x001c003c18000000*/ EXIT;
/*05d8*/ /*0x7f9c03fee4c03c00*/ MOV RZ, RZ;
/*05e0*/ /*0x001c003c18000000*/ EXIT;
/*05e8*/ /*0x7f9c03fee4c03c00*/ MOV RZ, RZ;
/*05f0*/ /*0x001c003c18000000*/ EXIT;
/*05f8*/ /*0x7f9c03fee4c03c00*/ MOV RZ, RZ;
/*0608*/ /*0x001c003c18000000*/ EXIT;
/*0610*/ /*0x001c3c0285800000*/ NOP;
/*0618*/ /*0x001c3c0285800000*/ NOP;
/*0620*/ /*0x001c3c0285800000*/ NOP;
/*0628*/ /*0x001c3c0285800000*/ NOP;
/*0630*/ /*0x001c3c0285800000*/ NOP;
/*0638*/ /*0x001c3c0285800000*/ NOP;
/*0640*/ /*0xfc1c003c12007fff*/ BRA 0x640;
/*0648*/ /*0x001c3c0285800000*/ NOP;
/*0650*/ /*0x001c3c0285800000*/ NOP;
/*0658*/ /*0x001c3c0285800000*/ NOP;
/*0660*/ /*0x001c3c0285800000*/ NOP;
/*0668*/ /*0x001c3c0285800000*/ NOP;
/*0670*/ /*0x001c3c0285800000*/ NOP;
/*0678*/ /*0x001c3c0285800000*/ NOP;
I will also take some time to read papers on using double-double precision on the GPU.
Again, thanks a lot.