lmem -- heeeelp :)

DarkAr · October 10, 2008, 1:35pm

this kernel:

// Number of triangles to process. Stored in constant memory; the kernel below
// uses it as the upper bound on valid global thread indices.
__constant__ unsigned int gNumberOfTriangles;

// Gathers float4 pairs from Source into Target, one pair per thread.
//
// Thread x reads its source pair index from Keys[x].y, loads the two
// consecutive float4 values Source[2*Key] and Source[2*Key + 1] (named mins and
// maxs here), overwrites their .w components with the raw bits of Key, and
// writes them to Target[2*x] and Target[2*x + 1].
//
// Launch layout: 1D grid of 1D blocks; any thread whose global index is
// >= gNumberOfTriangles exits without touching memory.
__global__ void KDKernelMINMAXCopy(float4 *Target, float4 *Source, uint2 *Keys)
{
	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

	// Threads past the last triangle have nothing to copy.
	if (x >= gNumberOfTriangles)
		return;

	// Index of the source pair for this thread (.y component of its key).
	unsigned int Key = Keys[x].y;

	float4 mins = Source[(Key<<1)+0];
	float4 maxs = Source[(Key<<1)+1];

	// Store the key's bit pattern in .w: a reinterpretation, not a numeric
	// int-to-float conversion.
	mins.w = __int_as_float(Key);
	maxs.w = __int_as_float(Key);

	Target[(x<<1)+0] = mins;
	Target[(x<<1)+1] = maxs;

	return;
}

Compilation commandline:

"$(CUDA_BIN_PATH)\nvcc.exe" -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -arch sm_13 -c -keep -Xptxas=-v -DUSE_GT200 -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /Ox,/Ob2,/Oi,/Ot,/EHsc,/MT,/GS-,/GR,/W3,/nologo,/Wp64,/Zi -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -I"$(DXSDK_DIR)\Include" -Xcudafe --diag_suppress=unsigned_compare_with_negative -o x64$(ConfigurationName)\KDKernelConstruct.obj ..\KDTree\KDKernelConstruct.cu

gives this:

Used 11 registers, 16+0 bytes lmem, 40+32 bytes smem, 40 bytes cmem[0]

and I can't find any way to rearrange it so that it doesn't use this fucking ‘local’ memory.

Since this is a kernel that basically just copies data from one place to another,

lmem traffic in the middle is the last thing I want.

Does anyone have any idea how to rearrange it to force the compiler not to use this lmem?

As shown in the PTX asm below, lmem is used to save 4 registers,

even though there is no need to save any registers, since 11+4 is < 16 !!!

.entry __globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2
{
	.reg .u16 %rh<4>;
	.reg .u32 %r<14>;
	.reg .u64 %rd<13>;
	.reg .f32 %f<9>;
	.reg .pred %p<3>;
	.param .u64 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Target;
	.param .u64 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Source;
	.param .u64 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Keys;
	// 16-byte local-memory buffer: this is the "16+0 bytes lmem" ptxas reports.
	.local .align 16 .b8 __cuda___cuda_maxs064[16];
	.loc	15	237	0
$LBB1___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2:
	// %r3 = x = ctaid.x * ntid.x + tid.x
	mov.u16 	%rh1, %ctaid.x;
	mov.u16 	%rh2, %ntid.x;
	mul.wide.u16 	%r1, %rh1, %rh2;
	cvt.u32.u16 	%r2, %tid.x;
	add.u32 	%r3, %r2, %r1;
	// Continue only if gNumberOfTriangles > x; otherwise jump to exit.
	ld.const.u32 	%r4, [gNumberOfTriangles];	// id:68 gNumberOfTriangles+0x0
	setp.gt.u32 	%p1, %r4, %r3;
	@%p1 bra 	$Lt_0_4;
	bra.uni 	$LBB4___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2;
$Lt_0_4:
	.loc	15	244	0
	// %r6 = Key: 32-bit load at byte offset x*8 + 4 from Keys (the .y of Keys[x]).
	mul.lo.u32 	%r5, %r3, 8;
	cvt.u64.u32 	%rd1, %r5;
	ld.param.u64 	%rd2, [__cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Keys];	// id:69 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Keys+0x0
	add.u64 	%rd3, %rd1, %rd2;
	ld.global.u32 	%r6, [%rd3+4];	// id:70
	.loc	15	246	0
	// %rd6 = Source + Key*32; load mins (the fourth component is discarded).
	mul.lo.u32 	%r7, %r6, 32;
	cvt.u64.u32 	%rd4, %r7;
	ld.param.u64 	%rd5, [__cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Source];	// id:71 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Source+0x0
	add.u64 	%rd6, %rd4, %rd5;
	ld.global.v4.f32 	{%f1,%f2,%f3,_}, [%rd6+0];
	.loc	15	247	0
	// Load maxs from the next 16 bytes as four u32 values.
	add.u64 	%rd7, %rd6, 16;
	mov.u64 	%rd8, __cuda___cuda_maxs064;
	ld.global.v4.u32 	{%r8,%r9,%r10,%r11}, [%rd7+0];
	// >>> highlighted by the poster: maxs is written out to local memory
	st.local.u32 	[%rd8+0], %r8;	// id:89 __cuda___cuda_maxs064+0x0
	st.local.u32 	[%rd8+4], %r9;	// id:89 __cuda___cuda_maxs064+0x0
	st.local.u32 	[%rd8+8], %r10;	// id:89 __cuda___cuda_maxs064+0x0
	st.local.u32 	[%rd8+12], %r11;	// id:89 __cuda___cuda_maxs064+0x0
	// <<< end of highlighted stores
	.loc	15	251	0
	// %rd11 = Target + x*32; %f4 = bits of Key moved into an f32 register.
	mul.lo.u32 	%r12, %r3, 32;
	cvt.u64.u32 	%rd9, %r12;
	ld.param.u64 	%rd10, [__cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Target];	// id:76 __cudaparm___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2_Target+0x0
	add.u64 	%rd11, %rd9, %rd10;
	mov.b32 	%f4, %r6;
	st.global.v4.f32 	[%rd11+0], {%f1,%f2,%f3,%f4};
	.loc	15	252	0
	// >>> highlighted by the poster: maxs.x/.y/.z are read back from local memory
	ld.local.f32 	%f5, [__cuda___cuda_maxs064+0];	// id:82 __cuda___cuda_maxs064+0x0
	ld.local.f32 	%f6, [__cuda___cuda_maxs064+4];	// id:84 __cuda___cuda_maxs064+0x4
	ld.local.f32 	%f7, [__cuda___cuda_maxs064+8];	// id:86 __cuda___cuda_maxs064+0x8
	// <<< end of highlighted loads
	st.global.v4.f32 	[%rd11+16], {%f5,%f6,%f7,%f4};
$LBB4___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2:
	.loc	15	254	0
	exit;
$LDWend___globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2:
} // __globfunc__Z18KDKernelMINMAXCopyP6float4S0_P5uint2

alex_dubinsky · October 10, 2008, 4:40pm

-maxrregcount=16

Reimar · October 10, 2008, 4:41pm

What kind of rearranging did you try? Since the compiler does it right for the mins, did you try e.g.

// Suggested replacement for the load/store part of the kernel body; it relies
// on the kernel's existing Key, x, Source and Target.
// Reinterpret Key's bits as a float once and reuse it for both stores.
float Keyf = __int_as_float(Key);

// Advance both pointers to this thread's pair (two float4 per index).
Source += Key << 1;

Target += x << 1;

// First element of the pair: copy x/y/z and put the key bits in w.
float4 vals  = *Source;

*Target = (float4){vals.x, vals.y, vals.z, Keyf};

// Step to the second element of the pair and repeat, reusing vals.
Source++; Target++;

vals  = *Source;

*Target = (float4){vals.x, vals.y, vals.z, Keyf};

There is also some make_float4 or such macro if you do not like the C99 syntax there.

Just keep in mind that compilers still are really stupid and will miss any optimization opportunity you give them a chance to miss :P

DarkAr · October 13, 2008, 7:20am

Limiting registers with -maxrregcount is not an option, because I have 4 kernels in one *.cu file and the other 3 use 32 registers.

I really, really don't want a separate *.cu file for each kernel :)

DarkAr · October 13, 2008, 7:23am

What kind of rearranging did you try? Since the compiler does it right for the mins, did you try e.g.
float Keyf = __int_as_float(Key);

Source += Key << 1;

Target += x << 1;

float4 vals = *Source;

*Target = (float4){vals.x, vals.y, vals.z, Keyf};

Source++; Target++;

vals = *Source;

*Target = (float4){vals.x, vals.y, vals.z, Keyf};
There is also some make_float4 or such macro if you do not like the C99 syntax there.

Just keep in mind that compilers still are really stupid and will miss any optimization opportunity you give them a chance to miss :P

[snapback]450296[/snapback]

But what about coalesced reads/writes? First you read one element, then write it, then read another, then write it.

Reimar · October 13, 2008, 8:36am

You misunderstood coalescing: it is about the memory accesses done in parallel by different threads, not about the serial memory accesses of a single thread (which means there is no coalescing with your current code anyway).

It might make a difference if the compiler merged the two reads, but:

1. I do not think it would do that.
2. Since you already read a float4, it cannot merge them anyway: the largest read instruction is a 128-bit read (the same applies to writes).

alex_dubinsky · October 14, 2008, 3:48am

It’s “not an option” because it disagrees with your aesthetics?

DarkAr · October 14, 2008, 7:13am

because it’s a pain in the ass,

for every kernel i need separate *.cu file to be able to define maxregs

there SHOULD be some #pragma in CUDA 2.1 to do this per kernel, not per file.

alex_dubinsky · October 14, 2008, 5:18pm

I completely agree about the #pragma.

But separate files aren’t so bad. You don’t need the Driver API or anything. In each .cu file you have your kernel and a C++ wrapper that calls it, and you let it compile into a .o. Then in the other files you just call the wrapper (i.e., you don’t use the <<<>>> syntax). Everything links together like in ordinary C++.

alex_dubinsky · October 14, 2008, 5:21pm

P.S. I doubt you’ll lose much performance if your 16-reg kernel is told to use 32 regs: a) it will probably still use 16 regs, and b) lower occupancy probably won’t hurt performance, especially on a GTX 260.

Topic		Replies	Views
coalescing struct loading problem CUDA Programming and Performance	21	12712	March 5, 2010
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3024	November 13, 2017
Local memory performance Using more than 4kb kills it.. why? CUDA Programming and Performance	24	5076	September 6, 2008
Help a newbie with CUDA development! CUDA Programming and Performance	16	14552	May 1, 2008
Please help with __shared__ memory different usage than in samples CUDA Programming and Performance	30	3310	January 10, 2010
Can you GUESS this without experimenting? Latencies CUDA Programming and Performance	13	9347	January 7, 2008
Optimization suggestions for reading from main memory to registers and share memory CUDA Programming and Performance	10	169	May 22, 2024
How would you do this? CUDA Programming and Performance	12	4466	August 5, 2008
Transfer-Bound Application Looking for ideas to speed it up CUDA Programming and Performance	36	29323	April 23, 2010
Slow local memory, feigned constant memory. coalesced? global? CUDA Programming and Performance	29	7255	January 25, 2010

lmem -- heeeelp :)

Related topics