coalescing struct loading problem

Hi all,

I wrote a very small kernel:

struct  __align__(16) DATA 

{

	float u,v;

	int val;

	float f;

};

__global__ void

testKernel( DATA* g_idata, DATA* g_odata) 

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

g_odata[tid] = g_idata[tid];

}

When inspecting the PTX code, I noticed that the struct was copied as:

ld.global.v4.f32 	{%f1,%f2,_,%f3}, [%r6+0];

	st.global.v2.f32 	[%r8+0], {%f1,%f2};

	ld.global.s32 	%r9, [%r6+8];

	st.global.s32 	[%r8+8], %r9;

	st.global.f32 	[%r8+12], %f3;

This prevents coalesced 128-bit reads/writes. Is there a way around it without having to use __float_as_int?


I don’t profess to be a PTX expert, but the load looks coalesced to me. The odd part is this:

ld.global.s32	 %r9, [%r6+8];

st.global.s32	 [%r8+8], %r9;

which seems to be separately loading and storing the unused alignment word for no apparent reason. That looks more like a compiler bug than anything that should be worked around in C code.

Thanks for your reply.

I agree that this seems like a compiler bug.

As far as I can see though, only the first load instruction is coalesced; all the other load and store instructions are not (on the 1.1 architecture, at least).

I also did another experiment:

__global__ void

testKernel( float4* g_idata, float4* g_odata) 

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

  float4 c = g_idata[tid];

  c.y = (float)__int_as_float( __float_as_int(c.y) + 1);

  g_odata[tid] = c;

}

This generated in PTX

ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r6+0];

	st.global.f32 	[%r8+0], %f1;

	mov.b32 	%r9, %f2;

	add.s32 	%r10, %r9, 1;

	st.global.s32 	[%r8+4], %r10;

	st.global.v2.f32 	[%r8+8], {%f3,%f4};

This seems very inefficient to me; I would expect something like:

ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r6+0];

	mov.b32 	%r9, %f2;

	add.s32 	%r10, %r9, 1;

	mov.b32 	%f2, %r10;

	st.global.v4.f32 	[%r8+0], {%f1,%f2,%f3,%f4};

This results in fewer instructions and, more importantly, one coalesced 128-bit write.

Has anyone had this kind of problem and how did they work around it?

BTW, when using uint4 instead of float4 and handling one component as a float, the compiler does produce something like the shorter PTX fragment above.
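The uint4 version was roughly this (rewritten from memory, so treat it as a sketch rather than the exact code I compiled):

__global__ void
testKernel( uint4* g_idata, uint4* g_odata)
{
  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
  uint4 c = g_idata[tid];
  // handle the second component as a float, store it back as a raw bit pattern
  c.y = __float_as_int( __int_as_float(c.y) + 1.0f );
  g_odata[tid] = c;
}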

To interpret the original PTX fragment:

(1) - load float4 structure (x,y,z,w), but discard z - it is not stored in any register at all!

(2) - store (x,y) as float2 structure.

(3) - load integer i from the third component

(4) - store that i to global again

(5) - store the w component to global.

To make copying faster, you could just try:

((float4*)(g_odata))[tid] = ((float4*)(g_idata))[tid];

Hopefully it will help - didn’t try it!

I tried your suggestion

But it did not help; exactly the same PTX code was generated.

Also, this is a somewhat simplified example; I actually want to use/change all 4 fields before writing them back with one coalesced write. But I do not yet see how to achieve this without hacking the PTX code directly.

Any other suggestions?

Thanks

Actually, looking at the original code again, I will revise my opinion a bit. The problem is the mixed types in the structure. The vector load/store instructions in PTX are type-specific, so the compiler should never issue an ld.global.v4.f32 {%f,%f,%f,%f} when there is an integer present in the structure, even though the structure could be cast to a 128-bit vector type.

I would guess that you will get cleaner-looking PTX for the store in your example (although still not what you want) if your structure were declared like this:

struct  __align__(16) DATA

{

	float u,v,f;

	int val;

};

but I am too lazy to test it myself. The workaround would probably be to declare a “transformative” structure for loads and stores using uint32_t or something, and then add explicit CUDA cast operators where necessary.
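Roughly what I have in mind (completely untested, and the names are just placeholders):

#include <stdint.h>

// all-integer "transfer" struct with the same size and alignment as DATA
struct __align__(16) DATA_RAW
{
	uint32_t u, v, val, f;
};

// with a single element type, the compiler should be free to use
// ld.global.v4.u32 / st.global.v4.u32 for a straight copy
__device__ void copyElement(DATA_RAW* dst, const DATA_RAW* src, int idx)
{
	dst[idx] = src[idx];
}

// explicit casts only where the float members are actually used
__device__ float    rawToFloat(uint32_t x) { return __int_as_float((int)x); }
__device__ uint32_t floatToRaw(float x)    { return (uint32_t)__float_as_int(x); }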

Well, that sounds somewhat logical, although I would expect a smart optimizing compiler to notice that the load/store does not modify the contents in this case, so it could do the transformation trick itself (loading as floats and reinterpreting as ints, or vice versa).

Also, that does not explain my second example, where the compiler stores the second element of the float4 as a signed integer. I explicitly requested an __int_as_float, and still the compiler decides to store the integer directly, requiring 3 writes instead of 1.

Are you sure you are reading the correct file? Because I just tried it and it worked with my compiler (nothing special, CUDA 2.3):

struct  __align__(16) DATA

{

	float u,v;

	int val;

	float f;

};

__global__ void

testKernel( DATA* g_idata, DATA* g_odata)

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

  ((float4*)g_odata)[tid] = ((float4*)g_idata)[tid];

}
$LBB1__Z10testKernelP4DATAS0_:

	mov.u16 	%rh1, %ctaid.x;

	mov.u16 	%rh2, %ntid.x;

	mul.wide.u16 	%r1, %rh1, %rh2;

	cvt.u32.u16 	%r2, %tid.x;

	add.u32 	%r3, %r2, %r1;

	cvt.u64.u32 	%rd1, %r3;

	mul.lo.u64 	%rd2, %rd1, 16;

	cvt.s32.u64 	%r4, %rd2;

	ld.param.u32 	%r5, [__cudaparm__Z10testKernelP4DATAS0__g_idata];

	add.u32 	%r6, %r4, %r5;

	ld.param.u32 	%r7, [__cudaparm__Z10testKernelP4DATAS0__g_odata];

	add.u32 	%r8, %r4, %r7;

	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r6+0];

	st.global.v4.f32 	[%r8+0], {%f1,%f2,%f3,%f4};

	exit;

$LDWend__Z10testKernelP4DATAS0_:

	} // _Z10testKernelP4DATAS0_

I am now experimenting to see if you could actually modify the data in between…

Edit: I tried to make it more general and easier to use:

struct  __align__(16) DATA

{

	float u,v;

	int val;

	float f;

};

__device__ DATA get(DATA *ptr, int idx) {

	float4 element=((float4*)ptr)[idx];

	DATA out;

	out.u=element.x;

	out.v=element.y;

	out.val=__float_as_int(element.z);

	out.f=element.w;

	return out;

}

__device__ void set(DATA *ptr, int idx, DATA value) {

	float4 element;

	element.x=value.u;

	element.y=value.v;

	element.z=__int_as_float(value.val);

	element.w=value.f;

	((float4*)ptr)[idx]=element;

}

__global__ void

testKernel( DATA* g_idata, DATA* g_odata)

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	DATA e=get(g_idata,tid);

	e.val+=1;

	set(g_odata,tid,e);

}

As a result, I get a good load instruction but I cannot make it produce a good store instruction :(

.entry _Z10testKernelP4DATAS0_ (

		.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_idata,

		.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_odata)

	{

	.reg .u16 %rh<4>;

	.reg .u32 %r<12>;

	.reg .u64 %rd<4>;

	.reg .f32 %f<6>;

$LBB1__Z10testKernelP4DATAS0_:

	mov.u16 	%rh1, %ctaid.x;

	mov.u16 	%rh2, %ntid.x;

	mul.wide.u16 	%r1, %rh1, %rh2;

	cvt.u32.u16 	%r2, %tid.x;

	add.u32 	%r3, %r2, %r1;

	cvt.u64.s32 	%rd1, %r3;

	mul.lo.u64 	%rd2, %rd1, 16;

	cvt.s32.u64 	%r4, %rd2;

	ld.param.u32 	%r5, [__cudaparm__Z10testKernelP4DATAS0__g_idata];

	add.u32 	%r6, %r4, %r5;

	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r6+0];

	ld.param.u32 	%r7, [__cudaparm__Z10testKernelP4DATAS0__g_odata];

	add.u32 	%r8, %r4, %r7;

	st.global.v2.f32 	[%r8+0], {%f1,%f2};

	mov.b32 	%r9, %f3;

	add.s32 	%r10, %r9, 1;

	st.global.s32 	[%r8+8], %r10;

	st.global.f32 	[%r8+12], %f4;

	exit;

$LDWend__Z10testKernelP4DATAS0_:

	} // _Z10testKernelP4DATAS0_

Note: you might worry about excessive register usage with this approach, yet the final code is optimised well enough that no extra registers are needed.

1>ptxas info	: Compiling entry function '_Z10testKernelP4DATAS0_'

1>ptxas info	: Used 6 registers, 8+16 bytes smem, 12 bytes cmem[0], 12 bytes cmem[14]

Thank you for trying this out for yourself!

I stand corrected; I tried it again and this time it worked, so I must have done something wrong the first time :">

About your modify test: it is pretty much what I get with my own modification code; I have not yet succeeded in tricking the compiler into producing 1 store instruction. It is possible by moving the data through shared memory once, but that’s just stupid.

This is quite annoying, as those non-coalesced writes are hurting my performance pretty hard. But I would rather code everything in C than have to write PTX myself. A compiler should be your friend, not your enemy.

Ok then, how about this:

#include <stdint.h>

struct  __align__(16) DATAA

{

	uint32_t u,v,val,f;

};

struct  __align__(16) DATAS

{

	float u,v;

	int val;

	float f;

};

union  __align__(16) DATAU

{

	struct DATAA array;

	struct DATAS structure;

};

__global__ void

testKernel( DATAU* g_idata, DATAU* g_odata)

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

g_odata[tid].array = g_idata[tid].array;

}

which compiles to this:

.entry _Z10testKernelP5DATAUS0_ (

				.param .u64 __cudaparm__Z10testKernelP5DATAUS0__g_idata,

				.param .u64 __cudaparm__Z10testKernelP5DATAUS0__g_odata)

		{

		.reg .u16 %rh<4>;

		.reg .u32 %r<9>;

		.reg .u64 %rd<8>;

		.loc	3	   22	  0

$LBB1__Z10testKernelP5DATAUS0_:

		.loc	3	   26	  0

		mov.u16		 %rh1, %ctaid.x;

		mov.u16		 %rh2, %ntid.x;

		mul.wide.u16	%r1, %rh1, %rh2;

		cvt.u32.u16	 %r2, %tid.x;

		add.u32		 %r3, %r2, %r1;

		cvt.u64.u32	 %rd1, %r3;

		mul.lo.u64	  %rd2, %rd1, 16;

		ld.param.u64	%rd3, [__cudaparm__Z10testKernelP5DATAUS0__g_idata];

		add.u64		 %rd4, %rd3, %rd2;

		ld.param.u64	%rd5, [__cudaparm__Z10testKernelP5DATAUS0__g_odata];

		add.u64		 %rd6, %rd5, %rd2;

		ld.global.v4.u32		{%r4,%r5,%r6,%r7}, [%rd4+0];

		st.global.v4.u32		[%rd6+0], {%r4,%r5,%r6,%r7};

		.loc	3	   28	  0

		exit;

$LDWend__Z10testKernelP5DATAUS0_:

		} // _Z10testKernelP5DATAUS0_

The C might not be all that pretty, but the PTX certainly is :)

Yes, but what happens when you start modifying the data between load and store? I couldn’t get the optimal results, even when using unions.
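What I tried was essentially this (reconstructed from memory, so only a sketch):

__global__ void
testKernel( DATAU* g_idata, DATAU* g_odata)
{
	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
	DATAU e;
	e.array = g_idata[tid].array;      // this load does come out as ld.global.v4.u32
	e.structure.val += 1;              // modify one field through the structure view
	g_odata[tid].array = e.array;      // but this store still ended up split for me
}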

Still works as before. This is through shared memory:

__global__ void

testKernel( DATAU* g_idata, DATAU* g_odata)

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

__shared__ DATAU update[128];

update[threadIdx.x].array = g_idata[tid].array;

  update[threadIdx.x].structure.val = tid;

g_odata[tid].array = update[threadIdx.x].array;

}

which compiles to

.entry _Z10testKernelP5DATAUS0_ (

				.param .u64 __cudaparm__Z10testKernelP5DATAUS0__g_idata,

				.param .u64 __cudaparm__Z10testKernelP5DATAUS0__g_odata)

		{

		.reg .u16 %rh<4>;

		.reg .u32 %r<13>;

		.reg .u64 %rd<14>;

		.shared .align 16 .b8 __cuda_update16[2048];

		.loc	3	   22	  0

$LBB1__Z10testKernelP5DATAUS0_:

		.loc	3	   28	  0

		mov.u16		 %rh1, %ctaid.x;

		mov.u16		 %rh2, %ntid.x;

		mul.wide.u16	%r1, %rh1, %rh2;

		cvt.u32.u16	 %r2, %tid.x;

		add.u32		 %r3, %r1, %r2;

		mov.u64		 %rd1, __cuda_update16;

		cvt.u64.u32	 %rd2, %r2;

		mul.lo.u64	  %rd3, %rd2, 16;

		add.u64		 %rd4, %rd1, %rd3;

		cvt.u64.u32	 %rd5, %r3;

		mul.lo.u64	  %rd6, %rd5, 16;

		ld.param.u64	%rd7, [__cudaparm__Z10testKernelP5DATAUS0__g_idata];

		add.u64		 %rd8, %rd7, %rd6;

		mov.s64		 %rd9, %rd4;

		ld.global.v4.u32		{%r4,%r5,%r6,%r7}, [%rd8+0];

		st.shared.u32   [%rd9+0], %r4;

		st.shared.u32   [%rd9+4], %r5;

		st.shared.u32   [%rd9+8], %r6;

		st.shared.u32   [%rd9+12], %r7;

		.loc	3	   29	  0

		st.shared.u32   [%rd4+8], %r3;

		.loc	3	   31	  0

		mov.s64		 %rd10, %rd4;

		ld.param.u64	%rd11, [__cudaparm__Z10testKernelP5DATAUS0__g_odata];

		add.u64		 %rd12, %rd11, %rd6;

		ld.shared.u32   %r8, [%rd10+0];

		ld.shared.u32   %r9, [%rd10+4];

		ld.shared.u32   %r10, [%rd10+8];

		ld.shared.u32   %r11, [%rd10+12];

		st.global.v4.u32		[%rd12+0], {%r8,%r9,%r10,%r11};

		.loc	3	   33	  0

		exit;

$LDWend__Z10testKernelP5DATAUS0_:

		} // _Z10testKernelP5DATAUS0_

Local memory works the same way.

The shared memory trick is also the best I could come up with. But it’s frustrating that I have to copy 4 registers through shared memory just because the compiler refuses to output the optimal PTX code.
There is no such thing as an inline assembler for CUDA, is there?

Officially no. Unofficially, yes there is. I would be interested to see some benchmarks of the various codes in this thread, if you have them. CUDA stores are “fire and forget”, so I wonder how much difference not having 128-bit transactions actually makes to the overall performance of the code.

I have the following code example, run with 10000 blocks of 128 threads each (which should give 100% occupancy):

__global__ void

testKernel( DATA* g_idata, DATA* g_odata) 

{

	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	

	for( unsigned int i = tid; i < tid + 128; i++ )

		((float4*)(g_odata))[i] = ((float4*)(g_idata))[i];

	// ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r8+0];

	// st.global.v4.f32 	[%r10+0], {%f1,%f2,%f3,%f4};

}

Processing time: 295.661804 (ms)

__global__ void

testKernel( DATA* g_idata, DATA* g_odata) 

{

	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	for( unsigned int i = tid; i < tid + 128; i++ )

		g_odata[i] = g_idata[i];

	// ld.global.v4.f32 	{%f1,%f2,_,%f3}, [%r8+0];

	// st.global.v2.f32 	[%r10+0], {%f1,%f2};

	// ld.global.s32 	%r14, [%r8+8];

	// st.global.s32 	[%r10+8], %r14;

	// st.global.f32 	[%r10+12], %f3;

}

Processing time: 679.136230 (ms)

Results are similar for repeated test-runs.

In this case I think the total bandwidth is probably the bottleneck, so the second fragment is presumably slower due to the extra “ld.global.s32 %r14, [%r8+8];” instruction.
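A rough back-of-the-envelope check of that (assuming every copy really goes to global memory and the loop is not optimised away): each thread copies 128 elements of 16 bytes, so the 10000 × 128 threads issue about 2.6 GB of loads plus 2.6 GB of stores, roughly 5.2 GB of traffic in total. That corresponds to about 17.7 GB/s for the float4 version (5.2 GB / 0.296 s) and about 7.7 GB/s for the struct version (5.2 GB / 0.679 s, not counting the extra 4-byte load), so the struct version achieves less than half the effective bandwidth of the float4 version.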

Also, if CUDA stores are “fire and forget”, why is it supposed to be beneficial to do coalesced stores? Only because an uncoalesced store is split into 16 memory transactions instead of 1?

That is my understanding, yes. I would agree that it is really the load latency that makes most of the difference. You need two load transactions per thread in the slow case and one in the fast case, and the performance ratio between the two is roughly 2x. It is probably also worth bearing in mind that we are only looking at PTX; there is still another level of runtime optimization that could change the character of the actual machine code run on the device.

In that case, optimizing the store instructions is not that big a deal, although 16 transactions can still be significant if the kernel is not bandwidth limited. Of course you are right that PTX is only an intermediate representation, but I highly doubt that the runtime optimization is capable of stitching the store sequence together into a single wide machine-code store instruction.

Please, please… how can I use the inline assembler? Any unofficial links, hints, etc …?

Tim Murray will hate me for this, and he will hate you too if you use it, but start here.

Many thanks! Now the problem is trivial:

#define USE_UNSUPPORTED

struct  __align__(16) DATA

{

	float u,v;

	int val;

	float f;

};

__device__ DATA get(DATA *ptr, int idx) {

	DATA out;

#ifdef USE_UNSUPPORTED

	DATA *from=&ptr[idx];

	float tmp;

	asm("ld.global.v4.f32 {%0,%1,%2,%3}, [%4+0]; \n" : "=f"(out.u) ,  "=f"(out.v) , "=f"(tmp) , "=f"(out.f) : "r"(from) );

	out.val=__float_as_int(tmp);

#else

	float4 element=((float4*)ptr)[idx];

	out.u=element.x;

	out.v=element.y;

	out.val=__float_as_int(element.z);

	out.f=element.w;

#endif

	return out;

}

__device__ void set(DATA *ptr, int idx, DATA value) {

#ifdef USE_UNSUPPORTED

	DATA *to=&ptr[idx];

	float tmp=__int_as_float(value.val);

	asm("st.global.v4.f32 [%4+0], {%0,%1,%2,%3}; \n" : : "f"(value.u) ,  "f"(value.v) , "f"(tmp) , "f"(value.f) , "r"(to) );

#else

	float4 element;

	element.x=value.u;

	element.y=value.v;

	element.z=__int_as_float(value.val);

	element.w=value.f;

	((float4*)ptr)[idx]=element;

#endif

}

__global__ void

testKernel( DATA* g_idata, DATA* g_odata)

{

  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	DATA e=get(g_idata,tid);

	e.val+=1;

	set(g_odata,tid,e);

}

Which produces somewhat ugly PTX code:

.entry _Z10testKernelP4DATAS0_ (

		.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_idata,

		.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_odata)

	{

	.reg .u16 %rh<4>;

	.reg .u32 %r<12>;

	.reg .u64 %rd<4>;

	.reg .f32 %f<14>;

	.loc	2	94	0

$LBB1__Z10testKernelP4DATAS0_:

	mov.u16 	%rh1, %ctaid.x;

	mov.u16 	%rh2, %ntid.x;

	mul.wide.u16 	%r1, %rh1, %rh2;

	cvt.u32.u16 	%r2, %tid.x;

	add.u32 	%r3, %r2, %r1;

	cvt.u64.s32 	%rd1, %r3;

	mul.lo.u64 	%rd2, %rd1, 16;

	cvt.s32.u64 	%r4, %rd2;

	ld.param.u32 	%r5, [__cudaparm__Z10testKernelP4DATAS0__g_idata];

	add.u32 	%r6, %r4, %r5;

	ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%r6+0]; 

	mov.f32 	%f5, %f1;

	mov.f32 	%f6, %f2;

	mov.f32 	%f7, %f3;

	mov.f32 	%f8, %f4;

	mov.f32 	%f9, %f5;

	mov.f32 	%f10, %f6;

	mov.b32 	%r7, %f7;

	add.s32 	%r8, %r7, 1;

	mov.b32 	%f11, %r8;

	mov.f32 	%f12, %f8;

	ld.param.u32 	%r9, [__cudaparm__Z10testKernelP4DATAS0__g_odata];

	add.u32 	%r10, %r4, %r9;

	st.global.v4.f32 [%r10+0], {%f9,%f10,%f11,%f12}; 

	exit;

$LDWend__Z10testKernelP4DATAS0_:

	} // _Z10testKernelP4DATAS0_

Fortunately, all those senseless register copy instructions get optimised away. I guess the reason for them is to ensure that registers used inside my asm() inline code are never accessed elsewhere.

Note: I assumed here that pointers are 32-bit values! When you compile a 64-bit version, the supported code will be transformed automatically, but my inlined asm code will not. Expect weird behaviour…
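If a 64-bit build is ever needed, something like the following should work. This is an untested sketch: I am assuming the "l" constraint is the one accepted for 64-bit pointer operands, and the __LP64__/_WIN64 check is just one way to detect a 64-bit build.

__device__ DATA get(DATA *ptr, int idx) {
	DATA out;
	DATA *from=&ptr[idx];
	float tmp;
#if defined(__LP64__) || defined(_WIN64)
	// 64-bit pointers need a 64-bit register constraint
	asm("ld.global.v4.f32 {%0,%1,%2,%3}, [%4+0]; \n" : "=f"(out.u) , "=f"(out.v) , "=f"(tmp) , "=f"(out.f) : "l"(from) );
#else
	asm("ld.global.v4.f32 {%0,%1,%2,%3}, [%4+0]; \n" : "=f"(out.u) , "=f"(out.v) , "=f"(tmp) , "=f"(out.f) : "r"(from) );
#endif
	out.val=__float_as_int(tmp);
	return out;
}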