CUDA memory transactions

This is quite a fundamental question, but I still don’t understand it completely: as shown in the matrix multiplication example, multiple threads can be used to fetch data in parallel. This is also described in section G.3.2.2 of the programming guide.

Now assume that I have statements like

float3 a, b;
a = make_float3(ptr[0], ptr[1], ptr[2]);
b = make_float3(ptr[3], ptr[4], ptr[5]);

in my kernel code. We fetch 3 floats twice, i.e. 12 bytes each time. Since both lines are executed sequentially by the same thread, does this result in two transactions, or does the compiler (or driver) convert this into one transaction?

That should be three separate transactions for each of those loads (a quick look at the PTX should confirm it).

How can I do this in a single transaction? Of course I could just use 6 threads, but is there another way? How can I look at the PTX assembly code?

Using 6 threads makes no difference in this case. The bit of the documentation you are quoting is talking about transactions per warp (i.e. 32 threads). If you are reading floats (so 4 bytes per thread), the accesses of the whole warp must fall into the same 128-byte segment of memory to be combined into a single transaction. There is no way your code snippet can satisfy that.
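
To make that concrete, here is a minimal sketch (copyKernel is just an illustrative name, not code from this thread): if each of the 32 threads in a warp reads one consecutive float, all of their addresses fall into the same 128-byte segment and the warp's reads coalesce into a single transaction.

__global__ void copyKernel(float *out, const float *in)
{
	// Thread i of a warp touches in[base + i]; 32 consecutive 4-byte floats
	// span exactly one 128-byte segment (assuming the usual cudaMalloc alignment),
	// so the whole warp issues a single read transaction.
	int i = threadIdx.x + blockDim.x * blockIdx.x;
	out[i] = in[i];
}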

nvcc -ptx file.cu

I just wrote a small program to try this out.

#include <cuda.h>

typedef unsigned int uint;

__global__ void kernel(float *a)
{
	float3 v1 = make_float3(a[0], a[1], a[2]);
	float3 v2 = make_float3(a[3], a[4], a[5]);

	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
	a[0] = b;
}

int main()
{
	uint n = 100;
	float *aHost = new float[n];
	for(uint i=0; i<n; i++)
		aHost[i] = i;

	float *aDevice;
	cudaMalloc((void**)&aDevice, sizeof(float)*n);
	cudaMemcpy(aDevice, aHost, sizeof(float)*n, cudaMemcpyHostToDevice);

	kernel<<<1,1>>>(aDevice);
	return 0;
}

Here is an excerpt from cuobjdump --dump-ptx:

ldu.global.f32  %f1, [%rd1+20];
ldu.global.f32  %f2, [%rd1+16];
ldu.global.f32  %f3, [%rd1+12];
ldu.global.f32  %f4, [%rd1+8];
ldu.global.f32  %f5, [%rd1+0];
ldu.global.f32  %f6, [%rd1+4];

ldu.global looks like a fetch from global memory, so you were right. But does coalescing happen at compile time? I’m not sure the driver could do that either…

I don’t understand this: the floats are within a 128-byte segment, so what would a code snippet look like that could fetch all the values at the same time?

You are confusing coalescing, where different threads from the same warp merge their memory loads or stores into a single large transaction, with some sort of thread-level “instruction fusion”, where several memory instructions are replaced by a single instruction that loads a larger type. They are not the same thing at all.

This is uncoalesced, with six 4-byte loads per thread:

__global__ void kernel(float *a)
{
	int tidx = 6 * (threadIdx.x + blockDim.x * blockIdx.x);
	float3 v1 = make_float3(a[tidx], a[tidx+1], a[tidx+2]);
	float3 v2 = make_float3(a[tidx+3], a[tidx+4], a[tidx+5]);

	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
	a[tidx] = b;
}

whereas something like this would be coalesced, with six 128-byte loads per 32 threads:

__global__ void kernel(float *a)
{
	int tidx = threadIdx.x + blockDim.x * blockIdx.x;
	float3 v1 = make_float3(a[tidx], a[tidx+32], a[tidx+64]);
	float3 v2 = make_float3(a[tidx+128], a[tidx+160], a[tidx+192]);

	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
	a[tidx] = b;
}

To the best of my knowledge there is no way to load a float3 in a single instruction; only 2- or 4-element vectors of 1-, 2-, 4- or 8-byte types are supported in PTX.
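
For illustration only (these two lines are hand-written to show the forms, not taken from any compiler output in this thread), the vector load variants PTX does define look like this; there is no v3 variant:

ld.global.v2.f32 	{%f1,%f2}, [%rd1+0];          // one 8-byte load, e.g. a float2
ld.global.v4.f32 	{%f3,%f4,%f5,%f6}, [%rd2+0];  // one 16-byte load, e.g. a float4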

Why the memory access in the second example is coalesced is clear to me; I’m trying to understand the “instruction fusion”. I changed the example above a bit, which resulted in completely different PTX code, but I failed to understand it.

#include <cuda.h>

typedef unsigned int uint;

__global__ void kernel(float3 *a)
{
	float3 v1 = a[1];
	float3 v2 = a[2];
	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
	a[0].x = b;
}

int main()
{
	uint n = 100;
	float3 *aHost = new float3[n];
	for(uint i=0; i<n; i++) {
		aHost[i].x = i;
		aHost[i].y = i + 1;
		aHost[i].z = i + 2;
	}

	float3 *aDevice;
	cudaMalloc((void**)&aDevice, sizeof(float3)*n);
	cudaMemcpy(aDevice, aHost, sizeof(float3)*n, cudaMemcpyHostToDevice);

	kernel<<<1,1>>>(aDevice);
	return 0;
}

$LDWbegin__Z6kernelPf:
$LDWbeginblock_181_1:
	.loc	17	7	0
	ld.param.u64 	%rd1, [__cudaparm__Z6kernelPf_a];
	ld.global.f32 	%f1, [%rd1+0];
	st.param.f32 	[__cudaparma1__Z11make_float3fff], %f1;
	ld.param.u64 	%rd2, [__cudaparm__Z6kernelPf_a];
	ld.global.f32 	%f2, [%rd2+4];
	st.param.f32 	[__cudaparma2__Z11make_float3fff], %f2;
	ld.param.u64 	%rd3, [__cudaparm__Z6kernelPf_a];
	ld.global.f32 	%f3, [%rd3+8];
	st.param.f32 	[__cudaparma3__Z11make_float3fff], %f3;
	call.uni (__cudareta__Z11make_float3fff), _Z11make_float3fff, (__cudaparma1__Z11make_float3fff, __cudaparma2__Z11make_float3fff, __cudaparma3__Z11make_float3fff);
	ld.param.f32 	%f4, [__cudareta__Z11make_float3fff+0];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff36_16+0], %f4;
	ld.param.f32 	%f5, [__cudareta__Z11make_float3fff+4];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff36_16+4], %f5;
	ld.param.f32 	%f6, [__cudareta__Z11make_float3fff+8];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff36_16+8], %f6;
	ld.local.f32 	%f7, [__cuda__temp__Mreturn__Z11make_float3fff36_16+0];
	st.local.f32 	[__cuda_local_var_39816_9_non_const_v1_32+0], %f7;
	ld.local.f32 	%f8, [__cuda__temp__Mreturn__Z11make_float3fff36_16+4];
	st.local.f32 	[__cuda_local_var_39816_9_non_const_v1_32+4], %f8;
	ld.local.f32 	%f9, [__cuda__temp__Mreturn__Z11make_float3fff36_16+8];
	st.local.f32 	[__cuda_local_var_39816_9_non_const_v1_32+8], %f9;
	.loc	17	8	0
	ld.param.u64 	%rd4, [__cudaparm__Z6kernelPf_a];
	ld.f32 	%f10, [%rd4+12];
	st.param.f32 	[__cudaparma1__Z11make_float3fff], %f10;
	ld.param.u64 	%rd5, [__cudaparm__Z6kernelPf_a];
	ld.f32 	%f11, [%rd5+16];
	st.param.f32 	[__cudaparma2__Z11make_float3fff], %f11;
	ld.param.u64 	%rd6, [__cudaparm__Z6kernelPf_a];
	ld.f32 	%f12, [%rd6+20];
	st.param.f32 	[__cudaparma3__Z11make_float3fff], %f12;
	call.uni (__cudareta__Z11make_float3fff), _Z11make_float3fff, (__cudaparma1__Z11make_float3fff, __cudaparma2__Z11make_float3fff, __cudaparma3__Z11make_float3fff);
	ld.param.f32 	%f13, [__cudareta__Z11make_float3fff+0];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff37_48+0], %f13;
	ld.param.f32 	%f14, [__cudareta__Z11make_float3fff+4];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff37_48+4], %f14;
	ld.param.f32 	%f15, [__cudareta__Z11make_float3fff+8];
	st.local.f32 	[__cuda__temp__Mreturn__Z11make_float3fff37_48+8], %f15;
	ld.local.f32 	%f16, [__cuda__temp__Mreturn__Z11make_float3fff37_48+0];
	st.local.f32 	[__cuda_local_var_39817_9_non_const_v2_64+0], %f16;
	ld.local.f32 	%f17, [__cuda__temp__Mreturn__Z11make_float3fff37_48+4];
	st.local.f32 	[__cuda_local_var_39817_9_non_const_v2_64+4], %f17;
	ld.local.f32 	%f18, [__cuda__temp__Mreturn__Z11make_float3fff37_48+8];
	st.local.f32 	[__cuda_local_var_39817_9_non_const_v2_64+8], %f18;
	.loc	17	10	0
	ld.local.f32 	%f19, [__cuda_local_var_39817_9_non_const_v2_64+8];
	ld.local.f32 	%f20, [__cuda_local_var_39817_9_non_const_v2_64+4];
	ld.local.f32 	%f21, [__cuda_local_var_39817_9_non_const_v2_64+0];
	ld.local.f32 	%f22, [__cuda_local_var_39816_9_non_const_v1_32+8];
	ld.local.f32 	%f23, [__cuda_local_var_39816_9_non_const_v1_32+0];
	ld.local.f32 	%f24, [__cuda_local_var_39816_9_non_const_v1_32+4];
	add.f32 	%f25, %f23, %f24;
	add.f32 	%f26, %f22, %f25;
	add.f32 	%f27, %f21, %f26;
	add.f32 	%f28, %f20, %f27;
	add.f32 	%f29, %f19, %f28;
	mov.f32 	%f30, %f29;
	.loc	17	12	0
	mov.f32 	%f31, %f30;
	ld.param.u64 	%rd7, [__cudaparm__Z6kernelPf_a];
	st.f32 	[%rd7+0], %f31;
$LDWendblock_181_1:
	.loc	17	13	0
	exit;
$LDWend__Z6kernelPf:
	} // _Z6kernelPf

Now there seem to be only three fetches from global memory, but it seems to just fetch one float each time, and ld is used instead of ldu. What’s the difference? And make_float3 isn’t inlined??

There are 6 fetches in that code. Each element of the float3 is loaded as a separate 4-byte transaction. But that PTX doesn’t match the C code you posted along with it, so I am not sure what it is you are trying to show. The “palette” of memory operations available in PTX is nicely documented and included in every version of the toolkit. It might be worth having a look at it to understand what is and is not possible in PTX.

The difference between LDU and LD is that LDU goes through the constant memory cache. It probably means that one version of the code was compiled for sm_20 (i.e. Fermi), where kernel arguments are stored in constant memory, and the other for sm_10 or sm_13.
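
If you want to check that yourself, you can generate PTX for both targets and compare the load instructions (standard nvcc flags; substitute your own source file name):

nvcc -arch=sm_13 -ptx file.cu -o file_sm13.ptx
nvcc -arch=sm_20 -ptx file.cu -o file_sm20.ptx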

A “proper” example of vector types in PTX would look like this:

.entry _Z7kernel2P6float4 (
		.param .u64 __cudaparm__Z7kernel2P6float4_a)
	{
	.reg .u64 %rd<3>;
	.reg .f32 %f<13>;
	.loc	17	12	0
 //  10  
 //  11  
 //  12  __global__ void kernel2(float4 *a)
$LDWbegin__Z7kernel2P6float4:
	.loc	17	21	0
 //  17  	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
 //  18  	void(v1.w);
 //  19  	void(v2.w);
 //  20  
 //  21  	a[0].x = b;
	ld.param.u64 	%rd1, [__cudaparm__Z7kernel2P6float4_a];
	ldu.global.v4.f32 	{%f1,%f2,%f3,_}, [%rd1+32];
	ldu.global.v4.f32 	{%f4,%f5,%f6,_}, [%rd1+16];
	add.f32 	%f7, %f4, %f5;
	add.f32 	%f8, %f6, %f7;
	add.f32 	%f9, %f1, %f8;
	add.f32 	%f10, %f2, %f9;
	add.f32 	%f11, %f3, %f10;
	st.global.f32 	[%rd1+0], %f11;
	.loc	17	22	0
 //  22  }
	exit;
$LDWend__Z7kernel2P6float4:
	} // _Z7kernel2P6float4

where each float4 load is compiled to a single 16-byte instruction. This says nothing about whether the access would be coalesced or not.
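
For comparison, a per-thread float4 access pattern like the sketch below (my illustration, not one of the test programs in this thread) would give you both: a single 16-byte vector load per thread and coalesced traffic, since the warp’s 32 reads cover four consecutive, fully used 128-byte segments.

__global__ void kernel3(float4 *a)
{
	// Consecutive threads read consecutive float4 elements, so each warp
	// touches one contiguous, 16-byte-aligned span of 512 bytes.
	int tidx = threadIdx.x + blockDim.x * blockIdx.x;
	float4 v = a[tidx];
	a[tidx].x = v.x + v.y + v.z + v.w;
}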

Vector loading is only supported for vec2 or vec4, BUT NOT FOR vec3! I saw this when rereading the PTX manual. Here is an example which generates ldu.global.v4.f32 instructions:

#include <cuda.h>

typedef unsigned int uint;

__global__ void kernel(float4 *a)
{
	float4 v1 = a[1];
	float4 v2 = a[2];
	float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
	a[0].x = b;
}

int main()
{
	uint n = 100;
	// Note that vector loading is only supported for float2 and float4!
	float4 *aHost = new float4[n];
	for(uint i=0; i<n; i++) {
		aHost[i].x = i;
		aHost[i].y = i + 1;
		aHost[i].z = i + 2;
	}

	float4 *aDevice;
	cudaMalloc((void**)&aDevice, sizeof(float4)*n);
	cudaMemcpy(aDevice, aHost, sizeof(float4)*n, cudaMemcpyHostToDevice);

	kernel<<<1,1>>>(aDevice);
	return 0;
}

I did tell you that in post #6 of this thread, two weeks ago, as well as posting almost identical code to yours in the post directly preceding yours…