Why the memory access in the second example is coalesced is clear to me, I’m trying to understand the “instruction fusion”. I changed the example above a bit, which resulted into a completely different PTX code, but I failed to understand it.
#include <cuda.h>
typedef unsigned int uint;
__global__ void kernel(float3 *a)
{
float3 v1 = a[1];
float3 v2 = a[2];
float b = v1.x + v1.y + v1.z + v2.x + v2.y + v2.z;
a[0].x = b;
}
int main()
{
uint n = 100;
float3 *aHost = new float3[n];
for(uint i=0; i<n; i++) {
aHost[i].x = i;
aHost[i].y = i + 1;
aHost[i].z = i + 2;
}
float3 *aDevice;
cudaMalloc((void**)&aDevice, sizeof(float3)*n);
cudaMemcpy(aDevice, aHost, sizeof(float3)*n, cudaMemcpyHostToDevice);
kernel<<<1,1>>>(aDevice);
return 0;
}
$LDWbegin__Z6kernelPf:
$LDWbeginblock_181_1:
.loc 17 7 0
ld.param.u64 %rd1, [__cudaparm__Z6kernelPf_a];
ld.global.f32 %f1, [%rd1+0];
st.param.f32 [__cudaparma1__Z11make_float3fff], %f1;
ld.param.u64 %rd2, [__cudaparm__Z6kernelPf_a];
ld.global.f32 %f2, [%rd2+4];
st.param.f32 [__cudaparma2__Z11make_float3fff], %f2;
ld.param.u64 %rd3, [__cudaparm__Z6kernelPf_a];
ld.global.f32 %f3, [%rd3+8];
st.param.f32 [__cudaparma3__Z11make_float3fff], %f3;
call.uni (__cudareta__Z11make_float3fff), _Z11make_float3fff, (__cudaparma1__Z11make_float3fff, __cudaparma2__Z11make_float3fff, __cudaparma3__Z11make_float3fff);
ld.param.f32 %f4, [__cudareta__Z11make_float3fff+0];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff36_16+0], %f4;
ld.param.f32 %f5, [__cudareta__Z11make_float3fff+4];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff36_16+4], %f5;
ld.param.f32 %f6, [__cudareta__Z11make_float3fff+8];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff36_16+8], %f6;
ld.local.f32 %f7, [__cuda__temp__Mreturn__Z11make_float3fff36_16+0];
st.local.f32 [__cuda_local_var_39816_9_non_const_v1_32+0], %f7;
ld.local.f32 %f8, [__cuda__temp__Mreturn__Z11make_float3fff36_16+4];
st.local.f32 [__cuda_local_var_39816_9_non_const_v1_32+4], %f8;
ld.local.f32 %f9, [__cuda__temp__Mreturn__Z11make_float3fff36_16+8];
st.local.f32 [__cuda_local_var_39816_9_non_const_v1_32+8], %f9;
.loc 17 8 0
ld.param.u64 %rd4, [__cudaparm__Z6kernelPf_a];
ld.f32 %f10, [%rd4+12];
st.param.f32 [__cudaparma1__Z11make_float3fff], %f10;
ld.param.u64 %rd5, [__cudaparm__Z6kernelPf_a];
ld.f32 %f11, [%rd5+16];
st.param.f32 [__cudaparma2__Z11make_float3fff], %f11;
ld.param.u64 %rd6, [__cudaparm__Z6kernelPf_a];
ld.f32 %f12, [%rd6+20];
st.param.f32 [__cudaparma3__Z11make_float3fff], %f12;
call.uni (__cudareta__Z11make_float3fff), _Z11make_float3fff, (__cudaparma1__Z11make_float3fff, __cudaparma2__Z11make_float3fff, __cudaparma3__Z11make_float3fff);
ld.param.f32 %f13, [__cudareta__Z11make_float3fff+0];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff37_48+0], %f13;
ld.param.f32 %f14, [__cudareta__Z11make_float3fff+4];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff37_48+4], %f14;
ld.param.f32 %f15, [__cudareta__Z11make_float3fff+8];
st.local.f32 [__cuda__temp__Mreturn__Z11make_float3fff37_48+8], %f15;
ld.local.f32 %f16, [__cuda__temp__Mreturn__Z11make_float3fff37_48+0];
st.local.f32 [__cuda_local_var_39817_9_non_const_v2_64+0], %f16;
ld.local.f32 %f17, [__cuda__temp__Mreturn__Z11make_float3fff37_48+4];
st.local.f32 [__cuda_local_var_39817_9_non_const_v2_64+4], %f17;
ld.local.f32 %f18, [__cuda__temp__Mreturn__Z11make_float3fff37_48+8];
st.local.f32 [__cuda_local_var_39817_9_non_const_v2_64+8], %f18;
.loc 17 10 0
ld.local.f32 %f19, [__cuda_local_var_39817_9_non_const_v2_64+8];
ld.local.f32 %f20, [__cuda_local_var_39817_9_non_const_v2_64+4];
ld.local.f32 %f21, [__cuda_local_var_39817_9_non_const_v2_64+0];
ld.local.f32 %f22, [__cuda_local_var_39816_9_non_const_v1_32+8];
ld.local.f32 %f23, [__cuda_local_var_39816_9_non_const_v1_32+0];
ld.local.f32 %f24, [__cuda_local_var_39816_9_non_const_v1_32+4];
add.f32 %f25, %f23, %f24;
add.f32 %f26, %f22, %f25;
add.f32 %f27, %f21, %f26;
add.f32 %f28, %f20, %f27;
add.f32 %f29, %f19, %f28;
mov.f32 %f30, %f29;
.loc 17 12 0
mov.f32 %f31, %f30;
ld.param.u64 %rd7, [__cudaparm__Z6kernelPf_a];
st.f32 [%rd7+0], %f31;
$LDWendblock_181_1:
.loc 17 13 0
exit;
$LDWend__Z6kernelPf:
} // _Z6kernelPf
Now there seem to be only three fetches to the global memory, but it seems to just fetch one float each time and ld is used instead of ldu. What’s the difference?. And make_float3 isn’t inlined??