Are you sure you are reading correct file? Because I just tried that and it worked on my compiler (nothing special, CUDA 2.3)
struct __align__(16) DATA
{
float u,v;
int val;
float f;
};
__global__ void
testKernel( DATA* g_idata, DATA* g_odata)
{
const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
((float4*)g_odata)[tid] = ((float4*)g_idata)[tid];
}
$LBB1__Z10testKernelP4DATAS0_:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.u32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
cvt.u64.u32 %rd1, %r3;
mul.lo.u64 %rd2, %rd1, 16;
cvt.s32.u64 %r4, %rd2;
ld.param.u32 %r5, [__cudaparm__Z10testKernelP4DATAS0__g_idata];
add.u32 %r6, %r4, %r5;
ld.param.u32 %r7, [__cudaparm__Z10testKernelP4DATAS0__g_odata];
add.u32 %r8, %r4, %r7;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%r6+0];
st.global.v4.f32 [%r8+0], {%f1,%f2,%f3,%f4};
exit;
$LDWend__Z10testKernelP4DATAS0_:
} // _Z10testKernelP4DATAS0_
I am now experimenting to see if you could actually modify the data in between…
Edit: I tried to be more general and easy to use:
struct __align__(16) DATA
{
float u,v;
int val;
float f;
};
__device__ DATA get(DATA *ptr, int idx) {
float4 element=((float4*)ptr)[idx];
DATA out;
out.u=element.x;
out.v=element.y;
out.val=__float_as_int(element.z);
out.f=element.w;
return out;
}
__device__ void set(DATA *ptr, int idx, DATA value) {
float4 element;
element.x=value.u;
element.y=value.v;
element.z=__int_as_float(value.val);
element.w=value.f;
((float4*)ptr)[idx]=element;
}
__global__ void
testKernel( DATA* g_idata, DATA* g_odata)
{
const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
DATA e=get(g_idata,tid);
e.val+=1;
set(g_odata,tid,e);
}
As a result, I get a good load instruction but I cannot make it produce a good store instruction :(
.entry _Z10testKernelP4DATAS0_ (
.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_idata,
.param .u32 __cudaparm__Z10testKernelP4DATAS0__g_odata)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .u64 %rd<4>;
.reg .f32 %f<6>;
$LBB1__Z10testKernelP4DATAS0_:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.u32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
cvt.u64.s32 %rd1, %r3;
mul.lo.u64 %rd2, %rd1, 16;
cvt.s32.u64 %r4, %rd2;
ld.param.u32 %r5, [__cudaparm__Z10testKernelP4DATAS0__g_idata];
add.u32 %r6, %r4, %r5;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%r6+0];
ld.param.u32 %r7, [__cudaparm__Z10testKernelP4DATAS0__g_odata];
add.u32 %r8, %r4, %r7;
st.global.v2.f32 [%r8+0], {%f1,%f2};
mov.b32 %r9, %f3;
add.s32 %r10, %r9, 1;
st.global.s32 [%r8+8], %r10;
st.global.f32 [%r8+12], %f4;
exit;
$LDWend__Z10testKernelP4DATAS0_:
} // _Z10testKernelP4DATAS0_
Note: you might worry about excessive register usage with this setting, yet final code is optimised well enough so that no extra register memory is needed.
1>ptxas info : Compiling entry function '_Z10testKernelP4DATAS0_'
1>ptxas info : Used 6 registers, 8+16 bytes smem, 12 bytes cmem[0], 12 bytes cmem[14]