Hello all,
My kernel code looks like this:
// Writes some_const to some_output[0], with the sign of the z component
// flipped when the y component is negative.
//   some_const  - input vector, passed by value
//   some_output - destination buffer; only element 0 is written
__kernel void showcase(const float4 some_const, global float4* some_output)
{
// Mutable copy of the const argument so a single component can be changed.
float4 b = some_const;
// Only z is ever modified; x, y and w are passed through unchanged.
if(b.y < 0.f)
b.z = -b.z;
// The index is the constant 0 -- no work-item id is used here.
some_output[0] = b;
}
and the corresponding PTX output looks like this:
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Sep 12 07:12:40 2013 (1378962760)
// Driver
//
// Module directives: PTX ISA version 3.0, sm_20 target with independent
// texture mode, 32-bit addresses.
.version 3.0
.target sm_20, texmode_independent
.address_size 32
// Entry point for the showcase kernel.
//   showcase_param_0 - 16 bytes, 16-byte aligned (the by-value float4 argument)
//   showcase_param_1 - 32-bit pointer into the global state space, 16-byte aligned
.entry showcase(
.param .align 16 .b8 showcase_param_0[16],
.param .u32 .ptr .global .align 16 showcase_param_1
)
{
// Virtual register declarations: 20 x f32, 3 x predicate, 3 x s32.
.reg .f32 %f<20>;
.reg .pred %p<3>;
.reg .s32 %r<3>;
// %r1 receives the address of param 0; it is never read again in this listing.
mov.b32 %r1, showcase_param_0;
// %r2 = output pointer (param 1).
ld.param.u32 %r2, [showcase_param_1];
// Load the four vector components into %f16..%f19 (x, y, z, w in order).
ld.param.v4.f32 {%f16, %f17, %f18, %f19}, [showcase_param_0];
// %p1 = (y < 0.0f); 0f00000000 is the hex encoding of 0.0f.
setp.lt.f32 %p1, %f17, 0f00000000;
// %p2 = !%p1, so the branch below skips the negate block when y is not < 0.
not.pred %p2, %p1;
@%p2 bra BB0_2;
// Negate block: %f3 = -z.
neg.f32 %f3, %f18;
// The moves into %f16, %f17 and %f19 copy each register onto itself and leave
// its contents unchanged; only the move into %f18 (z = %f3) changes a value.
// NOTE(review): presumably the compiler rebuilds the whole 4-component value
// after the update; PTX is an intermediate form, so these self-moves are
// expected to disappear when the driver generates machine code -- confirm by
// inspecting the final binary.
mov.f32 %f16, %f16;
mov.f32 %f17, %f17;
mov.f32 %f18, %f3;
mov.f32 %f19, %f19;
// Join point for both paths.
BB0_2:
// Store all four components to the address held in %r2.
st.global.v4.f32 [%r2], {%f16, %f17, %f18, %f19};
ret;
}
My question is: why does it rewrite all four components of the float4 variable? When I comment out the redundant mov instructions after the branch, the modified listing still works correctly, so they definitely seem redundant to me.
Specs: GTX 470 + Driver 327.23, Windows 8.1