Isn’t this a compiler bug in CUDA 2.0 and CUDA 2.1 beta?
The source code (.cu) is:
[codebox] unsigned int val = g_idata[tid];
sdata[ tid * 4 + 0] = tex1Dfetch( tex, (val >> 24) & 0xff );
sdata[ tid * 4 + 1] = tex1Dfetch( tex, (val >> 16) & 0xff );
sdata[ tid * 4 + 2] = tex1Dfetch( tex, (val >> 8) & 0xff );
sdata[ tid * 4 + 3] = tex1Dfetch( tex, (val ) & 0xff );
[/codebox]
The compilation result (.ptx) is:
[codebox] ld.global.u32 %r4, [%r3+0];
shr.u32 %r5, %r4, 24; // (val >> 24) & 0xff
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
tex.1d.v4.u32.s32 {%r9,%r10,%r11,%r12},[tex,{%r5,%r6,%r7,%r8}];
.loc 14 73 0
mov.s32 %r13, %r9;
mul.wide.u16 %r14, %rh1, 8;
mov.u32 %r15, __cuda_sdata12;
add.u32 %r16, %r14, %r15;
st.shared.u16 [%r16+0], %r13;
shl.b32 %r17, %r4, 8; //
shr.s32 %r18, %r17, 24; // (val >> 16) & 0xff ??
mov.s32 %r19, 0;
mov.s32 %r20, 0;
mov.s32 %r21, 0;
tex.1d.v4.u32.s32 {%r22,%r23,%r24,%r25},[tex,{%r18,%r19,%r20,%r21}];
.loc 14 74 0
mov.s32 %r26, %r22;
st.shared.u16 [%r16+2], %r26;
shl.b32 %r27, %r4, 16; //
shr.s32 %r28, %r27, 24; // (val >> 8) & 0xff ??
mov.s32 %r29, 0;
mov.s32 %r30, 0;
mov.s32 %r31, 0;
tex.1d.v4.u32.s32 {%r32,%r33,%r34,%r35},[tex,{%r28,%r29,%r30,%r31}];
.loc 14 75 0
mov.s32 %r36, %r32;
st.shared.u16 [%r16+4], %r36;
and.b32 %r37, %r4, 255; // (val ) & 0xff
mov.s32 %r38, 0;
mov.s32 %r39, 0;
mov.s32 %r40, 0;
tex.1d.v4.u32.s32 {%r41,%r42,%r43,%r44},[tex,{%r37,%r38,%r39,%r40}];
.loc 14 76 0
mov.s32 %r45, %r41;
st.shared.u16 [%r16+6], %r45;
[/codebox]
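The two ‘shr.s32’ instructions (for the ‘(val >> 16) & 0xff’ and ‘(val >> 8) & 0xff’ fetches) must be ‘shr.u32’. An arithmetic right shift sign-extends, so whenever the top bit of the extracted byte is set, the texture index becomes negative instead of lying in the range 0–255. For example, with val = 0x00800000, ‘shl.b32 …, 8’ gives 0x80000000 and ‘shr.s32 …, 24’ then gives 0xFFFFFF80 (-128), whereas ‘(val >> 16) & 0xff’ should be 0x80 (128).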