The bug is fixed in CUDA 2.2:
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda-linux64-rel-nightly-2.2.1636/open64/lib//be
// nvopencc built on 2009-03-05
.reg .u32 %ra<17>;
.reg .u64 %rda<17>;
.reg .f32 %fa<17>;
.reg .f64 %fda<17>;
.reg .u32 %rv<5>;
.reg .u64 %rdv<5>;
.reg .f32 %fv<5>;
.reg .f64 %fdv<5>;
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000a0e_00000000-7_bug_ptx.cpp3.i (/tmp/ccBI#.0CZnUP)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "/tmp/tmpxft_00000a0e_00000000-6_bug_ptx.cudafe2.gpu"
.file 2 "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"
.file 3 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 4 "/usr/local/cuda/bin/../include/host_defines.h"
.file 5 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 6 "/usr/local/cuda/bin/../include/device_types.h"
.file 7 "/usr/local/cuda/bin/../include/driver_types.h"
.file 8 "/usr/local/cuda/bin/../include/texture_types.h"
.file 9 "/usr/local/cuda/bin/../include/vector_types.h"
.file 10 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 11 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 12 "/usr/include/bits/types.h"
.file 13 "/usr/include/time.h"
.file 14 "bug_ptx.cu"
.file 15 "/usr/local/cuda/bin/../include/common_functions.h"
.file 16 "/usr/local/cuda/bin/../include/crt/func_macro.h"
.file 17 "/usr/local/cuda/bin/../include/math_functions.h"
.file 18 "/usr/local/cuda/bin/../include/device_functions.h"
.file 19 "/usr/local/cuda/bin/../include/math_constants.h"
.file 20 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 21 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 23 "/usr/local/cuda/bin/../include/common_types.h"
.file 24 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 25 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.tex .u64 trcTexture;
.tex .u64 sourceTexture;
.entry _Z10checkBuggyPj (
.param .u64 __cudaparm__Z10checkBuggyPj_dst)
{
.reg .u16 %rh<4>;
.reg .u32 %r<24>;
.reg .u64 %rd<6>;
.loc 14 9 0
$LBB1__Z10checkBuggyPj:
cvt.u32.u16 %r1, %tid.x; //
mov.u16 %rh1, %ctaid.x; //
mov.u16 %rh2, %ntid.x; //
mul.wide.u16 %r2, %rh1, %rh2; //
add.u32 %r3, %r1, %r2; //
mov.s32 %r4, %r3; //
mov.s32 %r5, 0; //
mov.s32 %r6, 0; //
mov.s32 %r7, 0; //
tex.1d.v4.u32.s32 {%r8,%r9,%r10,%r11},[sourceTexture,{%r4,%r5,%r6,%r7}];
.loc 14 12 0
mov.s32 %r12, %r8; //
cvt.u8.u32 %r13, %r12; //
mov.s32 %r14, 0; //
mov.s32 %r15, 0; //
mov.s32 %r16, 0; //
tex.1d.v4.u32.s32 {%r17,%r18,%r19,%r20},[trcTexture,{%r13,%r14,%r15,%r16}];
.loc 14 17 0
mov.s32 %r21, %r17; //
.loc 14 18 0
cvt.u16.u32 %r22, %r21; //
ld.param.u64 %rd1, [__cudaparm__Z10checkBuggyPj_dst]; // id:26 __cudaparm__Z10checkBuggyPj_dst+0x0
cvt.u64.s32 %rd2, %r3; //
mul.lo.u64 %rd3, %rd2, 4; //
add.u64 %rd4, %rd1, %rd3; //
st.global.u32 [%rd4+0], %r22; // id:27
.loc 14 19 0
exit; //
$LDWend__Z10checkBuggyPj:
} // _Z10checkBuggyPj
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda-linux64-rel-nightly-2.2.1636/open64/lib//be
// nvopencc built on 2009-03-05
.reg .u32 %ra<17>;
.reg .u64 %rda<17>;
.reg .f32 %fa<17>;
.reg .f64 %fda<17>;
.reg .u32 %rv<5>;
.reg .u64 %rdv<5>;
.reg .f32 %fv<5>;
.reg .f64 %fdv<5>;
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000a2f_00000000-7_bug_ptx2.cpp3.i (/tmp/ccBI#.QIglVC)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "/tmp/tmpxft_00000a2f_00000000-6_bug_ptx2.cudafe2.gpu"
.file 2 "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"
.file 3 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 4 "/usr/local/cuda/bin/../include/host_defines.h"
.file 5 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 6 "/usr/local/cuda/bin/../include/device_types.h"
.file 7 "/usr/local/cuda/bin/../include/driver_types.h"
.file 8 "/usr/local/cuda/bin/../include/texture_types.h"
.file 9 "/usr/local/cuda/bin/../include/vector_types.h"
.file 10 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 11 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 12 "/usr/include/bits/types.h"
.file 13 "/usr/include/time.h"
.file 14 "bug_ptx2.cu"
.file 15 "/usr/local/cuda/bin/../include/common_functions.h"
.file 16 "/usr/local/cuda/bin/../include/crt/func_macro.h"
.file 17 "/usr/local/cuda/bin/../include/math_functions.h"
.file 18 "/usr/local/cuda/bin/../include/device_functions.h"
.file 19 "/usr/local/cuda/bin/../include/math_constants.h"
.file 20 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 21 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 23 "/usr/local/cuda/bin/../include/common_types.h"
.file 24 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 25 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.tex .u64 testTexture;
.entry _Z11testSignBugPf (
.param .u64 __cudaparm__Z11testSignBugPf_dst)
{
.reg .u32 %r<7>;
.reg .u64 %rd<6>;
.reg .f32 %f<7>;
.loc 14 3 0
$LBB1__Z11testSignBugPf:
cvt.u32.u16 %r1, %tid.x; //
cvt.u8.u32 %r2, %r1; //
mov.s32 %r3, 0; //
mov.s32 %r4, 0; //
mov.s32 %r5, 0; //
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[testTexture,{%r2,%r3,%r4,%r5}];
.loc 14 6 0
mov.f32 %f5, %f1; //
.loc 14 7 0
ld.param.u64 %rd1, [__cudaparm__Z11testSignBugPf_dst]; // id:14 __cudaparm__Z11testSignBugPf_dst+0x0
cvt.u64.u32 %rd2, %r1; //
mul.lo.u64 %rd3, %rd2, 4; //
add.u64 %rd4, %rd1, %rd3; //
st.global.f32 [%rd4+0], %f5; // id:15
.loc 14 8 0
exit; //
$LDWend__Z11testSignBugPf:
} // _Z11testSignBugPf