The ptx output is following:
[codebox] .version 1.2
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc built on 2008-06-19
.reg .u32 %ra<17>;
.reg .u64 %rda<17>;
.reg .f32 %fa<17>;
.reg .f64 %fda<17>;
.reg .u32 %rv<5>;
.reg .u64 %rdv<5>;
.reg .f32 %fv<5>;
.reg .f64 %fdv<5>;
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000d1c_00000000-7_unrolltest.cpp3.i (/tmp/ccBI#.Z1hPzl)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 “<コマンドライン>”
.file 2 "/tmp/tmpxft_00000d1c_00000000-6_unrolltest.cudafe2.gpu"
.file 3 "/usr/local/gcc-4.1.2/lib/gcc/x86_64-unknown-linux-gnu/4.1.2/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/crt/../host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/crt/../builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/crt/../device_types.h"
.file 8 "/usr/local/cuda/bin/../include/crt/../driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/crt/../texture_types.h"
.file 10 "/usr/local/cuda/bin/../include/crt/../vector_types.h"
.file 11 "/usr/local/cuda/bin/../include/crt/../device_launch_parameters.h"
.file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 13 "/usr/include/bits/types.h"
.file 14 "/usr/include/time.h"
.file 15 "unrolltest.cu"
.file 16 "/usr/local/cuda/bin/../include/common_functions.h"
.file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
.file 18 "/usr/local/cuda/bin/../include/math_functions.h"
.file 19 "/usr/local/cuda/bin/../include/device_functions.h"
.file 20 "/usr/local/cuda/bin/../include/math_constants.h"
.file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 25 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.entry __globfunc__Z6kernelPi
{
.reg .u32 %r<14>;
.reg .u64 %rd<3>;
.param .u64 __cudaparm___globfunc__Z6kernelPi_a;
// reg = 0
.loc 15 4 0
$LBB1___globfunc__Z6kernelPi:
.loc 15 9 0
mov.s32 %r1, 0; //
mov.s32 %r2, %r1; //
mov.s32 %r3, 1; //
mov.s32 %r2, %r3; //
mov.s32 %r4, 2; //
mov.s32 %r5, %r4; //
mov.s32 %r6, 3; //
mov.s32 %r5, %r6; //
.loc 15 13 0
ld.param.u64 %rd1, [__cudaparm___globfunc__Z6kernelPi_a]; // id:20 __cudaparm___globfunc__Z6kernelPi_a+0x0
mov.s32 %r7, 1; //
st.global.s32 [%rd1+0], %r7; // id:23
mov.s32 %r8, 3; //
st.global.s32 [%rd1+4], %r8; // id:24
mov.s32 %r9, %r10; //
st.global.s32 [%rd1+8], %r9; // id:26
mov.s32 %r11, %r12; //
st.global.s32 [%rd1+12], %r11; // id:28
.loc 15 15 0
exit; //
$LDWend___globfunc__Z6kernelPi:
} // __globfunc__Z6kernelPi
[/codebox]
This ptx output shows that both the r1 register and the r3 register are substituted for the r2 register. r1 and r3 should be substituted for different registers.
The decuda output is following:
[codebox]// Disassembling __globfunc__Z6kernelPi
000000: 1000c801 0423c780 mov.b32 $r0, s[0x0010]
000008: 10018009 00000003 mov.b32 $r2, 0x00000001
000010: d00e0009 a0c00780 mov.u32 g[$r0], $r2
000018: 2104e809 00000003 add.b32 $r2, s[0x0010], 0x00000004
000020: 10038001 00000003 mov.b32 $r0, 0x00000003
000028: d00e0401 a0c00780 mov.u32 g[$r2], $r0
000030: 2108e801 00000003 add.b32 $r0, s[0x0010], 0x00000008
000038: d00e0005 a0c00780 mov.u32 g[$r0], $r1
000040: 210ce801 00000003 add.b32 $r0, s[0x0010], 0x0000000c
000048: d00e0005 a0c00781 mov.end.u32 g[$r0], $r1[/codebox]