Hello everybody.
I’ve recently come across something very strange when I used cuobjdump to disassemble two short PTX snippets that should, in theory, occupy the same number of registers. Let’s consider the following piece of code, which adds two vectors of arbitrary length:
// PTX module header: ISA version and target architecture for this snippet.
.version 1.4
.target sm_11
// Module-scope texture references; addKernel fetches one operand from each.
.tex .u32 tex1;
.tex .u32 tex2;
// addKernel: each thread whose global index is below `size` fetches one
// element from tex1 and one from tex2, adds them, and stores the sum to
// devOutArr1[index].
//
// Params:
//   size       - element count used as the upper bound for the thread index
//   devOutArr1 - base address of the output array (4-byte elements)
//
// Register roles:
//   r1 - global thread index, later the byte address of the output element
//   r2 - size (for the bounds test), then first operand / result
//   r3 - second operand, then the output base address
//   r4 - copy of the thread index used as the texture coordinate
//   p0 - true when the thread index is in range (r1 < size)
.entry addKernel (
    .param .b32 size,
    .param .b32 devOutArr1
) {
    .reg .pred p0;
    .reg .b32 r1;
    .reg .b32 r2;
    .reg .b32 r3;
    .reg .b32 r4;
$LDWbegin_addKernel:
    // r1 = ctaid.x * ntid.x + tid.x
    mov.b32 r1, %ctaid.x;
    mov.b32 r2, %ntid.x;
    mul24.lo.s32 r1, r1, r2;
    mov.b32 r2, %tid.x;
    add.s32 r1, r1, r2;
    mov.b32 r4, r1;                 // keep the index as texture coordinate

    // Bounds check: the source operand [size] was missing here, which made
    // the instruction syntactically invalid.
    ld.param.s32 r2, [size];
    setp.lt.s32 p0, r1, r2;         // p0 = (index < size)

    // Only the first component (r2) is kept; r3 receives the other three
    // components and is overwritten by the second fetch below.
    @p0 tex.1d.v4.s32.s32 { r2, r3, r3, r3 }, [tex1, { r4 }];
    // Commented-out mov/add sequence, preserved as in the original snippet.
/*
@p0 mov.f32 r3, r2;
@p0 add.f32 r2, r2, r3;
@p0 mov.f32 r3, r2;
@p0 add.f32 r2, r2, r3;
@p0 mov.f32 r3, r2;
@p0 add.f32 r2, r2, r3;
*/
    // Second operand goes to r3; r4 is clobbered by the unused components,
    // which is safe because the coordinate is not needed afterwards.
    @p0 tex.1d.v4.s32.s32 { r3, r4, r4, r4 }, [tex2, { r4 }];
    // NOTE(review): both fetches are typed .s32 but the sum uses add.f32 on
    // the raw .b32 registers - confirm the intended element type.
    @p0 add.f32 r2, r2, r3;

    // Output address = devOutArr1 + index * 4
    @p0 ld.param.b32 r3, [devOutArr1];
    @p0 shl.b32 r1, r1, 2;
    @p0 add.u32 r1, r1, r3;
    @p0 st.global.b32 [r1], r2;
    exit;
$LDWend_addKernel:
}
code for sm_11
Function : addKernel
.headerflags @"EF_CUDA_SM10 EF_CUDA_PTX_SM(EF_CUDA_SM10)"
/*0000*/ I2I.U32.U16 R1, g [0x6].U16; /* 0x04200780a0004c05 */
/*0008*/ I2I.U32.U16 R3, g [0x1].U16; /* 0x04200780a000420d */
/*0010*/ I2I.U32.U16 R0, R0L; /* 0x04000780a0000001 */
/*0018*/ IMAD.S24 R4, R1, R3, R0; /* 0x8000078060030211 */
/*0020*/ ISET.S32.C0 R3, g [0x4], R4, GT; /* 0x6c2107c03004c80d */
/*0028*/ MOV32 R1, R4; /* 0x10008804 */
/*002c*/ MOV32 R0, R4; /* 0x10008800 */
/*0030*/ I2I.S32.S32.C1 o[0x7f], R3; /* 0x0c0147d8a00007fd */
/*0038*/ TEX.UN.NODEP R1, 0x1, 0x1, 0x0, RXXX, 0x0; /* 0x00000784f3020205 */
/*0040*/ TEX.UN.NODEP R0, 0x0, 0x0, 0x0, RXXX, 0x0; /* 0x00000784f3000001 */
/*0048*/ MOV R1 (C1.EQU), R2; /* 0x0403d50010000405 */
/*0050*/ G2R.U32 R3, g [0x4].U32; /* 0x4400c7801000080d */
/*0058*/ G2R.U32 R0 (C1.EQU), g [0x4].U32; /* 0x4400d50010000801 */
/*0060*/ FADD R3 (C1.NE), R0, R1; /* 0x00005280b000000d */
/*0068*/ MOV32 R2, R1; /* 0x10008208 */
/*006c*/ MOV32 R0, R4; /* 0x10008800 */
/*0070*/ G2R.U32 R2 (C1.NEU), g [0x5].U32; /* 0x4400d68010000a09 */
/*0078*/ MOV R1, R0; /* 0x0403c78010000005 */
/*0080*/ SHL R1 (C0.NE), R0, 0x2; /* 0xc410028030020005 */
/*0088*/ IADD R0 (C0.NE), R1, R2; /* 0x0400828020000201 */
/*0090*/ RET C0.EQU; /* 0x0000050030000003 */
/*0098*/ GST.U32 global14[R0], R3; /* 0xa0c00781d00e000d */
..........................