CUDA 5.5 produces different PTX code from CUDA 5.0 with degraded performance

I have written a red/black stencil computation code (memory bound) which seemed to work well. However, when compiled with CUDA 5.5, the performance dropped by more than 10% for an unknown reason. I provide here both PTX listings of the target kernel, produced for CC 2.0 (extracted with cuobjdump). I tested them on a CC 3.0 device (GTX660), but producing CC 3.0 PTX does not improve performance.

Produced with CUDA 5.0:

// Kernel entry point. The mangled name appears to demangle (Itanium C++ ABI) to
//   void kcalcTex<0, false, 2>(float*, unsigned, unsigned, float, float*)
// Each thread that passes the range guard updates up to two float elements of
// the buffer addressed by param_0 (rows %r3 and %r3 + 1 of width param_2).
// Each update is computed as
//   new = (1 - param_3) * old + 0.25 * param_3 * (sum of four texData fetches)
// Parameter usage visible in this listing:
//   param_0  32-bit address of the buffer that is read and written
//   param_1  size used for the range checks (param_1 >> 1 and param_1 - 2)
//   param_2  row stride, in elements, used to form the linear index
//   param_3  float weight applied to the texture sum
//   param_4  declared but never loaded in this listing
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)

{
// Virtual register declarations. No %rd register is used in the body below.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<67>;
.reg .s64 %rd<17>;

// %r13 = param_0, %r14 = param_1, %r12 = param_2, %f3 = param_3.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];

// %r1 = param_0 converted to a global-space address.
cvta.to.global.u32 %r1, %r13;

.loc 1 137 1

// %r18 = ntid.y * ctaid.y + tid.y; %r2 = 2 * %r18; %r3 = %r2 + 1.
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;

.loc 1 138 1

// %r4 = ntid.x * ctaid.x + tid.x.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;

.loc 1 142 1

// %r5 = %r3 * param_2 + %r4: linear element index of the first update.
mad.lo.s32 %r5, %r3, %r12, %r4;

.loc 1 143 1

// %p3 = (%r4 < param_1 / 2) && (%r3 < param_1 - 2), both compared as unsigned.
shr.u32 %r6, %r14, 1;
setp.lt.u32 %p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32 %p2, %r3, %r22;
and.pred %p3, %p1, %p2;

.loc 1 144 1

// Threads failing the guard jump straight to the return at BB93_5.
@!%p3 bra BB93_5;
bra.uni BB93_1;

BB93_1:

.loc 1 147 1

// %r7 = param_1 / 2 - 1, the bound tested before the second update.
add.s32 %r7, %r6, -1;
// 0f3F800000 is 1.0f.
mov.f32 %f4, 0f3F800000;

.loc 1 149 1

// %f1 = 1.0 - param_3; %f2 = (float)%r4; %r8 = %r4 + 1; %r9 = %r2 + 2.
sub.f32 %f1, %f4, %f3;
cvt.rn.f32.u32 %f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;

.loc 1 147 1

// The first update is skipped when %r4 == 0.
setp.eq.s32 %p4, %r4, 0;
@%p4 bra BB93_3;

.loc 1 148 1

// %r24 = byte address of element %r5 (4 bytes per float); %f29 = old value.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];

.loc 1 149 1

// %f30 = (1 - param_3) * old value.
mul.f32 %f30, %f1, %f29;
// Four texData fetches at (%r4, %r3), (%r4 - 1, %r3), (%r4, %r2), (%r4, %r9).
// Only the first component of each 4-vector result is summed.
cvt.rn.f32.u32 %f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32 %f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.f32 %f31, %f5, %f11;
cvt.rn.f32.u32 %f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.f32 %f32, %f31, %f17;
cvt.rn.f32.u32 %f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.f32 %f33, %f32, %f23;
// %f34 = param_3 * sum; %f35 = %f34 * 0.25 + %f30 (0f3E800000 is 0.25f).
mul.f32 %f34, %f33, %f3;
fma.rn.f32 %f35, %f34, 0f3E800000, %f30;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r24], %f35;

BB93_3:

.loc 1 155 1

// %r11 = %r5 + param_2: the element one row below the first update.
add.s32 %r11, %r5, %r12;

.loc 1 147 1

// The second update is skipped when %r4 >= param_1 / 2 - 1 (unsigned).
setp.ge.u32 %p5, %r4, %r7;
@%p5 bra BB93_5;

.loc 1 148 1

// %r27 = byte address of element %r11; %f60 = old value.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f60, [%r27];

.loc 1 149 1

// %f61 = (1 - param_3) * old value.
mul.f32 %f61, %f1, %f60;
// Four texData fetches at (%r4, %r9), (%r4 + 1, %r9), (%r4, %r3), (%r4, %r2 + 3).
// Only the first component of each 4-vector result is summed.
cvt.rn.f32.u32 %f47, %r9;
tex.2d.v4.f32.f32 {%f36, %f37, %f38, %f39}, [texData, {%f2, %f47}];
cvt.rn.f32.u32 %f46, %r8;
tex.2d.v4.f32.f32 {%f42, %f43, %f44, %f45}, [texData, {%f46, %f47}];
add.f32 %f62, %f36, %f42;
cvt.rn.f32.u32 %f53, %r3;
tex.2d.v4.f32.f32 {%f48, %f49, %f50, %f51}, [texData, {%f2, %f53}];
add.f32 %f63, %f62, %f48;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32 %f59, %r28;
tex.2d.v4.f32.f32 {%f54, %f55, %f56, %f57}, [texData, {%f2, %f59}];
add.f32 %f64, %f63, %f54;
// %f65 = param_3 * sum; %f66 = %f65 * 0.25 + %f61 (0f3E800000 is 0.25f).
mul.f32 %f65, %f64, %f3;
fma.rn.f32 %f66, %f65, 0f3E800000, %f61;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r27], %f66;

BB93_5:

.loc 1 169 2

// Common exit for all paths.
ret;

}

Produced with CUDA 5.5:

// Kernel entry point. The mangled name appears to demangle (Itanium C++ ABI) to
//   void kcalcTex<0, false, 2>(float*, unsigned, unsigned, float, float*)
// Each thread that passes the range guard updates up to two float elements of
// the buffer addressed by param_0 (rows %r3 and %r3 + 1 of width param_2).
// Each update is computed as
//   new = (1 - param_3) * old + (param_3 * (sum of four texData fetches)) / 4.0
// In this listing the float arithmetic carries the .ftz suffix and the scaling
// by 4.0 is emitted as div.approx.ftz.f32 rather than folded into a multiply.
// Parameter usage visible in this listing:
//   param_0  32-bit address of the buffer that is read and written
//   param_1  size used for the range checks (param_1 >> 1 and param_1 - 2)
//   param_2  row stride, in elements, used to form the linear index
//   param_3  float weight applied to the texture sum
//   param_4  declared but never loaded in this listing
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)

{
// Virtual register declarations. No %rd register is used in the body below.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<69>;
.reg .s64 %rd<17>;

// %r13 = param_0, %r14 = param_1, %r12 = param_2, %f3 = param_3.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];

// %r1 = param_0 converted to a global-space address.
cvta.to.global.u32 %r1, %r13;

.loc 1 137 1

// %r18 = ntid.y * ctaid.y + tid.y; %r2 = 2 * %r18; %r3 = %r2 + 1.
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;

.loc 1 138 1

// %r4 = ntid.x * ctaid.x + tid.x.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;

.loc 1 142 1

// %r5 = %r3 * param_2 + %r4: linear element index of the first update.
mad.lo.s32 %r5, %r3, %r12, %r4;

.loc 1 143 1

// %p3 = (%r4 < param_1 / 2) && (%r3 < param_1 - 2), both compared as unsigned.
shr.u32 %r6, %r14, 1;
setp.lt.u32	%p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32	%p2, %r3, %r22;
and.pred %p3, %p1, %p2;

.loc 1 144 1

// Threads failing the guard jump straight to the return at BB93_5.
@!%p3 bra BB93_5;
bra.uni BB93_1;

BB93_1:

.loc 1 147 1

// %r7 = param_1 / 2 - 1, the bound tested before the second update.
add.s32 %r7, %r6, -1;
// 0f3F800000 is 1.0f.
mov.f32 %f4, 0f3F800000;

.loc 1 149 1

// %f1 = 1.0 - param_3; %f2 = (float)%r4; %r8 = %r4 + 1; %r9 = %r2 + 2.
sub.ftz.f32 %f1, %f4, %f3;
cvt.rn.f32.u32	%f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;

.loc 1 147 1

// The first update is skipped when %r4 == 0.
setp.eq.s32	%p4, %r4, 0;
@%p4 bra BB93_3;

.loc 1 148 1

// %r24 = byte address of element %r5 (4 bytes per float); %f29 = old value.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];

.loc 1 149 1

// Four texData fetches at (%r4, %r3), (%r4 - 1, %r3), (%r4, %r2), (%r4, %r9).
// Only the first component of each 4-vector result is summed.
cvt.rn.f32.u32	%f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32	%f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.ftz.f32 %f30, %f5, %f11;
cvt.rn.f32.u32	%f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.ftz.f32 %f31, %f30, %f17;
cvt.rn.f32.u32	%f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.ftz.f32 %f32, %f31, %f23;
// %f33 = param_3 * sum; %f34 = 4.0f (0f40800000), the divisor used below.
mul.ftz.f32 %f33, %f32, %f3;
mov.f32 %f34, 0f40800000;

// The .loc file index switches from 1 to 2 for the division only; presumably
// it originates from code in another source file (not visible here).
.loc 2 3606 10

// %f35 = %f33 / 4.0, approximate division with flush-to-zero.
div.approx.ftz.f32 %f35, %f33, %f34;

.loc 1 149 120

// %f36 = (1 - param_3) * old value + %f35.
fma.rn.ftz.f32 %f36, %f1, %f29, %f35;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r24], %f36;

BB93_3:

.loc 1 155 1

// %r11 = %r5 + param_2: the element one row below the first update.
add.s32 %r11, %r5, %r12;

.loc 1 147 1

// The second update is skipped when %r4 >= param_1 / 2 - 1 (unsigned).
setp.ge.u32	%p5, %r4, %r7;

@%p5 bra BB93_5;

.loc 1 148 1

// %r27 = byte address of element %r11; %f61 = old value.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f61, [%r27];

.loc 1 149 1

// Four texData fetches at (%r4, %r9), (%r4 + 1, %r9), (%r4, %r3), (%r4, %r2 + 3).
// Only the first component of each 4-vector result is summed.
cvt.rn.f32.u32	%f48, %r9;
tex.2d.v4.f32.f32 {%f37, %f38, %f39, %f40}, [texData, {%f2, %f48}];
cvt.rn.f32.u32	%f47, %r8;
tex.2d.v4.f32.f32 {%f43, %f44, %f45, %f46}, [texData, {%f47, %f48}];
add.ftz.f32 %f62, %f37, %f43;
cvt.rn.f32.u32	%f54, %r3;
tex.2d.v4.f32.f32 {%f49, %f50, %f51, %f52}, [texData, {%f2, %f54}];
add.ftz.f32 %f63, %f62, %f49;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32	%f60, %r28;
tex.2d.v4.f32.f32 {%f55, %f56, %f57, %f58}, [texData, {%f2, %f60}];
add.ftz.f32 %f64, %f63, %f55;
// %f65 = param_3 * sum; %f66 = 4.0f (0f40800000), the divisor used below.
mul.ftz.f32 %f65, %f64, %f3;
mov.f32 %f66, 0f40800000;

// The .loc file index switches from 1 to 2 for the division only; presumably
// it originates from code in another source file (not visible here).
.loc 2 3606 10

// %f67 = %f65 / 4.0, approximate division with flush-to-zero.
div.approx.ftz.f32 %f67, %f65, %f66;

.loc 1 149 120

// %f68 = (1 - param_3) * old value + %f67.
fma.rn.ftz.f32 %f68, %f1, %f61, %f67;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r27], %f68;

BB93_5:

.loc 1 169 2

// Common exit for all paths.
ret;

}

It is hard to tell for sure, but it seems the two codes may not have been compiled with the same compiler switches for CUDA 5.0 and CUDA 5.5 (assuming the source code was identical). The presence of .ftz suffixes in the CUDA 5.5 code would indicate that the code was compiled with -ftz=true, or maybe -use_fast_math, while the absence of these suffixes in the CUDA 5.0 code indicates it may have been compiled with -ftz=false or compiler defaults.

The CUDA 5.5 code also contains single-precision floating-point divisions (div.approx.ftz.f32 by 0f40800000, i.e. 4.0f) that do not seem to be present in the CUDA 5.0 code. Scanning the code, it appears that the CUDA 5.0 code contains multiplications that could be equivalent to these divisions (the fma.rn.f32 with the constant 0f3E800000, i.e. 0.25f). This could be a function of the division being IEEE-754 compliant in one case (i.e. CUDA 5.0 code was compiled with default settings or -prec-div=true) and approximate in the other case (i.e. CUDA 5.5 code was compiled with -prec-div=false or -use_fast_math).

If you see significant performance differences using the exact same nvcc compiler switches for CUDA 5.0 and CUDA 5.5, I would suggest filing a bug via the bug reporting form linked from the registered developer website, attaching self-contained repro code.

The floating-point division seems to be caused by a division operator. However, the divisor is a constant literal (4.0f). In CUDA 5.0, it seems that the division is converted to a multiplication by the reciprocal value (0.25f). Does the fact that the divisor is constant play any role in the compiler’s selection between a division and a multiplication instruction?

Thank you for your reply.

As I stated above, the two pieces of code shown above appear to have been compiled with different compiler settings. Without source code, I am afraid one can only speculate as to the differences seen in the PTX. Please note that PTX is only an intermediate language that is further compiled down to machine code (SASS) and ultimately only differences in SASS are relevant to performance.