CUDA 5.5 produces different PTX code from CUDA 5.0 with degraded performance

ekon · August 12, 2013, 8:32am

I have written a red/black stencil computation code (memory bound) which seemed to work well. However, when compiled with CUDA 5.5, the performance dropped by more than 10% for an unknown reason. I provide here both PTX files for the target kernel, produced for CC 2.0 (extracted with cuobjdump). I tested them on a CC 3.0 device (GTX660), but producing CC 3.0 PTX does not improve performance.

Produced with CUDA 5.0:

// Kernel entry point. The mangled name appears to demangle (Itanium C++ ABI) to
// kcalcTex<0, false, 2>(float*, unsigned, unsigned, float, float*) -- TODO confirm against the CUDA source.
// param_0 is declared .u32 and is used as an address below, i.e. this listing uses 32-bit addressing.
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)

{
// Virtual register declarations. No %rd register is referenced anywhere in this body.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<67>;
.reg .s64 %rd<17>;

// Load param_0..param_3 (%r13, %r14, %r12, %f3). param_4 is never read in this listing.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];

// %r1 = param_0 converted to a global-space address; base for every ld.global/st.global below.
cvta.to.global.u32 %r1, %r13;

.loc 1 137 1

// %r18 = %ntid.y * %ctaid.y + %tid.y;  %r2 = 2 * %r18;  %r3 = %r2 + 1.
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;

.loc 1 138 1

// %r4 = %ntid.x * %ctaid.x + %tid.x.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;

.loc 1 142 1

// %r5 = %r3 * param_2 + %r4: element index of the first load/store.
mad.lo.s32 %r5, %r3, %r12, %r4;

.loc 1 143 1

// Guard: %p3 = (%r4 < param_1 >> 1) && (%r3 < param_1 - 2), both compared as unsigned.
shr.u32 %r6, %r14, 1;
setp.lt.u32 %p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32 %p2, %r3, %r22;
and.pred %p3, %p1, %p2;

.loc 1 144 1

// Threads that fail the guard jump straight to the return at BB93_5.
@!%p3 bra BB93_5;
bra.uni BB93_1;

BB93_1:

.loc 1 147 1

// %r7 = (param_1 >> 1) - 1;  %f4 = 1.0f (0f3F800000).
add.s32 %r7, %r6, -1;
mov.f32 %f4, 0f3F800000;

.loc 1 149 1

// Values shared by both updates: %f1 = 1.0f - param_3, %f2 = (float)%r4, %r8 = %r4 + 1, %r9 = %r2 + 2.
sub.f32 %f1, %f4, %f3;
cvt.rn.f32.u32 %f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;

.loc 1 147 1

// The first update is skipped when %r4 == 0.
setp.eq.s32 %p4, %r4, 0;
@%p4 bra BB93_3;

.loc 1 148 1

// %r24 = %r1 + 4 * %r5 (byte address);  %f29 = float loaded from it.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];

.loc 1 149 1

// First update. %f30 = %f1 * %f29.
// Four 2D fetches from texData at (%r4, %r3), (%r4 - 1, %r3), (%r4, %r2) and (%r4, %r2 + 2);
// only the first component of each v4 result (%f5, %f11, %f17, %f23) is consumed.
// %f34 = (sum of the four) * param_3;  %f35 = %f34 * 0.25f (0f3E800000) + %f30.
mul.f32 %f30, %f1, %f29;
cvt.rn.f32.u32 %f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32 %f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.f32 %f31, %f5, %f11;
cvt.rn.f32.u32 %f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.f32 %f32, %f31, %f17;
cvt.rn.f32.u32 %f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.f32 %f33, %f32, %f23;
mul.f32 %f34, %f33, %f3;
fma.rn.f32 %f35, %f34, 0f3E800000, %f30;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r24], %f35;

BB93_3:

.loc 1 155 1

// %r11 = %r5 + param_2: element index of the second load/store.
add.s32 %r11, %r5, %r12;

.loc 1 147 1

// The second update is skipped when %r4 >= (param_1 >> 1) - 1 (unsigned compare).
setp.ge.u32 %p5, %r4, %r7;
@%p5 bra BB93_5;

.loc 1 148 1

// %r27 = %r1 + 4 * %r11 (byte address);  %f60 = float loaded from it.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f60, [%r27];

.loc 1 149 1

// Second update. %f61 = %f1 * %f60.
// Four 2D fetches from texData at (%r4, %r2 + 2), (%r4 + 1, %r2 + 2), (%r4, %r3) and (%r4, %r2 + 3);
// only the first component of each v4 result (%f36, %f42, %f48, %f54) is consumed.
// %f65 = (sum of the four) * param_3;  %f66 = %f65 * 0.25f (0f3E800000) + %f61.
mul.f32 %f61, %f1, %f60;
cvt.rn.f32.u32 %f47, %r9;
tex.2d.v4.f32.f32 {%f36, %f37, %f38, %f39}, [texData, {%f2, %f47}];
cvt.rn.f32.u32 %f46, %r8;
tex.2d.v4.f32.f32 {%f42, %f43, %f44, %f45}, [texData, {%f46, %f47}];
add.f32 %f62, %f36, %f42;
cvt.rn.f32.u32 %f53, %r3;
tex.2d.v4.f32.f32 {%f48, %f49, %f50, %f51}, [texData, {%f2, %f53}];
add.f32 %f63, %f62, %f48;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32 %f59, %r28;
tex.2d.v4.f32.f32 {%f54, %f55, %f56, %f57}, [texData, {%f2, %f59}];
add.f32 %f64, %f63, %f54;
mul.f32 %f65, %f64, %f3;
fma.rn.f32 %f66, %f65, 0f3E800000, %f61;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r27], %f66;

BB93_5:

.loc 1 169 2

// Common exit for guarded-out threads and for threads that finished (or skipped) the second update.
ret;

}

Produced with CUDA 5.5:

// Kernel entry point. The mangled name appears to demangle (Itanium C++ ABI) to
// kcalcTex<0, false, 2>(float*, unsigned, unsigned, float, float*) -- TODO confirm against the CUDA source.
// param_0 is declared .u32 and is used as an address below, i.e. this listing uses 32-bit addressing.
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)

{
// Virtual register declarations. No %rd register is referenced anywhere in this body.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<69>;
.reg .s64 %rd<17>;

// Load param_0..param_3 (%r13, %r14, %r12, %f3). param_4 is never read in this listing.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];

// %r1 = param_0 converted to a global-space address; base for every ld.global/st.global below.
cvta.to.global.u32 %r1, %r13;

.loc 1 137 1

// %r18 = %ntid.y * %ctaid.y + %tid.y;  %r2 = 2 * %r18;  %r3 = %r2 + 1.
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;

.loc 1 138 1

// %r4 = %ntid.x * %ctaid.x + %tid.x.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;

.loc 1 142 1

// %r5 = %r3 * param_2 + %r4: element index of the first load/store.
mad.lo.s32 %r5, %r3, %r12, %r4;

.loc 1 143 1

// Guard: %p3 = (%r4 < param_1 >> 1) && (%r3 < param_1 - 2), both compared as unsigned.
shr.u32 %r6, %r14, 1;
setp.lt.u32	%p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32	%p2, %r3, %r22;
and.pred %p3, %p1, %p2;

.loc 1 144 1

// Threads that fail the guard jump straight to the return at BB93_5.
@!%p3 bra BB93_5;
bra.uni BB93_1;

BB93_1:

.loc 1 147 1

// %r7 = (param_1 >> 1) - 1;  %f4 = 1.0f (0f3F800000).
add.s32 %r7, %r6, -1;
mov.f32 %f4, 0f3F800000;

.loc 1 149 1

// Values shared by both updates: %f1 = 1.0f - param_3, %f2 = (float)%r4, %r8 = %r4 + 1, %r9 = %r2 + 2.
// Floating-point arithmetic in this listing carries the .ftz (flush-to-zero) modifier.
sub.ftz.f32 %f1, %f4, %f3;
cvt.rn.f32.u32	%f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;

.loc 1 147 1

// The first update is skipped when %r4 == 0.
setp.eq.s32	%p4, %r4, 0;
@%p4 bra BB93_3;

.loc 1 148 1

// %r24 = %r1 + 4 * %r5 (byte address);  %f29 = float loaded from it.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];

.loc 1 149 1

// First update.
// Four 2D fetches from texData at (%r4, %r3), (%r4 - 1, %r3), (%r4, %r2) and (%r4, %r2 + 2);
// only the first component of each v4 result (%f5, %f11, %f17, %f23) is consumed.
// %f33 = (sum of the four) * param_3;  %f34 = 4.0f (0f40800000), the divisor used below.
cvt.rn.f32.u32	%f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32	%f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.ftz.f32 %f30, %f5, %f11;
cvt.rn.f32.u32	%f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.ftz.f32 %f31, %f30, %f17;
cvt.rn.f32.u32	%f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.ftz.f32 %f32, %f31, %f23;
mul.ftz.f32 %f33, %f32, %f3;
mov.f32 %f34, 0f40800000;

.loc 2 3606 10

// %f35 = %f33 / 4.0f as an approximate division. The .loc attributes this instruction to
// file index 2, whereas the rest of the kernel is attributed to file index 1.
div.approx.ftz.f32 %f35, %f33, %f34;

.loc 1 149 120

// %f36 = %f1 * %f29 + %f35, i.e. (1 - param_3) * old value + scaled neighbour sum.
fma.rn.ftz.f32 %f36, %f1, %f29, %f35;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r24], %f36;

BB93_3:

.loc 1 155 1

// %r11 = %r5 + param_2: element index of the second load/store.
add.s32 %r11, %r5, %r12;

.loc 1 147 1

// The second update is skipped when %r4 >= (param_1 >> 1) - 1 (unsigned compare).
setp.ge.u32	%p5, %r4, %r7;

@%p5 bra BB93_5;

.loc 1 148 1

// %r27 = %r1 + 4 * %r11 (byte address);  %f61 = float loaded from it.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f61, [%r27];

.loc 1 149 1

// Second update.
// Four 2D fetches from texData at (%r4, %r2 + 2), (%r4 + 1, %r2 + 2), (%r4, %r3) and (%r4, %r2 + 3);
// only the first component of each v4 result (%f37, %f43, %f49, %f55) is consumed.
// %f65 = (sum of the four) * param_3;  %f66 = 4.0f (0f40800000), the divisor used below.
cvt.rn.f32.u32	%f48, %r9;
tex.2d.v4.f32.f32 {%f37, %f38, %f39, %f40}, [texData, {%f2, %f48}];
cvt.rn.f32.u32	%f47, %r8;
tex.2d.v4.f32.f32 {%f43, %f44, %f45, %f46}, [texData, {%f47, %f48}];
add.ftz.f32 %f62, %f37, %f43;
cvt.rn.f32.u32	%f54, %r3;
tex.2d.v4.f32.f32 {%f49, %f50, %f51, %f52}, [texData, {%f2, %f54}];
add.ftz.f32 %f63, %f62, %f49;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32	%f60, %r28;
tex.2d.v4.f32.f32 {%f55, %f56, %f57, %f58}, [texData, {%f2, %f60}];
add.ftz.f32 %f64, %f63, %f55;
mul.ftz.f32 %f65, %f64, %f3;
mov.f32 %f66, 0f40800000;

.loc 2 3606 10

// %f67 = %f65 / 4.0f as an approximate division (again attributed to file index 2).
div.approx.ftz.f32 %f67, %f65, %f66;

.loc 1 149 120

// %f68 = %f1 * %f61 + %f67, i.e. (1 - param_3) * old value + scaled neighbour sum.
fma.rn.ftz.f32 %f68, %f1, %f61, %f67;

.loc 1 150 1

// Write the result back to the address it was loaded from.
st.global.f32 [%r27], %f68;

BB93_5:

.loc 1 169 2

// Common exit for guarded-out threads and for threads that finished (or skipped) the second update.
ret;

}

njuffa · August 13, 2013, 1:55am

It is hard to tell for sure, but it seems the two codes may not have been compiled with the same compiler switches for CUDA 5.0 and CUDA 5.5 (assuming the source code was identical). The presence of .ftz suffixes in the CUDA 5.5 code would indicate that the code was compiled with -ftz=true, or maybe -use_fast_math, while the absence of these suffixes in the CUDA 5.0 code indicates it may have been compiled with -ftz=false or compiler defaults.

The CUDA 5.5 code also contains single-precision floating-point divisions that do not seem to be present in the CUDA 5.0 code. Scanning the code, it appears as if the CUDA 5.0 code contains multiplications that could be equivalent to these divisions. This could be a function of the division being IEEE-754 compliant in one case (i.e. CUDA 5.0 code was compiled with default settings or -prec-div=true) and approximate in the other case (i.e. CUDA 5.5 code was compiled with -prec-div=false or -use_fast_math).

If you see significant performance differences using the exact same nvcc compiler switches for CUDA 5.0 and CUDA 5.5, I would suggest filing a bug via the bug reporting form linked from the registered developer website, attaching self-contained repro code.

ekon · August 15, 2013, 5:59pm

The floating-point division seems to be caused by a division operator. However, the divisor is a constant literal (4.0f). In CUDA 5.0, it seems the division is converted to a multiplication by the reciprocal value (0.25f). Does the fact that the divisor is constant play any role in the compiler’s selection between a division and a multiplication instruction?

Thank you for your reply.

njuffa · August 15, 2013, 6:13pm

As I stated above, the two pieces of code shown above appear to have been compiled with different compiler settings. Without source code, I am afraid one can only speculate as to the differences seen in the PTX. Please note that PTX is only an intermediate language that is further compiled down to machine code (SASS) and ultimately only differences in SASS are relevant to performance.

Topic		Replies	Views
Bug with integer division? CUDA Programming and Performance	33	9318	September 9, 2015
Strange PTX Output CUDA Programming and Performance	9	3293	December 19, 2014
How to tell nvcc that some `if` must diverge and stop trying to fuse previous statements into it? CUDA Programming and Performance	20	443	March 3, 2024
performance difference for cuda between experiments and the documentation for float/double data type... CUDA Programming and Performance	8	1882	October 28, 2016
Processing image with a CUDA kernel gives me different result than a seemingly equivalent CPU function CUDA Programming and Performance opencv , cuda	27	1433	May 8, 2024
Code compiled in CUDA 4.0 slower than CUDA 3.1 CUDA Programming and Performance	11	17007	September 21, 2011
compiler bug? bit shift CUDA Programming and Performance	7	4519	November 28, 2008
output difference between quadro K600 and K620 CUDA Programming and Performance	13	4864	December 2, 2014
What is the reason why performance deteriorates when PTX code written with pipeline considerations is repeatedly used? CUDA Programming and Performance	4	358	April 28, 2023
why CUDA 2.0 does not expose all PTX ISA 1.3 ? CUDA Programming and Performance	20	27714	November 5, 2008

CUDA 5.5 produces different PTX code from CUDA 5.0 with degraded performance

Related topics