I have written a red/black stencil computation code (memory bound) which seemed to work well. However, when compiled with CUDA 5.5, the performance dropped by more than 10% for an unknown reason. I provide here both PTX listings of the target kernel, generated for CC 2.0 and extracted with cuobjdump. I tested them on a CC 3.0 device (GTX660), but generating CC 3.0 PTX does not improve performance.
Produced with CUDA 5.0:
// Kernel entry point. The mangled name demangles (Itanium C++ ABI) to
//   void kcalcTex<0, false, 2>(float*, unsigned int, unsigned int, float, float*)
// Parameters as they are used in this listing:
//   param_0 (u32): address converted to a global address; base of the f32 array
//                  that is read and updated in place
//   param_1 (u32): size used by the bounds tests (x < param_1/2, y+1 < param_1-2)
//   param_2 (u32): row pitch in 4-byte elements (multiplied by the row index)
//   param_3 (f32): weight w; each update is (1-w)*old + 0.25*(w*sum of 4 samples)
//   param_4 (u32): never loaded in this listing
// Each thread updates up to two elements, on two consecutive rows.
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)
{
// Virtual register declarations. No %rd (64-bit) register is referenced below.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<67>;
.reg .s64 %rd<17>;
// Load the kernel arguments: %r13 = param_0, %r14 = param_1, %r12 = param_2, %f3 = w.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];
// %r1 = param_0 as a global-space address (base for the loads/stores below).
cvta.to.global.u32 %r1, %r13;
.loc 1 137 1
// %r2 = 2 * (ntid.y * ctaid.y + tid.y); %r3 = %r2 + 1 (row of the first update).
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;
.loc 1 138 1
// %r4 = ntid.x * ctaid.x + tid.x (column index x).
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;
.loc 1 142 1
// %r5 = %r3 * param_2 + x: element index of the first update.
mad.lo.s32 %r5, %r3, %r12, %r4;
.loc 1 143 1
// Bounds guard (unsigned compares): %p3 = (x < param_1 >> 1) && (%r3 < param_1 - 2).
shr.u32 %r6, %r14, 1;
setp.lt.u32 %p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32 %p2, %r3, %r22;
and.pred %p3, %p1, %p2;
.loc 1 144 1
// Threads failing the guard return immediately.
@!%p3 bra BB93_5;
bra.uni BB93_1;
BB93_1:
.loc 1 147 1
// Values shared by both updates:
//   %r7 = (param_1 >> 1) - 1, %f1 = 1.0 - w (0f3F800000 is 1.0f),
//   %f2 = float(x), %r8 = x + 1, %r9 = %r2 + 2.
add.s32 %r7, %r6, -1;
mov.f32 %f4, 0f3F800000;
.loc 1 149 1
sub.f32 %f1, %f4, %f3;
cvt.rn.f32.u32 %f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;
.loc 1 147 1
// First update is skipped when x == 0.
setp.eq.s32 %p4, %r4, 0;
@%p4 bra BB93_3;
.loc 1 148 1
// %r24 = base + 4 * %r5; %f29 = current value of the element.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];
.loc 1 149 1
// %f30 = (1 - w) * old value.
mul.f32 %f30, %f1, %f29;
// Four 2D fetches from texData with float coordinates; only the first component
// of each v4 result is consumed. Coordinates (x, y) in fetch order:
//   (x, %r3), (x - 1, %r3), (x, %r2), (x, %r2 + 2)
cvt.rn.f32.u32 %f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32 %f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.f32 %f31, %f5, %f11;
cvt.rn.f32.u32 %f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.f32 %f32, %f31, %f17;
cvt.rn.f32.u32 %f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.f32 %f33, %f32, %f23;
// %f35 = (w * sum) * 0.25 + (1 - w) * old   (0f3E800000 is 0.25f).
mul.f32 %f34, %f33, %f3;
fma.rn.f32 %f35, %f34, 0f3E800000, %f30;
.loc 1 150 1
// Write the result back to the same element.
st.global.f32 [%r24], %f35;
BB93_3:
.loc 1 155 1
// %r11 = %r5 + param_2: same column, next row (row %r2 + 2).
add.s32 %r11, %r5, %r12;
.loc 1 147 1
// Second update is skipped when x >= (param_1 >> 1) - 1 (unsigned compare).
setp.ge.u32 %p5, %r4, %r7;
@%p5 bra BB93_5;
.loc 1 148 1
// %r27 = base + 4 * %r11; %f60 = current value of the element.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f60, [%r27];
.loc 1 149 1
// %f61 = (1 - w) * old value.
mul.f32 %f61, %f1, %f60;
// Four texture fetches, first component only. Coordinates in fetch order:
//   (x, %r2 + 2), (x + 1, %r2 + 2), (x, %r3), (x, %r2 + 3)
cvt.rn.f32.u32 %f47, %r9;
tex.2d.v4.f32.f32 {%f36, %f37, %f38, %f39}, [texData, {%f2, %f47}];
cvt.rn.f32.u32 %f46, %r8;
tex.2d.v4.f32.f32 {%f42, %f43, %f44, %f45}, [texData, {%f46, %f47}];
add.f32 %f62, %f36, %f42;
cvt.rn.f32.u32 %f53, %r3;
tex.2d.v4.f32.f32 {%f48, %f49, %f50, %f51}, [texData, {%f2, %f53}];
add.f32 %f63, %f62, %f48;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32 %f59, %r28;
tex.2d.v4.f32.f32 {%f54, %f55, %f56, %f57}, [texData, {%f2, %f59}];
add.f32 %f64, %f63, %f54;
// %f66 = (w * sum) * 0.25 + (1 - w) * old.
mul.f32 %f65, %f64, %f3;
fma.rn.f32 %f66, %f65, 0f3E800000, %f61;
.loc 1 150 1
st.global.f32 [%r27], %f66;
BB93_5:
.loc 1 169 2
// Common exit for out-of-range threads and for completed updates.
ret;
}
Produced with CUDA 5.5:
// Kernel entry point. The mangled name demangles (Itanium C++ ABI) to
//   void kcalcTex<0, false, 2>(float*, unsigned int, unsigned int, float, float*)
// Parameters as they are used in this listing:
//   param_0 (u32): address converted to a global address; base of the f32 array
//                  that is read and updated in place
//   param_1 (u32): size used by the bounds tests (x < param_1/2, y+1 < param_1-2)
//   param_2 (u32): row pitch in 4-byte elements (multiplied by the row index)
//   param_3 (f32): weight w; each update is (1-w)*old + (w*sum of 4 samples)/4
//   param_4 (u32): never loaded in this listing
// Each thread updates up to two elements, on two consecutive rows.
// NOTE(review): in this listing the floating-point add/sub/mul/fma carry the .ftz
// modifier and the scaling by 1/4 is a div.approx.ftz.f32 by 4.0, where the
// CUDA 5.0 listing in this file uses an fma with the constant 0.25. Presumably
// this reflects different floating-point compile options -- confirm against
// the build flags used for each toolkit.
.visible .entry _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0_(
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2,
.param .f32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3,
.param .u32 _Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_4
)
{
// Virtual register declarations. No %rd (64-bit) register is referenced below.
.reg .pred %p<6>;
.reg .s32 %r<29>;
.reg .f32 %f<69>;
.reg .s64 %rd<17>;
// Load the kernel arguments: %r13 = param_0, %r14 = param_1, %r12 = param_2, %f3 = w.
ld.param.u32 %r13, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_0];
ld.param.u32 %r14, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_1];
ld.param.u32 %r12, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_2];
ld.param.f32 %f3, [_Z8kcalcTexILi0ELb0ELi2EEvPfjjfS0__param_3];
// %r1 = param_0 as a global-space address (base for the loads/stores below).
cvta.to.global.u32 %r1, %r13;
.loc 1 137 1
// %r2 = 2 * (ntid.y * ctaid.y + tid.y); %r3 = %r2 + 1 (row of the first update).
mov.u32 %r15, %ntid.y;
mov.u32 %r16, %ctaid.y;
mov.u32 %r17, %tid.y;
mad.lo.s32 %r18, %r15, %r16, %r17;
shl.b32 %r2, %r18, 1;
add.s32 %r3, %r2, 1;
.loc 1 138 1
// %r4 = ntid.x * ctaid.x + tid.x (column index x).
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r4, %r19, %r20, %r21;
.loc 1 142 1
// %r5 = %r3 * param_2 + x: element index of the first update.
mad.lo.s32 %r5, %r3, %r12, %r4;
.loc 1 143 1
// Bounds guard (unsigned compares): %p3 = (x < param_1 >> 1) && (%r3 < param_1 - 2).
shr.u32 %r6, %r14, 1;
setp.lt.u32 %p1, %r4, %r6;
add.s32 %r22, %r14, -2;
setp.lt.u32 %p2, %r3, %r22;
and.pred %p3, %p1, %p2;
.loc 1 144 1
// Threads failing the guard return immediately.
@!%p3 bra BB93_5;
bra.uni BB93_1;
BB93_1:
.loc 1 147 1
// Values shared by both updates:
//   %r7 = (param_1 >> 1) - 1, %f1 = 1.0 - w (0f3F800000 is 1.0f),
//   %f2 = float(x), %r8 = x + 1, %r9 = %r2 + 2.
add.s32 %r7, %r6, -1;
mov.f32 %f4, 0f3F800000;
.loc 1 149 1
sub.ftz.f32 %f1, %f4, %f3;
cvt.rn.f32.u32 %f2, %r4;
add.s32 %r8, %r4, 1;
add.s32 %r9, %r2, 2;
.loc 1 147 1
// First update is skipped when x == 0.
setp.eq.s32 %p4, %r4, 0;
@%p4 bra BB93_3;
.loc 1 148 1
// %r24 = base + 4 * %r5; %f29 = current value of the element.
shl.b32 %r23, %r5, 2;
add.s32 %r24, %r1, %r23;
ld.global.f32 %f29, [%r24];
.loc 1 149 1
// Four 2D fetches from texData with float coordinates; only the first component
// of each v4 result is consumed. Coordinates (x, y) in fetch order:
//   (x, %r3), (x - 1, %r3), (x, %r2), (x, %r2 + 2)
cvt.rn.f32.u32 %f16, %r3;
tex.2d.v4.f32.f32 {%f5, %f6, %f7, %f8}, [texData, {%f2, %f16}];
add.s32 %r25, %r4, -1;
cvt.rn.f32.u32 %f15, %r25;
tex.2d.v4.f32.f32 {%f11, %f12, %f13, %f14}, [texData, {%f15, %f16}];
add.ftz.f32 %f30, %f5, %f11;
cvt.rn.f32.u32 %f22, %r2;
tex.2d.v4.f32.f32 {%f17, %f18, %f19, %f20}, [texData, {%f2, %f22}];
add.ftz.f32 %f31, %f30, %f17;
cvt.rn.f32.u32 %f28, %r9;
tex.2d.v4.f32.f32 {%f23, %f24, %f25, %f26}, [texData, {%f2, %f28}];
add.ftz.f32 %f32, %f31, %f23;
// %f33 = w * sum; %f35 = %f33 / 4.0 via approximate division (0f40800000 is 4.0f).
mul.ftz.f32 %f33, %f32, %f3;
mov.f32 %f34, 0f40800000;
// The .loc below attributes the division to source file index 2, not file 1.
.loc 2 3606 10
div.approx.ftz.f32 %f35, %f33, %f34;
.loc 1 149 120
// %f36 = (1 - w) * old + (w * sum) / 4.
fma.rn.ftz.f32 %f36, %f1, %f29, %f35;
.loc 1 150 1
// Write the result back to the same element.
st.global.f32 [%r24], %f36;
BB93_3:
.loc 1 155 1
// %r11 = %r5 + param_2: same column, next row (row %r2 + 2).
add.s32 %r11, %r5, %r12;
.loc 1 147 1
// Second update is skipped when x >= (param_1 >> 1) - 1 (unsigned compare).
setp.ge.u32 %p5, %r4, %r7;
@%p5 bra BB93_5;
.loc 1 148 1
// %r27 = base + 4 * %r11; %f61 = current value of the element.
shl.b32 %r26, %r11, 2;
add.s32 %r27, %r1, %r26;
ld.global.f32 %f61, [%r27];
.loc 1 149 1
// Four texture fetches, first component only. Coordinates in fetch order:
//   (x, %r2 + 2), (x + 1, %r2 + 2), (x, %r3), (x, %r2 + 3)
cvt.rn.f32.u32 %f48, %r9;
tex.2d.v4.f32.f32 {%f37, %f38, %f39, %f40}, [texData, {%f2, %f48}];
cvt.rn.f32.u32 %f47, %r8;
tex.2d.v4.f32.f32 {%f43, %f44, %f45, %f46}, [texData, {%f47, %f48}];
add.ftz.f32 %f62, %f37, %f43;
cvt.rn.f32.u32 %f54, %r3;
tex.2d.v4.f32.f32 {%f49, %f50, %f51, %f52}, [texData, {%f2, %f54}];
add.ftz.f32 %f63, %f62, %f49;
add.s32 %r28, %r2, 3;
cvt.rn.f32.u32 %f60, %r28;
tex.2d.v4.f32.f32 {%f55, %f56, %f57, %f58}, [texData, {%f2, %f60}];
add.ftz.f32 %f64, %f63, %f55;
// %f65 = w * sum; %f67 = %f65 / 4.0 via approximate division.
mul.ftz.f32 %f65, %f64, %f3;
mov.f32 %f66, 0f40800000;
.loc 2 3606 10
div.approx.ftz.f32 %f67, %f65, %f66;
.loc 1 149 120
// %f68 = (1 - w) * old + (w * sum) / 4.
fma.rn.ftz.f32 %f68, %f1, %f61, %f67;
.loc 1 150 1
st.global.f32 [%r27], %f68;
BB93_5:
.loc 1 169 2
// Common exit for out-of-range threads and for completed updates.
ret;
}