Hi,
I’m having some problems trying to understand the .cubin and decuda output compared to the original source code and to try and see if something can be further optimized.
The original C code, which resides in the main loop and presumably takes most of the time, is as follow:
__inline__ __device__ void CalcTraceData( int iTraceIndex, int iTimeIndex,
int Nb, int numMoveSamples, float w2, int InputPos,
float &fOut1, float &fOut2, float &fOut3, float &fOut4 )
{
int iInputPos = InputPos + iTimeIndex;
float fPhaseVal = tex1Dfetch( texPhaseInput, iInputPos );;
float fPhaseValPlusOne = tex1Dfetch( texPhaseInput, iInputPos + 1 );
float fPhaseValW1 = fPhaseVal * w1;
float fPhaseValPlusOneW2 = fPhaseValPlusOne * w2;
fOut1 += fPhaseValW1 + fPhaseValPlusOneW2;
fOut2 += fPhaseVal * fPhaseValW1 + fPhaseValPlusOne * fPhaseValPlusOneW2;
fOut3 += tex1Dfetch( texUseInput, iInputPos ) * w1 + tex1Dfetch( texUseInput, iInputPos + 1 ) * w2;
fOut4++;
}
for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )
{
CalcTraceData( i, iTimeIndex,
smNb[ iTraceIndex ], smnumMoveSamples[ iTraceIndex ], smw2[ iTraceIndex ], smInputPos[ iTraceIndex ],
fOut1, fOut2, fOu3, fOut4 );
}
I guess the relevant .ptx part is this:
//<loop> Loop body line 558, nesting depth: 1, iterations: 256
.loc 14 561 0
// 559 for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )
// 560 {
// 561 CalcTraceData( i, iTimeIndex,
ld.shared.s32 %r84, [%rd34+0]; // id:494 __cuda_smnumMoveSamples18016+0x0
set.gt.u32.s32 %r85, %r84, %r59; //
neg.s32 %r86, %r85; //
ld.shared.s32 %r87, [%rd35+0]; // id:495 __cuda_smNb16992+0x0
set.le.u32.s32 %r88, %r87, %r59; //
neg.s32 %r89, %r88; //
and.b32 %r90, %r86, %r89; //
mov.u32 %r91, 0; //
setp.eq.s32 %p10, %r90, %r91; //
@%p10 bra $Lt_1_91; //
//<loop> Part of loop body line 558, head labeled $Lt_1_90
add.u64 %rd36, %rd33, %rd22; //
ld.shared.s32 %r92, [%rd36+0]; // id:496 __cuda_smInputPos19040+0x0
add.s32 %r93, %r92, %r59; //
mov.s32 %r94, %r93; //
mov.s32 %r95, 0; //
mov.s32 %r96, 0; //
mov.s32 %r97, 0; //
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[texInput1,{%r94,%r95,%r96,%r97}];
.loc 14 82 0
mov.f32 %f24, %f20; //
add.s32 %r98, %r93, 1; //
mov.s32 %r99, %r98; //
mov.s32 %r100, 0; //
mov.s32 %r101, 0; //
mov.s32 %r102, 0; //
tex.1d.v4.f32.s32 {%f25,%f26,%f27,%f28},[texInput1,{%r99,%r100,%r101,%r102}];
.loc 14 83 0
mov.f32 %f29, %f25; //
.loc 14 88 0
add.u64 %rd37, %rd33, %rd23; //
ld.shared.f32 %f30, [%rd37+0]; // id:497 __cuda_smw220064+0x0
mul.f32 %f31, %f30, %f29; //
mov.f32 %f32, 0f3f800000; // 1
sub.f32 %f33, %f32, %f30; //
mul.f32 %f34, %f33, %f24; //
add.f32 %f35, %f31, %f34; //
add.f32 %f15, %f15, %f35; //
mov.f32 %f36, %f15; //
.loc 14 89 0
mul.f32 %f37, %f31, %f29; //
mad.f32 %f38, %f24, %f34, %f37; //
add.f32 %f17, %f17, %f38; //
mov.f32 %f39, %f17; //
mov.s32 %r103, %r93; //
mov.s32 %r104, 0; //
mov.s32 %r105, 0; //
mov.s32 %r106, 0; //
tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[texUseInput,{%r103,%r104,%r105,%r106}];
.loc 14 90 0
mov.f32 %f44, %f40; //
mov.s32 %r107, %r98; //
mov.s32 %r108, 0; //
mov.s32 %r109, 0; //
mov.s32 %r110, 0; //
tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[texUseInput,{%r107,%r108,%r109,%r110}];
mov.f32 %f49, %f45; //
mul.f32 %f50, %f33, %f44; //
mad.f32 %f51, %f49, %f30, %f50; //
add.f32 %f11, %f11, %f51; //
mov.f32 %f52, %f11; //
.loc 14 96 0
mov.f32 %f53, 0f3f800000; // 1
add.f32 %f13, %f13, %f53; //
mov.f32 %f54, %f13; //
And the .decuda output is this:
add.b32 $ofs4, $ofs3, 0x000008b0
add.u32 $r0, s[$ofs4+0x0000], $r10
add.b32 $r3, $r0, 0x00000001
add.b32 $ofs4, $ofs3, 0x00000cb0
mov.half.b32 $r2, $r0// (unk0 00020000)
mov.half.b32 $r1, $r3
tex.1d.s32 {$r2,_,_,_}, $tex2, {$r2}
tex.1d.s32 {$r0,_,_,_}, $tex0, {$r0}
tex.1d.s32 {$r1,_,_,_}, $tex0, {$r1}
tex.1d.s32 {$r3,_,_,_}, $tex2, {$r3}
add.rn.f32 $r12, -s[$ofs4+0x0000], c1[$ofs4+0x0028]
mul.rn.f32 $r13, $r12, $r2
mul.rn.f32 $r2, s[$ofs4+0x0000], $r1
mad.rn.f32 $r13, s[$ofs4+0x0000], $r3, $r13
mul.half.rn.f32 $r3, $r12, $r0
mul.half.rn.f32 $r12, $r2, $r1
add.half.rn.f32 $r5, $r5, $r13
add.half.rn.f32 $r1, $r2, $r3
mad.rn.f32 $r0, $r3, $r0, $r12
add.half.rn.f32 $r7, $r7, $r1
add.half.rn.f32 $r6, $r6, $r0
add.rn.f32 $r4, $r4, 0x3f800000
mul.rn.f32 $r5, $r5, 0x4007e69b
A few questions:
-
I couldnt find any reference to this .half. part of the add. What is it?
-
Any chance to change the seperate mul and add functions to a mad function instead?
-
Any ideas, suggestions, insights on this issue and how to maybe squeeze a bit more performance out of this code is more then welcomed :)
thanks
eyal