Decuda assistance

Hi,

I’m having some trouble understanding the .cubin and decuda output compared to the original source code, and I’m trying to see whether anything can be optimized further.

The original C code, which resides in the main loop and presumably takes most of the time, is as follows:

__inline__ __device__ void CalcTraceData( int iTraceIndex, int iTimeIndex,
                                           int Nb, int numMoveSamples, float w2, int InputPos,
                                           float &fOut1, float &fOut2, float &fOut3, float &fOut4 )
{
    int iInputPos = InputPos + iTimeIndex;
    float fPhaseVal = tex1Dfetch( texPhaseInput, iInputPos );
    float fPhaseValPlusOne = tex1Dfetch( texPhaseInput, iInputPos + 1 );
    float fPhaseValW1 = fPhaseVal * w1;
    float fPhaseValPlusOneW2 = fPhaseValPlusOne * w2;
    fOut1 += fPhaseValW1 + fPhaseValPlusOneW2;
    fOut2 += fPhaseVal * fPhaseValW1 + fPhaseValPlusOne * fPhaseValPlusOneW2;
    fOut3 += tex1Dfetch( texUseInput, iInputPos ) * w1 + tex1Dfetch( texUseInput, iInputPos + 1 ) * w2;
    fOut4++;
}

for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )
{
    CalcTraceData( i, iTimeIndex,
                   smNb[ iTraceIndex ], smnumMoveSamples[ iTraceIndex ], smw2[ iTraceIndex ], smInputPos[ iTraceIndex ],
                   fOut1, fOut2, fOut3, fOut4 );
}

I guess the relevant .ptx part is this:

//<loop> Loop body line 558, nesting depth: 1, iterations: 256
	.loc	14	561	0
 // 559		  for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )
 // 560		  {
 // 561				  CalcTraceData( i, iTimeIndex,
	ld.shared.s32 	%r84, [%rd34+0];	// id:494 __cuda_smnumMoveSamples18016+0x0
	set.gt.u32.s32 	%r85, %r84, %r59;
	neg.s32 	%r86, %r85;
	ld.shared.s32 	%r87, [%rd35+0];	// id:495 __cuda_smNb16992+0x0
	set.le.u32.s32 	%r88, %r87, %r59;
	neg.s32 	%r89, %r88;
	and.b32 	%r90, %r86, %r89;
	mov.u32 	%r91, 0;
	setp.eq.s32 	%p10, %r90, %r91;
	@%p10 bra 	$Lt_1_91;
 //<loop> Part of loop body line 558, head labeled $Lt_1_90
	add.u64 	%rd36, %rd33, %rd22;
	ld.shared.s32 	%r92, [%rd36+0];	// id:496 __cuda_smInputPos19040+0x0
	add.s32 	%r93, %r92, %r59;
	mov.s32 	%r94, %r93;
	mov.s32 	%r95, 0;
	mov.s32 	%r96, 0;
	mov.s32 	%r97, 0;
	tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[texInput1,{%r94,%r95,%r96,%r97}];
	.loc	14	82	0
	mov.f32 	%f24, %f20;
	add.s32 	%r98, %r93, 1;
	mov.s32 	%r99, %r98;
	mov.s32 	%r100, 0;
	mov.s32 	%r101, 0;
	mov.s32 	%r102, 0;
	tex.1d.v4.f32.s32 {%f25,%f26,%f27,%f28},[texInput1,{%r99,%r100,%r101,%r102}];
	.loc	14	83	0
	mov.f32 	%f29, %f25;
	.loc	14	88	0
	add.u64 	%rd37, %rd33, %rd23;
	ld.shared.f32 	%f30, [%rd37+0];	// id:497 __cuda_smw220064+0x0
	mul.f32 	%f31, %f30, %f29;
	mov.f32 	%f32, 0f3f800000;		// 1
	sub.f32 	%f33, %f32, %f30;
	mul.f32 	%f34, %f33, %f24;
	add.f32 	%f35, %f31, %f34;
	add.f32 	%f15, %f15, %f35;
	mov.f32 	%f36, %f15;
	.loc	14	89	0
	mul.f32 	%f37, %f31, %f29;
	mad.f32 	%f38, %f24, %f34, %f37;
	add.f32 	%f17, %f17, %f38;
	mov.f32 	%f39, %f17;
	mov.s32 	%r103, %r93;
	mov.s32 	%r104, 0;
	mov.s32 	%r105, 0;
	mov.s32 	%r106, 0;
	tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[texUseInput,{%r103,%r104,%r105,%r106}];
	.loc	14	90	0
	mov.f32 	%f44, %f40;
	mov.s32 	%r107, %r98;
	mov.s32 	%r108, 0;
	mov.s32 	%r109, 0;
	mov.s32 	%r110, 0;
	tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[texUseInput,{%r107,%r108,%r109,%r110}];
	mov.f32 	%f49, %f45;
	mul.f32 	%f50, %f33, %f44;
	mad.f32 	%f51, %f49, %f30, %f50;
	add.f32 	%f11, %f11, %f51;
	mov.f32 	%f52, %f11;
	.loc	14	96	0
	mov.f32 	%f53, 0f3f800000;		// 1
	add.f32 	%f13, %f13, %f53;
	mov.f32 	%f54, %f13;

And the decuda output is this:

add.b32 $ofs4, $ofs3, 0x000008b0
add.u32 $r0, s[$ofs4+0x0000], $r10
add.b32 $r3, $r0, 0x00000001
add.b32 $ofs4, $ofs3, 0x00000cb0
mov.half.b32 $r2, $r0 // (unk0 00020000)
mov.half.b32 $r1, $r3
tex.1d.s32 {$r2,_,_,_}, $tex2, {$r2}
tex.1d.s32 {$r0,_,_,_}, $tex0, {$r0}
tex.1d.s32 {$r1,_,_,_}, $tex0, {$r1}
tex.1d.s32 {$r3,_,_,_}, $tex2, {$r3}
add.rn.f32 $r12, -s[$ofs4+0x0000], c1[$ofs4+0x0028]
mul.rn.f32 $r13, $r12, $r2
mul.rn.f32 $r2, s[$ofs4+0x0000], $r1
mad.rn.f32 $r13, s[$ofs4+0x0000], $r3, $r13
mul.half.rn.f32 $r3, $r12, $r0
mul.half.rn.f32 $r12, $r2, $r1
add.half.rn.f32 $r5, $r5, $r13
add.half.rn.f32 $r1, $r2, $r3
mad.rn.f32 $r0, $r3, $r0, $r12
add.half.rn.f32 $r7, $r7, $r1
add.half.rn.f32 $r6, $r6, $r0
add.rn.f32 $r4, $r4, 0x3f800000
mul.rn.f32 $r5, $r5, 0x4007e69b

A few questions:

  • I couldn’t find any reference to this .half part of the add. What is it?

  • Any chance of changing the separate mul and add instructions into a single mad instead? (A sketch of what I mean follows this list.)

  • Any ideas, suggestions, or insights on this, and on how to squeeze a bit more performance out of this code, are more than welcome :)
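
To make the mad question concrete, here is roughly what I have in mind (just an untested sketch of the body of CalcTraceData, after the two texPhaseInput fetches). It assumes w1 is simply 1.0f - w2, which is what the PTX above seems to compute (the sub.f32 from the constant 0f3f800000), and it writes each accumulation as a single a * b + c expression in the hope that the compiler emits a mad.f32 for each line:

    // Sketch only: same math as the current body, restructured so every statement
    // is an a * b + c that can map onto mad.f32. Assumes w1 == 1.0f - w2.
    // Note that mad.f32 on the sm_1x parts decuda handles truncates the intermediate
    // product, so the result may differ slightly from separate mul + add.
    float w1 = 1.0f - w2;
    fOut1 = fPhaseVal        * w1 + fOut1;
    fOut1 = fPhaseValPlusOne * w2 + fOut1;
    fOut2 = fPhaseVal        * ( fPhaseVal        * w1 ) + fOut2;   // one mul + one mad
    fOut2 = fPhaseValPlusOne * ( fPhaseValPlusOne * w2 ) + fOut2;   // one mul + one mad
    fOut3 = tex1Dfetch( texUseInput, iInputPos )     * w1 + fOut3;
    fOut3 = tex1Dfetch( texUseInput, iInputPos + 1 ) * w2 + fOut3;
    fOut4++;

Is that the kind of rewrite that would help here, or does the compiler already do this on its own?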

thanks

eyal

.half isn’t really an instruction modifier. decuda prints it when an instruction uses the short 32-bit encoding, so that two such instructions are packed into one 64-bit chunk; it doesn’t change what the instruction computes.

Hi,

Thanks for the reply… any performance improvement suggestions??? ;)

eyal