Decuda assistance

eyalhir74 · March 2, 2009, 9:21am

Hi,

I’m having some problems trying to understand the .cubin and decuda output compared to the original source code and to try and see if something can be further optimized.

The original C code, which resides in the main loop and presumably takes most of the time, is as follow:

__inline__ __device__ void CalcTraceData( int iTraceIndex, int iTimeIndex, 

					int Nb, int numMoveSamples, float w2, int InputPos,

					float &fOut1, float &fOut2, float &fOut3, float &fOut4 )

{

		int iInputPos = InputPos + iTimeIndex;

	   float fPhaseVal = tex1Dfetch( texPhaseInput, iInputPos );; 

	   float fPhaseValPlusOne = tex1Dfetch( texPhaseInput, iInputPos + 1 ); 

	   float fPhaseValW1 = fPhaseVal * w1;

	   float fPhaseValPlusOneW2 = fPhaseValPlusOne * w2;

	   fOut1 += fPhaseValW1 + fPhaseValPlusOneW2;

	   fOut2 += fPhaseVal * fPhaseValW1 + fPhaseValPlusOne * fPhaseValPlusOneW2;

	   fOut3 += tex1Dfetch( texUseInput, iInputPos ) * w1 + tex1Dfetch( texUseInput, iInputPos + 1 ) * w2;

	   fOut4++;

}

for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )

{

	CalcTraceData( i, iTimeIndex, 

		smNb[ iTraceIndex ], smnumMoveSamples[ iTraceIndex ], smw2[ iTraceIndex ], smInputPos[ iTraceIndex ],

		fOut1, fOut2, fOu3, fOut4 );

}

I guess the relevant .ptx part is this:

//<loop> Loop body line 558, nesting depth: 1, iterations: 256

	.loc	14	561	0

 // 559		  for( int i = 0; i < BLOCK_THREAD_SIZE; i++ )

 // 560		  {

 // 561				  CalcTraceData( i, iTimeIndex, 

	ld.shared.s32 	%r84, [%rd34+0];	// id:494 __cuda_smnumMoveSamples18016+0x0

	set.gt.u32.s32 	%r85, %r84, %r59;	// 

	neg.s32 	%r86, %r85;			  // 

	ld.shared.s32 	%r87, [%rd35+0];	// id:495 __cuda_smNb16992+0x0

	set.le.u32.s32 	%r88, %r87, %r59;	// 

	neg.s32 	%r89, %r88;			  // 

	and.b32 	%r90, %r86, %r89;		// 

	mov.u32 	%r91, 0;			 	// 

	setp.eq.s32 	%p10, %r90, %r91;	// 

	@%p10 bra 	$Lt_1_91;			  // 

 //<loop> Part of loop body line 558, head labeled $Lt_1_90

	add.u64 	%rd36, %rd33, %rd22; 	// 

	ld.shared.s32 	%r92, [%rd36+0];	// id:496 __cuda_smInputPos19040+0x0

	add.s32 	%r93, %r92, %r59;		// 

	mov.s32 	%r94, %r93;			  // 

	mov.s32 	%r95, 0;			 	// 

	mov.s32 	%r96, 0;			 	// 

	mov.s32 	%r97, 0;			 	// 

	tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[texInput1,{%r94,%r95,%r96,%r97}];

	.loc	14	82	0

	mov.f32 	%f24, %f20;			  // 

	add.s32 	%r98, %r93, 1;	   	// 

	mov.s32 	%r99, %r98;			  // 

	mov.s32 	%r100, 0;				// 

	mov.s32 	%r101, 0;				// 

	mov.s32 	%r102, 0;				// 

	tex.1d.v4.f32.s32 {%f25,%f26,%f27,%f28},[texInput1,{%r99,%r100,%r101,%r102}];

	.loc	14	83	0

	mov.f32 	%f29, %f25;			  // 

	.loc	14	88	0

	add.u64 	%rd37, %rd33, %rd23; 	// 

	ld.shared.f32 	%f30, [%rd37+0];	// id:497 __cuda_smw220064+0x0

	mul.f32 	%f31, %f30, %f29;		// 

	mov.f32 	%f32, 0f3f800000;		// 1

	sub.f32 	%f33, %f32, %f30;		// 

	mul.f32 	%f34, %f33, %f24;		// 

	add.f32 	%f35, %f31, %f34;		// 

	add.f32 	%f15, %f15, %f35;		// 

	mov.f32 	%f36, %f15;			  // 

	.loc	14	89	0

	mul.f32 	%f37, %f31, %f29;		// 

	mad.f32 	%f38, %f24, %f34, %f37;	// 

	add.f32 	%f17, %f17, %f38;		// 

	mov.f32 	%f39, %f17;			  // 

	mov.s32 	%r103, %r93;		 	// 

	mov.s32 	%r104, 0;				// 

	mov.s32 	%r105, 0;				// 

	mov.s32 	%r106, 0;				// 

	tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[texUseInput,{%r103,%r104,%r105,%r106}];

	.loc	14	90	0

	mov.f32 	%f44, %f40;			  // 

	mov.s32 	%r107, %r98;		 	// 

	mov.s32 	%r108, 0;				// 

	mov.s32 	%r109, 0;				// 

	mov.s32 	%r110, 0;				// 

	tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[texUseInput,{%r107,%r108,%r109,%r110}];

	mov.f32 	%f49, %f45;			  // 

	mul.f32 	%f50, %f33, %f44;		// 

	mad.f32 	%f51, %f49, %f30, %f50;	// 

	add.f32 	%f11, %f11, %f51;		// 

	mov.f32 	%f52, %f11;			  // 

	.loc	14	96	0

	mov.f32 	%f53, 0f3f800000;		// 1

	add.f32 	%f13, %f13, %f53;		// 

	mov.f32 	%f54, %f13;			  //

And the .decuda output is this:

add.b32 $ofs4, $ofs3, 0x000008b0

add.u32 $r0, s[$ofs4+0x0000], $r10

add.b32 $r3, $r0, 0x00000001

add.b32 $ofs4, $ofs3, 0x00000cb0

mov.half.b32 $r2, $r0// (unk0 00020000)

mov.half.b32 $r1, $r3

tex.1d.s32 {$r2,_,_,_}, $tex2, {$r2}

tex.1d.s32 {$r0,_,_,_}, $tex0, {$r0}

tex.1d.s32 {$r1,_,_,_}, $tex0, {$r1}

tex.1d.s32 {$r3,_,_,_}, $tex2, {$r3}

add.rn.f32 $r12, -s[$ofs4+0x0000], c1[$ofs4+0x0028]

mul.rn.f32 $r13, $r12, $r2

mul.rn.f32 $r2, s[$ofs4+0x0000], $r1

mad.rn.f32 $r13, s[$ofs4+0x0000], $r3, $r13

mul.half.rn.f32 $r3, $r12, $r0

mul.half.rn.f32 $r12, $r2, $r1

add.half.rn.f32 $r5, $r5, $r13

add.half.rn.f32 $r1, $r2, $r3

mad.rn.f32 $r0, $r3, $r0, $r12

add.half.rn.f32 $r7, $r7, $r1

add.half.rn.f32 $r6, $r6, $r0

add.rn.f32 $r4, $r4, 0x3f800000

mul.rn.f32 $r5, $r5, 0x4007e69b

A few questions:

I couldnt find any reference to this .half. part of the add. What is it?
Any chance to change the seperate mul and add functions to a mad function instead?
Any ideas, suggestions, insights on this issue and how to maybe squeeze a bit more performance out of this code is more then welcomed :)

thanks

eyal

quvhiwxvvmy · March 2, 2009, 5:30pm

.half isn’t really an instruction modifier. It just means two instructions are packed into one 64-bit chunk and nothing more.

eyalhir74 · March 3, 2009, 7:43am

Hi,

Thanks for the reply… any performance improvements suggestions ??? ;)

eyal

Topic		Replies	Views
Cubin assembler is now available decuda 0.4.0 released CUDA Programming and Performance	33	23328	May 21, 2009
Decuda error? CUDA Programming and Performance	9	1574	July 2, 2012
Extra MOV instructions? ...doubles the number of instructions executed... CUDA Programming and Performance	10	7774	July 15, 2010
may I use decuda on tesla? CUDA Programming and Performance	2	4830	March 18, 2010
What do you think of `decuda'? CUDA Programming and Performance	1	5542	July 13, 2010
ptx assembly question mystery registers? CUDA Programming and Performance	7	5705	January 26, 2010
nvcc/ptxas unnecessary lmem loads/stores Bug in nvcc alias analysis/PRE stages CUDA Programming and Performance	19	12095	June 28, 2010
Openness about 'real' cubin instructions CUDA Programming and Performance	27	20658	April 29, 2009
decuda for cuda-3.x CUDA Programming and Performance	4	3477	August 27, 2010
Yet another performance question CUDA Programming and Performance	16	4389	February 12, 2009

Decuda assistance

Related topics