Fastmath functions: speed or accuracy?

I’d like to know whether the fastmath functions could be useful to me.

Two factors are important:

  • How large a speed boost can I get with the fastmath functions?
  • How much error do these functions introduce?

Could you help me, or tell me where I can find more information?

Thank you.

The fast math functions use the “special function unit” in each multiprocessor, taking one instruction, whereas the normal implementations can take many, many instructions. The CUDA programming guide does not list the speed difference, but it does list the accuracy for the fast math functions in Table C-3.

You can use the -use_fast_math compiler flag to force the use of intrinsics for the fast math functions. It requires no change to the source, so it will quickly give you an idea of the potential performance effect on your code. If after that you decide to mix “fast” and “regular” math operations, then explicitly use intrinsics where appropriate in your case.
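As a rough sketch of that mixed approach (the kernel and buffer names below are made up for illustration), you call an intrinsic such as __cosf() on the speed-critical path and keep the regular cosf() where full single-precision accuracy matters:

__global__ void k_mixedMath(float *fast_out, float *accurate_out, int N)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N) {
		float x = (float)i * 0.001f;    // small arguments, well inside the valid input range of __cosf()
		fast_out[i]     = __cosf(x);    // intrinsic: single instruction on the special function unit
		accurate_out[i] = cosf(x);      // standard version: the long software sequence
	}
}

Built without -use_fast_math, only the __cosf() call takes the fast path; built with the flag, the compiler rewrites cosf() to __cosf() as well.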

Paulius

Note the difference in the PTX generated from the same source code:

__global__ void k_sequenceLooping(float *ptr, int N) {

	int i = threadIdx.x;

	ptr[i] = cos((float)i);

}

compiled with -use_fast_math

.entry _Z17k_sequenceLoopingPfi

	{

	.reg .u32 %r<3>;

	.reg .u64 %rd<6>;

	.reg .f32 %f<4>;

	.param .u64 __cudaparm__Z17k_sequenceLoopingPfi_ptr;

	.param .s32 __cudaparm__Z17k_sequenceLoopingPfi_N;

	.loc	15	6	0

$LBB1__Z17k_sequenceLoopingPfi:

	.loc	15	8	0

	cvt.s32.u16 	%r1, %tid.x;	 	// 

	cvt.rn.f32.s32 	%f1, %r1;	 	// 

	cos.f32 	%f2, %f1;				// 

	ld.param.u64 	%rd1, [__cudaparm__Z17k_sequenceLoopingPfi_ptr];	// id:16 __cudaparm__Z17k_sequenceLoopingPfi_ptr+0x0

	cvt.u64.s32 	%rd2, %r1;	   	// 

	mul.lo.u64 	%rd3, %rd2, 4;		// 

	add.u64 	%rd4, %rd1, %rd3;		// 

	st.global.f32 	[%rd4+0], %f2; 	// id:17

	.loc	15	10	0

	exit;						 	// 

$LDWend__Z17k_sequenceLoopingPfi:

	} // _Z17k_sequenceLoopingPfi

and without

.const .align 4 .b8 __cudart_i2opi_f[24] = {65,144,67,60,153,149,98,219,192,221,52,245,209,87,39,252,41,21,68,78,110,131,249,162};

	.entry _Z17k_sequenceLoopingPfi

	{

	.reg .u32 %r<78>;

	.reg .u64 %rd<12>;

	.reg .f32 %f<38>;

	.reg .pred %p<14>;

	.param .u64 __cudaparm__Z17k_sequenceLoopingPfi_ptr;

	.param .s32 __cudaparm__Z17k_sequenceLoopingPfi_N;

	.local .align 4 .b8 __cuda___cuda_result_1612[28];

	.loc	15	6	0

$LBB1__Z17k_sequenceLoopingPfi:

	.loc	18	1801	0

	cvt.s32.u16 	%r1, %tid.x;	 	// 

	cvt.rn.f32.s32 	%f1, %r1;	 	// 

	abs.f32 	%f2, %f1;				// 

	mov.f32 	%f3, 0f7f800000;	 	// ((1.0F)/(0.0F))

	setp.eq.f32 	%p1, %f2, %f3;   	// 

	@!%p1 bra 	$Lt_0_3;		   	// 

	mov.f32 	%f4, 0f7fffffff;	 	// nan

	bra.uni 	$Lt_0_1;			 	// 

$Lt_0_3:

	.loc	18	1508	0

	mov.f32 	%f5, 0f473ba700;	 	// 48039

	setp.gt.f32 	%p2, %f2, %f5;   	// 

	@!%p2 bra 	$Lt_0_48;			  // 

	.loc	18	1511	0

	mov.b32 	%r2, %f1;				// 

	and.b32 	%r3, %r2, -2147483648;	// 

	mov.s32 	%r4, %r3;				// 

	shl.b32 	%r5, %r2, 8;		 	// 

	mov.u64 	%rd1, __cudart_i2opi_f;	// 

	mov.u64 	%rd2, __cuda___cuda_result_1612;	// 

	or.b32 	%r6, %r5, -2147483648;	// 

	mov.s32 	%r7, 0;				  // 

	mov.u32 	%r8, 0;				  // 

$Lt_0_52:

 //<loop> Loop body line 1511, nesting depth: 1, iterations: 6

	.loc	18	1527	0

	ld.const.u32 	%r9, [%rd1+0];	  // id:221 __cudart_i2opi_f+0x0

	mul.lo.u32 	%r10, %r6, %r9;   	// 

	add.u32 	%r11, %r10, %r8;	 	// 

	.loc	18	1528	0

	set.gt.u32.u32 	%r12, %r10, %r11;	// 

	neg.s32 	%r13, %r12;			  // 

	mul.hi.u32 	%r14, %r9, %r6;   	// 

	add.u32 	%r8, %r13, %r14;	 	// 

	.loc	18	1529	0

	st.local.u32 	[%rd2+0], %r11; 	// id:222 __cuda___cuda_result_1612+0x0

	add.s32 	%r7, %r7, 1;		 	// 

	add.u64 	%rd2, %rd2, 4;	   	// 

	add.u64 	%rd1, %rd1, 4;	   	// 

	mov.u32 	%r15, 6;			 	// 

	setp.ne.s32 	%p3, %r7, %r15;	  // 

	@%p3 bra 	$Lt_0_52;		   	// 

	.loc	18	1531	0

	st.local.u32 	[__cuda___cuda_result_1612+24], %r8;	// id:223 __cuda___cuda_result_1612+0x0

	.loc	18	1536	0

	shl.b32 	%r16, %r2, 1;			// 

	shr.u32 	%r17, %r16, 24;		  // 

	sub.u32 	%r18, %r17, 128;	 	// 

	mov.u64 	%rd3, __cuda___cuda_result_1612;	// 

	shr.u32 	%r19, %r18, 5;	   	// 

	mov.s32 	%r20, 4;			 	// 

	sub.s32 	%r21, %r20, %r19;		// 

	cvt.s64.s32 	%rd4, %r21;		  // 

	mul.lo.u64 	%rd5, %rd4, 4;		// 

	add.u64 	%rd6, %rd3, %rd5;		// 

	ld.local.u32 	%r8, [%rd6+8];	  // id:224 __cuda___cuda_result_1612+0x0

	.loc	18	1537	0

	ld.local.u32 	%r22, [%rd6+4]; 	// id:225 __cuda___cuda_result_1612+0x0

	and.b32 	%r23, %r18, 31;		  // 

	mov.u32 	%r24, 0;			 	// 

	setp.eq.u32 	%p4, %r23, %r24; 	// 

	@%p4 bra 	$Lt_0_54;		   	// 

	.loc	18	1540	0

	mov.s32 	%r25, 32;				// 

	sub.s32 	%r26, %r25, %r23;		// 

	shr.u32 	%r27, %r22, %r26;		// 

	shl.b32 	%r28, %r8, %r23;	 	// 

	or.b32 	%r8, %r27, %r28;		  // 

	.loc	18	1541	0

	ld.local.u32 	%r29, [%rd6+0]; 	// id:226 __cuda___cuda_result_1612+0x0

	shr.u32 	%r30, %r29, %r26;		// 

	shl.b32 	%r31, %r22, %r23;		// 

	or.b32 	%r22, %r30, %r31;	 	// 

$Lt_0_54:

	.loc	18	1543	0

	shr.u32 	%r7, %r8, 30;			// 

	.loc	18	1545	0

	shr.u32 	%r32, %r22, 30;		  // 

	shl.b32 	%r33, %r8, 2;			// 

	or.b32 	%r8, %r32, %r33;		  // 

	.loc	18	1546	0

	shl.b32 	%r22, %r22, 2;	   	// 

	mov.u32 	%r34, 0;			 	// 

	setp.eq.u32 	%p5, %r22, %r34; 	// 

	@%p5 bra 	$Lt_0_57;		   	// 

	.loc	18	1547	0

	add.u32 	%r35, %r8, 1;			// 

	mov.u32 	%r36, -2147483648;   	// 

	set.gt.u32.u32 	%r37, %r35, %r36;	// 

	neg.s32 	%r38, %r37;			  // 

	bra.uni 	$Lt_0_56;				// 

$Lt_0_57:

	mov.u32 	%r39, -2147483648;   	// 

	set.gt.u32.u32 	%r40, %r8, %r39;	// 

	neg.s32 	%r38, %r40;			  // 

$Lt_0_56:

	.loc	18	1548	0

	add.u32 	%r7, %r7, %r38;		  // 

	.loc	18	1547	0

	neg.s32 	%r41, %r7;		   	// 

	mov.u32 	%r42, 0;			 	// 

	setp.ne.u32 	%p6, %r3, %r42;	  // 

	selp.s32 	%r7, %r41, %r7, %p6;	// 

	mov.u32 	%r43, 0;			 	// 

	setp.eq.u32 	%p7, %r38, %r43; 	// 

	@%p7 bra 	$Lt_0_58;		   	// 

	.loc	18	1553	0

	neg.s32 	%r22, %r22;			  // 

	.loc	18	1555	0

	mov.u32 	%r44, 0;			 	// 

	set.eq.u32.u32 	%r45, %r22, %r44;	// 

	neg.s32 	%r46, %r45;			  // 

	not.b32 	%r47, %r8;		   	// 

	add.u32 	%r8, %r46, %r47;	 	// 

	.loc	18	1556	0

	xor.b32 	%r4, %r3, -2147483648;	// 

$Lt_0_58:

	.loc	18	1558	0

	mov.s32 	%r48, %r7;		   	// 

	mov.u32 	%r49, 0;			 	// 

	setp.le.s32 	%p8, %r8, %r49;	  // 

	mov.u32 	%r50, 0;			 	// 

	@%p8 bra 	$Lt_0_70;		   	// 

$Lt_0_62:

 //<loop> Loop body line 1558, nesting depth: 1, estimated iterations: unknown

	.loc	18	1562	0

	shr.u32 	%r51, %r22, 31;		  // 

	shl.b32 	%r52, %r8, 1;			// 

	or.b32 	%r8, %r51, %r52;		  // 

	.loc	18	1563	0

	shl.b32 	%r22, %r22, 1;	   	// 

	.loc	18	1564	0

	sub.u32 	%r50, %r50, 1;	   	// 

	mov.u32 	%r53, 0;			 	// 

	setp.gt.s32 	%p9, %r8, %r53;	  // 

	@%p9 bra 	$Lt_0_62;		   	// 

	bra.uni 	$Lt_0_60;				// 

$Lt_0_70:

$Lt_0_60:

	.loc	18	1566	0

	mul.lo.u32 	%r22, %r8, -921707870;	// 

	.loc	18	1567	0

	mov.u32 	%r54, -921707870;		// 

	mul.hi.u32 	%r8, %r8, %r54;   	// 

	mov.u32 	%r55, 0;			 	// 

	setp.le.s32 	%p10, %r8, %r55; 	// 

	@%p10 bra 	$Lt_0_64;			  // 

	.loc	18	1569	0

	shr.u32 	%r56, %r22, 31;		  // 

	shl.b32 	%r57, %r8, 1;			// 

	or.b32 	%r8, %r56, %r57;		  // 

	.loc	18	1570	0

	shl.b32 	%r22, %r22, 1;	   	// 

	.loc	18	1571	0

	sub.u32 	%r50, %r50, 1;	   	// 

$Lt_0_64:

	.loc	18	1573	0

	mov.u32 	%r58, 0;			 	// 

	set.ne.u32.u32 	%r59, %r22, %r58;	// 

	neg.s32 	%r60, %r59;			  // 

	add.u32 	%r8, %r60, %r8;		  // 

	shl.b32 	%r61, %r8, 24;	   	// 

	mov.s32 	%r62, 0;			 	// 

	set.lt.u32.s32 	%r63, %r61, %r62;	// 

	neg.s32 	%r64, %r63;			  // 

	shr.u32 	%r65, %r8, 8;			// 

	add.u32 	%r66, %r50, 126;	 	// 

	shl.b32 	%r67, %r66, 23;		  // 

	add.u32 	%r68, %r65, %r67;		// 

	add.u32 	%r69, %r64, %r68;		// 

	or.b32 	%r70, %r4, %r69;		  // 

	mov.b32 	%f6, %r70;		   	// 

	bra.uni 	$Lt_0_2;			 	// 

$Lt_0_48:

	.loc	18	1583	0

	mov.f32 	%f7, 0f3f22f983;	 	// 0.63662

	mul.f32 	%f8, %f1, %f7;	   	// 

	cvt.rni.s32.f32 	%r71, %f8;   	// 

	mov.s32 	%r48, %r71;			  // 

	cvt.rn.f32.s32 	%f9, %r71;		// 

	mov.f32 	%f10, 0f3fc90000;		// 1.57031

	mul.f32 	%f11, %f9, %f10;	 	// 

	sub.f32 	%f12, %f1, %f11;	 	// 

	mov.f32 	%f13, 0f39fd8000;		// 0.000483513

	mul.f32 	%f14, %f9, %f13;	 	// 

	sub.f32 	%f15, %f12, %f14;		// 

	mov.f32 	%f16, 0f34a88000;		// 3.13856e-07

	mul.f32 	%f17, %f9, %f16;	 	// 

	sub.f32 	%f18, %f15, %f17;		// 

	mov.f32 	%f19, 0f2e85a309;		// 6.0771e-11

	mul.f32 	%f20, %f9, %f19;	 	// 

	sub.f32 	%f6, %f18, %f20;	 	// 

$Lt_0_2:

	.loc	18	1804	0

	add.s32 	%r72, %r48, 1;	   	// 

	mul.f32 	%f21, %f6, %f6;		  // 

	and.b32 	%r73, %r72, 1;	   	// 

	mov.u32 	%r74, 0;			 	// 

	setp.eq.s32 	%p11, %r73, %r74;	// 

	@%p11 bra 	$Lt_0_67;			  // 

	.loc	18	1808	0

	mov.f32 	%f22, 0f3f800000;		// 1

	mov.f32 	%f23, 0fbf000000;		// -0.5

	mov.f32 	%f24, 0f3d2aaaa5;		// 0.0416666

	mov.f32 	%f25, 0fbab6061a;		// -0.00138873

	mov.f32 	%f26, 0f37ccf5ce;		// 2.44332e-05

	mad.f32 	%f27, %f21, %f26, %f25;	// 

	mad.f32 	%f28, %f27, %f21, %f24;	// 

	mad.f32 	%f29, %f28, %f21, %f23;	// 

	mad.f32 	%f30, %f29, %f21, %f22;	// 

	bra.uni 	$Lt_0_66;				// 

$Lt_0_67:

	.loc	18	1810	0

	mov.f32 	%f31, 0fbe2aaaa3;		// -0.166667

	mov.f32 	%f32, 0f3c08839e;		// 0.00833216

	mov.f32 	%f33, 0fb94ca1f9;		// -0.000195153

	mad.f32 	%f34, %f21, %f33, %f32;	// 

	mad.f32 	%f35, %f34, %f21, %f31;	// 

	mul.f32 	%f36, %f21, %f35;		// 

	mad.f32 	%f30, %f6, %f36, %f6;	// 

$Lt_0_66:

	and.b32 	%r75, %r72, 2;	   	// 

	mov.u32 	%r76, 0;			 	// 

	setp.eq.s32 	%p12, %r75, %r76;	// 

	@%p12 bra 	$Lt_0_68;			  // 

	.loc	18	1813	0

	neg.f32 	%f30, %f30;			  // 

$Lt_0_68:

	mov.f32 	%f4, %f30;		   	// 

$Lt_0_1:

	.loc	15	8	0

	ld.param.u64 	%rd7, [__cudaparm__Z17k_sequenceLoopingPfi_ptr];	// id:227 __cudaparm__Z17k_sequenceLoopingPfi_ptr+0x0

	cvt.u64.s32 	%rd8, %r1;	   	// 

	mul.lo.u64 	%rd9, %rd8, 4;		// 

	add.u64 	%rd10, %rd7, %rd9;   	// 

	st.global.f32 	[%rd10+0], %f4;	// id:228

	.loc	15	10	0

	exit;						 	// 

$LDWend__Z17k_sequenceLoopingPfi:

	} // _Z17k_sequenceLoopingPfi

OK, I already knew about this compiler option.

The problem is that my application is a bit complex and randomized (a Monte Carlo simulation), so it is not easy for me to evaluate the speed boost or the accuracy of these functions.

At least the fastmath functions drastically reduce the number of instructions in the assembly code, that is, the number of instructions actually executed on the GPU… though I don’t know how much time is gained this way, or how much accuracy is affected.

I have code that calculates many sines and cosines per run. Although I do not have comparative numbers available - I did such comparisons about a year ago - the difference between --use_fast_math and its absence is significant. I routinely use --use_fast_math as a compiler flag, since my calculations are based on measured data, which are certainly far less accurate than float32, and I use algorithms that are stable, i.e. there are no propagating errors that grow over time.

The fastmath operations take 16 to 32 cycles per warp, while the standard versions take “hundreds” of cycles (according to the slides from ECE 498AL). Naturally we’re talking about the difficult functions (logf, sinf, reciprocal, sqrt, etc.), not your everyday muls and adds - those take 4 cycles per warp.
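If you want to turn those cycle counts into wall-clock numbers for your own card, a throwaway micro-benchmark along these lines can give a rough figure. Everything below (kernel names, problem size, iteration count) is made up for illustration; it simply times the same cos-heavy loop once through the standard cosf() and once through the __cosf() intrinsic using cudaEvent timers.

#include <cstdio>
#include <cuda_runtime.h>

// Standard path: cosf() compiles to the long software sequence
// (as long as -use_fast_math is NOT passed to nvcc).
__global__ void k_cosStd(float *out, int n, int iters)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	float x = (float)i * 1e-3f;
	for (int k = 0; k < iters; ++k)
		x = cosf(x);
	out[i] = x;
}

// Intrinsic path: __cosf() maps to the special function unit.
__global__ void k_cosFast(float *out, int n, int iters)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	float x = (float)i * 1e-3f;
	for (int k = 0; k < iters; ++k)
		x = __cosf(x);
	out[i] = x;
}

int main()
{
	const int n = 1 << 20, iters = 256, threads = 256;
	const int blocks = (n + threads - 1) / threads;
	float *d_out;
	cudaMalloc(&d_out, n * sizeof(float));

	// Warm-up launch so context creation is not counted in the timings.
	k_cosStd<<<blocks, threads>>>(d_out, n, 1);
	cudaDeviceSynchronize();

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Time the standard-library version.
	cudaEventRecord(start);
	k_cosStd<<<blocks, threads>>>(d_out, n, iters);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	float msStd;
	cudaEventElapsedTime(&msStd, start, stop);

	// Time the intrinsic version.
	cudaEventRecord(start);
	k_cosFast<<<blocks, threads>>>(d_out, n, iters);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	float msFast;
	cudaEventElapsedTime(&msFast, start, stop);

	printf("cosf   : %8.3f ms\n__cosf : %8.3f ms\n", msStd, msFast);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_out);
	return 0;
}

Build it once with plain nvcc so cosf() stays the accurate version; if you rebuild the same file with -use_fast_math, the two timings should collapse to roughly the same value.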

The biggest thing to be aware of with the fastmath operations is not how good or bad your input data is, but rather its range! See the list of all fastmath functions (e.g. __cosf()) in the programming guide. They only produce valid results for a given range of input values. There have been a number of questions on the forum in the past few months about invalid values from math functions that turned out to be the result of passing input values outside that range while using the fastmath compiler option.

I always compile without the fastmath option so there are no surprises, and then directly call the fastmath intrinsic functions in the code where and when I am positive the input values will not fall outside the defined range.
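For what it’s worth, a minimal sketch of that pattern (the helper name is made up, and the [-pi, pi] bound is just the range for which the guide quotes the __cosf() error figures; check the table for the function you actually use):

// Call the intrinsic only when the argument is known to be in a safe range,
// and fall back to the accurate version otherwise.
__device__ float cosChecked(float x)
{
	const float PI = 3.14159265f;
	if (fabsf(x) <= PI)
		return __cosf(x);   // fast path: argument is within the documented range
	return cosf(x);         // accurate fallback for everything else
}

The file containing this is still compiled without the fastmath flag, so nothing else in the code silently changes.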