Fastmath functions: speed or accuracy?

I’d like to know whether the fastmath functions could be useful to me.

Two factors are important:

  • How large a speed boost can I get with the fastmath functions?
  • How much error do these functions introduce?

Could you help me, or tell me where I can find more information?

Thank you.

The fast math functions use the “special function unit” in each multiprocessor, taking one instruction, whereas the normal implementations can take many, many instructions. The CUDA programming guide does not list the speed difference, but it does list the accuracy for the fast math functions in Table C-3.

You can use the -use_fast_math compiler flag to force the use of intrinsics for the fast math functions. It requires no change to the source, so it will quickly give you an idea of the potential performance effect on your code. If after that you decide to mix “fast” and “regular” math operations, then explicitly use intrinsics where appropriate in your case.
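As a rough sketch of that mixed approach (the kernel and buffer names below are made up for illustration), you call an intrinsic such as __cosf() on the speed-critical path and keep the regular cosf() where full single-precision accuracy matters:

__global__ void k_mixedMath(float *fast_out, float *accurate_out, int N)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N) {
		float x = (float)i * 0.001f;    // small arguments, well inside the valid input range of __cosf()
		fast_out[i]     = __cosf(x);    // intrinsic: single instruction on the special function unit
		accurate_out[i] = cosf(x);      // standard version: the long software sequence
	}
}

Built without -use_fast_math, only the __cosf() call takes the fast path; built with the flag, the compiler rewrites cosf() to __cosf() as well.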

Paulius

Note the difference in the PTX generated from the same source code:

__global__ void k_sequenceLooping(float *ptr, int N) {

	int i = threadIdx.x;

	ptr[i] = cos((float)i);

}

compiled with -use_fast_math

.entry _Z17k_sequenceLoopingPfi

	{

	.reg .u32 %r<3>;

	.reg .u64 %rd<6>;

	.reg .f32 %f<4>;

	.param .u64 __cudaparm__Z17k_sequenceLoopingPfi_ptr;

	.param .s32 __cudaparm__Z17k_sequenceLoopingPfi_N;

	.loc	15	6	0

$LBB1__Z17k_sequenceLoopingPfi:

	.loc	15	8	0

	cvt.s32.u16 	%r1, %tid.x;	 	// 

	cvt.rn.f32.s32 	%f1, %r1;	 	// 

	cos.f32 	%f2, %f1;				// 

	ld.param.u64 	%rd1, [__cudaparm__Z17k_sequenceLoopingPfi_ptr];	// id:16 __cudaparm__Z17k_sequenceLoopingPfi_ptr+0x0

	cvt.u64.s32 	%rd2, %r1;	   	// 

	mul.lo.u64 	%rd3, %rd2, 4;		// 

	add.u64 	%rd4, %rd1, %rd3;		// 

	st.global.f32 	[%rd4+0], %f2; 	// id:17

	.loc	15	10	0

	exit;						 	// 

$LDWend__Z17k_sequenceLoopingPfi:

	} // _Z17k_sequenceLoopingPfi

and without

.const .align 4 .b8 __cudart_i2opi_f[24] = {65,144,67,60,153,149,98,219,192,221,52,245,209,87,39,252,41,21,68,78,110,131,249,162};

	.entry _Z17k_sequenceLoopingPfi

	{

	.reg .u32 %r<78>;

	.reg .u64 %rd<12>;

	.reg .f32 %f<38>;

	.reg .pred %p<14>;

	.param .u64 __cudaparm__Z17k_sequenceLoopingPfi_ptr;

	.param .s32 __cudaparm__Z17k_sequenceLoopingPfi_N;

	.local .align 4 .b8 __cuda___cuda_result_1612[28];

	.loc	15	6	0

$LBB1__Z17k_sequenceLoopingPfi:

	.loc	18	1801	0

	cvt.s32.u16 	%r1, %tid.x;	 	// 

	cvt.rn.f32.s32 	%f1, %r1;	 	// 

	abs.f32 	%f2, %f1;				// 

	mov.f32 	%f3, 0f7f800000;	 	// ((1.0F)/(0.0F))

	setp.eq.f32 	%p1, %f2, %f3;   	// 

	@!%p1 bra 	$Lt_0_3;		   	// 

	mov.f32 	%f4, 0f7fffffff;	 	// nan

	bra.uni 	$Lt_0_1;			 	// 

$Lt_0_3:

	.loc	18	1508	0

	mov.f32 	%f5, 0f473ba700;	 	// 48039

	setp.gt.f32 	%p2, %f2, %f5;   	// 

	@!%p2 bra 	$Lt_0_48;			  // 

	.loc	18	1511	0

	mov.b32 	%r2, %f1;				// 

	and.b32 	%r3, %r2, -2147483648;	// 

	mov.s32 	%r4, %r3;				// 

	shl.b32 	%r5, %r2, 8;		 	// 

	mov.u64 	%rd1, __cudart_i2opi_f;	// 

	mov.u64 	%rd2, __cuda___cuda_result_1612;	// 

	or.b32 	%r6, %r5, -2147483648;	// 

	mov.s32 	%r7, 0;				  // 

	mov.u32 	%r8, 0;				  // 

$Lt_0_52:

 //<loop> Loop body line 1511, nesting depth: 1, iterations: 6

	.loc	18	1527	0

	ld.const.u32 	%r9, [%rd1+0];	  // id:221 __cudart_i2opi_f+0x0

	mul.lo.u32 	%r10, %r6, %r9;   	// 

	add.u32 	%r11, %r10, %r8;	 	// 

	.loc	18	1528	0

	set.gt.u32.u32 	%r12, %r10, %r11;	// 

	neg.s32 	%r13, %r12;			  // 

	mul.hi.u32 	%r14, %r9, %r6;   	// 

	add.u32 	%r8, %r13, %r14;	 	// 

	.loc	18	1529	0

	st.local.u32 	[%rd2+0], %r11; 	// id:222 __cuda___cuda_result_1612+0x0

	add.s32 	%r7, %r7, 1;		 	// 

	add.u64 	%rd2, %rd2, 4;	   	// 

	add.u64 	%rd1, %rd1, 4;	   	// 

	mov.u32 	%r15, 6;			 	// 

	setp.ne.s32 	%p3, %r7, %r15;	  // 

	@%p3 bra 	$Lt_0_52;		   	// 

	.loc	18	1531	0

	st.local.u32 	[__cuda___cuda_result_1612+24], %r8;	// id:223 __cuda___cuda_result_1612+0x0

	.loc	18	1536	0

	shl.b32 	%r16, %r2, 1;			// 

	shr.u32 	%r17, %r16, 24;		  // 

	sub.u32 	%r18, %r17, 128;	 	// 

	mov.u64 	%rd3, __cuda___cuda_result_1612;	// 

	shr.u32 	%r19, %r18, 5;	   	// 

	mov.s32 	%r20, 4;			 	// 

	sub.s32 	%r21, %r20, %r19;		// 

	cvt.s64.s32 	%rd4, %r21;		  // 

	mul.lo.u64 	%rd5, %rd4, 4;		// 

	add.u64 	%rd6, %rd3, %rd5;		// 

	ld.local.u32 	%r8, [%rd6+8];	  // id:224 __cuda___cuda_result_1612+0x0

	.loc	18	1537	0

	ld.local.u32 	%r22, [%rd6+4]; 	// id:225 __cuda___cuda_result_1612+0x0

	and.b32 	%r23, %r18, 31;		  // 

	mov.u32 	%r24, 0;			 	// 

	setp.eq.u32 	%p4, %r23, %r24; 	// 

	@%p4 bra 	$Lt_0_54;		   	// 

	.loc	18	1540	0

	mov.s32 	%r25, 32;				// 

	sub.s32 	%r26, %r25, %r23;		// 

	shr.u32 	%r27, %r22, %r26;		// 

	shl.b32 	%r28, %r8, %r23;	 	// 

	or.b32 	%r8, %r27, %r28;		  // 

	.loc	18	1541	0

	ld.local.u32 	%r29, [%rd6+0]; 	// id:226 __cuda___cuda_result_1612+0x0

	shr.u32 	%r30, %r29, %r26;		// 

	shl.b32 	%r31, %r22, %r23;		// 

	or.b32 	%r22, %r30, %r31;	 	// 

$Lt_0_54:

	.loc	18	1543	0

	shr.u32 	%r7, %r8, 30;			// 

	.loc	18	1545	0

	shr.u32 	%r32, %r22, 30;		  // 

	shl.b32 	%r33, %r8, 2;			// 

	or.b32 	%r8, %r32, %r33;		  // 

	.loc	18	1546	0

	shl.b32 	%r22, %r22, 2;	   	// 

	mov.u32 	%r34, 0;			 	// 

	setp.eq.u32 	%p5, %r22, %r34; 	// 

	@%p5 bra 	$Lt_0_57;		   	// 

	.loc	18	1547	0

	add.u32 	%r35, %r8, 1;			// 

	mov.u32 	%r36, -2147483648;   	// 

	set.gt.u32.u32 	%r37, %r35, %r36;	// 

	neg.s32 	%r38, %r37;			  // 

	bra.uni 	$Lt_0_56;				// 

$Lt_0_57:

	mov.u32 	%r39, -2147483648;   	// 

	set.gt.u32.u32 	%r40, %r8, %r39;	// 

	neg.s32 	%r38, %r40;			  // 

$Lt_0_56:

	.loc	18	1548	0

	add.u32 	%r7, %r7, %r38;		  // 

	.loc	18	1547	0

	neg.s32 	%r41, %r7;		   	// 

	mov.u32 	%r42, 0;			 	// 

	setp.ne.u32 	%p6, %r3, %r42;	  // 

	selp.s32 	%r7, %r41, %r7, %p6;	// 

	mov.u32 	%r43, 0;			 	// 

	setp.eq.u32 	%p7, %r38, %r43; 	// 

	@%p7 bra 	$Lt_0_58;		   	// 

	.loc	18	1553	0

	neg.s32 	%r22, %r22;			  // 

	.loc	18	1555	0

	mov.u32 	%r44, 0;			 	// 

	set.eq.u32.u32 	%r45, %r22, %r44;	// 

	neg.s32 	%r46, %r45;			  // 

	not.b32 	%r47, %r8;		   	// 

	add.u32 	%r8, %r46, %r47;	 	// 

	.loc	18	1556	0

	xor.b32 	%r4, %r3, -2147483648;	// 

$Lt_0_58:

	.loc	18	1558	0

	mov.s32 	%r48, %r7;		   	// 

	mov.u32 	%r49, 0;			 	// 

	setp.le.s32 	%p8, %r8, %r49;	  // 

	mov.u32 	%r50, 0;			 	// 

	@%p8 bra 	$Lt_0_70;		   	// 

$Lt_0_62:

 //<loop> Loop body line 1558, nesting depth: 1, estimated iterations: unknown

	.loc	18	1562	0

	shr.u32 	%r51, %r22, 31;		  // 

	shl.b32 	%r52, %r8, 1;			// 

	or.b32 	%r8, %r51, %r52;		  // 

	.loc	18	1563	0

	shl.b32 	%r22, %r22, 1;	   	// 

	.loc	18	1564	0

	sub.u32 	%r50, %r50, 1;	   	// 

	mov.u32 	%r53, 0;			 	// 

	setp.gt.s32 	%p9, %r8, %r53;	  // 

	@%p9 bra 	$Lt_0_62;		   	// 

	bra.uni 	$Lt_0_60;				// 

$Lt_0_70:

$Lt_0_60:

	.loc	18	1566	0

	mul.lo.u32 	%r22, %r8, -921707870;	// 

	.loc	18	1567	0

	mov.u32 	%r54, -921707870;		// 

	mul.hi.u32 	%r8, %r8, %r54;   	// 

	mov.u32 	%r55, 0;			 	// 

	setp.le.s32 	%p10, %r8, %r55; 	// 

	@%p10 bra 	$Lt_0_64;			  // 

	.loc	18	1569	0

	shr.u32 	%r56, %r22, 31;		  // 

	shl.b32 	%r57, %r8, 1;			// 

	or.b32 	%r8, %r56, %r57;		  // 

	.loc	18	1570	0

	shl.b32 	%r22, %r22, 1;	   	// 

	.loc	18	1571	0

	sub.u32 	%r50, %r50, 1;	   	// 

$Lt_0_64:

	.loc	18	1573	0

	mov.u32 	%r58, 0;			 	// 

	set.ne.u32.u32 	%r59, %r22, %r58;	// 

	neg.s32 	%r60, %r59;			  // 

	add.u32 	%r8, %r60, %r8;		  // 

	shl.b32 	%r61, %r8, 24;	   	// 

	mov.s32 	%r62, 0;			 	// 

	set.lt.u32.s32 	%r63, %r61, %r62;	// 

	neg.s32 	%r64, %r63;			  // 

	shr.u32 	%r65, %r8, 8;			// 

	add.u32 	%r66, %r50, 126;	 	// 

	shl.b32 	%r67, %r66, 23;		  // 

	add.u32 	%r68, %r65, %r67;		// 

	add.u32 	%r69, %r64, %r68;		// 

	or.b32 	%r70, %r4, %r69;		  // 

	mov.b32 	%f6, %r70;		   	// 

	bra.uni 	$Lt_0_2;			 	// 

$Lt_0_48:

	.loc	18	1583	0

	mov.f32 	%f7, 0f3f22f983;	 	// 0.63662

	mul.f32 	%f8, %f1, %f7;	   	// 

	cvt.rni.s32.f32 	%r71, %f8;   	// 

	mov.s32 	%r48, %r71;			  // 

	cvt.rn.f32.s32 	%f9, %r71;		// 

	mov.f32 	%f10, 0f3fc90000;		// 1.57031

	mul.f32 	%f11, %f9, %f10;	 	// 

	sub.f32 	%f12, %f1, %f11;	 	// 

	mov.f32 	%f13, 0f39fd8000;		// 0.000483513

	mul.f32 	%f14, %f9, %f13;	 	// 

	sub.f32 	%f15, %f12, %f14;		// 

	mov.f32 	%f16, 0f34a88000;		// 3.13856e-07

	mul.f32 	%f17, %f9, %f16;	 	// 

	sub.f32 	%f18, %f15, %f17;		// 

	mov.f32 	%f19, 0f2e85a309;		// 6.0771e-11

	mul.f32 	%f20, %f9, %f19;	 	// 

	sub.f32 	%f6, %f18, %f20;	 	// 

$Lt_0_2:

	.loc	18	1804	0

	add.s32 	%r72, %r48, 1;	   	// 

	mul.f32 	%f21, %f6, %f6;		  // 

	and.b32 	%r73, %r72, 1;	   	// 

	mov.u32 	%r74, 0;			 	// 

	setp.eq.s32 	%p11, %r73, %r74;	// 

	@%p11 bra 	$Lt_0_67;			  // 

	.loc	18	1808	0

	mov.f32 	%f22, 0f3f800000;		// 1

	mov.f32 	%f23, 0fbf000000;		// -0.5

	mov.f32 	%f24, 0f3d2aaaa5;		// 0.0416666

	mov.f32 	%f25, 0fbab6061a;		// -0.00138873

	mov.f32 	%f26, 0f37ccf5ce;		// 2.44332e-05

	mad.f32 	%f27, %f21, %f26, %f25;	// 

	mad.f32 	%f28, %f27, %f21, %f24;	// 

	mad.f32 	%f29, %f28, %f21, %f23;	// 

	mad.f32 	%f30, %f29, %f21, %f22;	// 

	bra.uni 	$Lt_0_66;				// 

$Lt_0_67:

	.loc	18	1810	0

	mov.f32 	%f31, 0fbe2aaaa3;		// -0.166667

	mov.f32 	%f32, 0f3c08839e;		// 0.00833216

	mov.f32 	%f33, 0fb94ca1f9;		// -0.000195153

	mad.f32 	%f34, %f21, %f33, %f32;	// 

	mad.f32 	%f35, %f34, %f21, %f31;	// 

	mul.f32 	%f36, %f21, %f35;		// 

	mad.f32 	%f30, %f6, %f36, %f6;	// 

$Lt_0_66:

	and.b32 	%r75, %r72, 2;	   	// 

	mov.u32 	%r76, 0;			 	// 

	setp.eq.s32 	%p12, %r75, %r76;	// 

	@%p12 bra 	$Lt_0_68;			  // 

	.loc	18	1813	0

	neg.f32 	%f30, %f30;			  // 

$Lt_0_68:

	mov.f32 	%f4, %f30;		   	// 

$Lt_0_1:

	.loc	15	8	0

	ld.param.u64 	%rd7, [__cudaparm__Z17k_sequenceLoopingPfi_ptr];	// id:227 __cudaparm__Z17k_sequenceLoopingPfi_ptr+0x0

	cvt.u64.s32 	%rd8, %r1;	   	// 

	mul.lo.u64 	%rd9, %rd8, 4;		// 

	add.u64 	%rd10, %rd7, %rd9;   	// 

	st.global.f32 	[%rd10+0], %f4;	// id:228

	.loc	15	10	0

	exit;						 	// 

$LDWend__Z17k_sequenceLoopingPfi:

	} // _Z17k_sequenceLoopingPfi

OK, I already knew about this compiler option.

The problem is that my application is a bit complex and randomized (a Monte Carlo simulation), so it is not easy for me to evaluate the speed boost or the accuracy of these functions.

At least the fastmath functions drastically reduce the number of instructions in the assembly code, that is, the number of instructions actually executed on the GPU… though I don’t know how much time is gained this way, or how much accuracy is affected.

I have code that calculates many sines and cosines per run. Although I do not have comparative numbers available - I did such comparisons about a year ago - the difference between --use_fast_math and its absence is significant. I routinely use --use_fast_math as a compiler flag, since my calculations are based on measured data, which are certainly far less accurate than float32, and I use algorithms that are stable, i.e. there are no propagating errors that grow over time.

The fastmath operations take 16 to 32 cycles per warp, while the standard versions take “hundreds” of cycles (according to the slides from ECE 498AL). Naturally we’re talking about the difficult functions (logf, sinf, reciprocal, sqrt, etc.), not your everyday muls and adds - those take 4 cycles per warp.
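If you want to turn those cycle counts into wall-clock numbers for your own card, a throwaway micro-benchmark along these lines can give a rough figure. Everything below (kernel names, problem size, iteration count) is made up for illustration; it simply times the same cos-heavy loop once through the standard cosf() and once through the __cosf() intrinsic using cudaEvent timers.

#include <cstdio>
#include <cuda_runtime.h>

// Standard path: cosf() compiles to the long software sequence
// (as long as -use_fast_math is NOT passed to nvcc).
__global__ void k_cosStd(float *out, int n, int iters)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	float x = (float)i * 1e-3f;
	for (int k = 0; k < iters; ++k)
		x = cosf(x);
	out[i] = x;
}

// Intrinsic path: __cosf() maps to the special function unit.
__global__ void k_cosFast(float *out, int n, int iters)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;
	float x = (float)i * 1e-3f;
	for (int k = 0; k < iters; ++k)
		x = __cosf(x);
	out[i] = x;
}

int main()
{
	const int n = 1 << 20, iters = 256, threads = 256;
	const int blocks = (n + threads - 1) / threads;
	float *d_out;
	cudaMalloc(&d_out, n * sizeof(float));

	// Warm-up launch so context creation is not counted in the timings.
	k_cosStd<<<blocks, threads>>>(d_out, n, 1);
	cudaDeviceSynchronize();

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Time the standard-library version.
	cudaEventRecord(start);
	k_cosStd<<<blocks, threads>>>(d_out, n, iters);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	float msStd;
	cudaEventElapsedTime(&msStd, start, stop);

	// Time the intrinsic version.
	cudaEventRecord(start);
	k_cosFast<<<blocks, threads>>>(d_out, n, iters);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);
	float msFast;
	cudaEventElapsedTime(&msFast, start, stop);

	printf("cosf   : %8.3f ms\n__cosf : %8.3f ms\n", msStd, msFast);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_out);
	return 0;
}

Build it once with plain nvcc so cosf() stays the accurate version; if you rebuild the same file with -use_fast_math, the two timings should collapse to roughly the same value.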

The biggest thing to be aware of with the fastmath operations is not how good or bad your input data is, but rather its range! See the list of all fastmath functions (e.g. __cosf()) in the programming guide. They only produce valid results for a given range of input values. There have been a number of questions on the forum in the past few months about invalid values from math functions that turned out to be the result of passing input values outside that range while using the fastmath compiler option.

I always compile without the fastmath option so there are no surprises, and then directly call the fastmath intrinsic functions in the code where and when I am positive the input values will not fall outside the defined range.
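For what it’s worth, a minimal sketch of that pattern (the helper name is made up, and the [-pi, pi] bound is just the range for which the guide quotes the __cosf() error figures; check the table for the function you actually use):

// Call the intrinsic only when the argument is known to be in a safe range,
// and fall back to the accurate version otherwise.
__device__ float cosChecked(float x)
{
	const float PI = 3.14159265f;
	if (fabsf(x) <= PI)
		return __cosf(x);   // fast path: argument is within the documented range
	return cosf(x);         // accurate fallback for everything else
}

The file containing this is still compiled without the fastmath flag, so nothing else in the code silently changes.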