exp() gives too much impresision with float


the exp() function using floats , gives me a precision up to 6th decimal place. Is this normal?

master waver # nvcc -o bug -L../../lib/lib -lcutil -I../../common/inc/ bug.cu

master waver # ./bug

CPU's exp() != GPU's exp()!

CPU's exp()=0.586093842983245849609375000000

GPU's exp()=0.586093783378601074218750000000

master waver #

I know that float are imprecise, but that’s too much imprecise for 24 bits, or isn’t it? up to 6 decimal places???

the same code, compiled on device emu gives:

master waver # nvcc -o bug -deviceemu -L../../lib/lib -lcutil -I../../common/inc/ bug.cu

master waver # ./bug

CPU's exp()=0.586093842983245849609375000000

GPU's exp()=0.586093842983245849609375000000

master waver #

Well, now it is acceptable for me. I would like to make GPU to give the same result as deviceemu on single precision. How is this done? Any help is greatly apriciated.

And here is the code:

Main code:

master waver # cat bug.cu

#include <cutil_inline.h>

#define _MAGIC_NUMBER_ -0.534275388027487307951446382503

__global__ void bug_reproduce(float *result) { 



void cuda_test()


	float *d_result,h_result,cpu_result;

	cudaSetDevice( cutGetMaxGflopsDeviceId() );


	bug_reproduce<<< 1 , 1 >>>(d_result);

	cutilCheckMsg("Kernel execution failed");

	cutilSafeCall(cudaMemcpy( &h_result,d_result,sizeof(float),cudaMemcpyDeviceToHost));


	if (cpu_result!=h_result) {

		printf("CPU's exp() != GPU's exp()!\n");


	printf("CPU's exp()=%2.30f\nGPU's exp()=%2.30f\n",cpu_result,h_result);


int main() {



master waver #

Actually, the PTX code shows a compilation of 32 bits. And gives 6 decimal places precision??? :mellow: :mellow:

master waver # nvcc -ptx -L../../lib/lib -lcutil -I../../common/inc/ bug.cu

master waver # cat bug.ptx

		.version 1.4

		.target sm_10, map_f64_to_f32

		// compiled with /usr/local/cuda/open64/lib//be

		// nvopencc built on 2009-04-09

		.reg .u32 %ra<17>;

		.reg .u64 %rda<17>;

		.reg .f32 %fa<17>;

		.reg .f64 %fda<17>;

		.reg .u32 %rv<5>;

		.reg .u64 %rdv<5>;

		.reg .f32 %fv<5>;

		.reg .f64 %fdv<5>;


		// Compiling /tmp/tmpxft_0000178d_00000000-7_bug.cpp3.i (/tmp/ccBI#.8XcZWT)



		// Options:


		//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64

		//  -O3 (Optimization level)

		//  -g0 (Debug level)

		//  -m2 (Report advisories)


		.file   1	   "<command-line>"

		.file   2	   "/tmp/tmpxft_0000178d_00000000-6_bug.cudafe2.gpu"

		.file   3	   "/usr/lib/gcc/x86_64-pc-linux-gnu/4.3.2/include/stddef.h"

		.file   4	   "/usr/local/cuda/bin/../include/crt/device_runtime.h"

		.file   5	   "/usr/local/cuda/bin/../include/host_defines.h"

		.file   6	   "/usr/local/cuda/bin/../include/builtin_types.h"

		.file   7	   "/usr/local/cuda/bin/../include/device_types.h"

		.file   8	   "/usr/local/cuda/bin/../include/driver_types.h"

		.file   9	   "/usr/local/cuda/bin/../include/texture_types.h"

		.file   10	  "/usr/local/cuda/bin/../include/vector_types.h"

		.file   11	  "/usr/local/cuda/bin/../include/device_launch_parameters.h"

		.file   12	  "/usr/local/cuda/bin/../include/crt/storage_class.h"

		.file   13	  "/usr/include/bits/types.h"

		.file   14	  "/usr/include/time.h"

		.file   15	  "bug.cu"

		.file   16	  "/usr/local/cuda/bin/../include/common_functions.h"

		.file   17	  "/usr/local/cuda/bin/../include/crt/func_macro.h"

		.file   18	  "/usr/local/cuda/bin/../include/math_functions.h"

		.file   19	  "/usr/local/cuda/bin/../include/device_functions.h"

		.file   20	  "/usr/local/cuda/bin/../include/math_constants.h"

		.file   21	  "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"

		.file   22	  "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"

		.file   23	  "/usr/local/cuda/bin/../include/sm_13_double_functions.h"

		.file   24	  "/usr/local/cuda/bin/../include/common_types.h"

		.file   25	  "/usr/local/cuda/bin/../include/texture_fetch_functions.h"

		.file   26	  "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

		.entry _Z13bug_reproducePf (

				.param .u64 __cudaparm__Z13bug_reproducePf_result)


		.reg .u64 %rd<3>;

		.reg .f32 %f<14>;

		.loc	15	  5	   0


		.loc	15	  6	   0

		mov.f32		 %f1, 0fbf4552eb;		// -0.770796

		cvt.rzi.f32.f32		 %f2, %f1;	   // 

		ex2.approx.f32  %f3, %f2;	   // 

		mov.f32		 %f4, 0fbf08c646;		// -0.534275

		mov.f32		 %f5, 0fbf317200;		// -0.693146

		mad.f32		 %f6, %f2, %f5, %f4;	 // 

		mov.f32		 %f7, 0fb5bfbe8e;		// -1.42861e-06

		mad.f32		 %f8, %f2, %f7, %f6;	 // 

		mov.f32		 %f9, 0f3fb8aa3b;		// 1.4427

		mul.f32		 %f10, %f8, %f9;		 // 

		ex2.approx.f32  %f11, %f10;	 // 

		mul.f32		 %f12, %f3, %f11;		// 

		ld.param.u64	%rd1, [__cudaparm__Z13bug_reproducePf_result];  // id:11 __cudaparm__Z13bug_reproducePf_result+0x0

		st.global.f32   [%rd1+0], %f12; // id:12

		.loc	15	  7	   0

		exit;						   // 


		} // _Z13bug_reproducePf

master waver #

System info (in case it matters):

master waver # ../../bin/linux/release/deviceQuery

CUDA Device Query (Runtime API) version (CUDART static linking)

There is 1 device supporting CUDA

Device 0: "GeForce GTX 280"

  CUDA Capability Major revision number:		 1

  CUDA Capability Minor revision number:		 3

  Total amount of global memory:				 1073479680 bytes

  Number of multiprocessors:					 30

  Number of cores:							   240

  Total amount of constant memory:			   65536 bytes

  Total amount of shared memory per block:	   16384 bytes

  Total number of registers available per block: 16384

  Warp size:									 32

  Maximum number of threads per block:		   512

  Maximum sizes of each dimension of a block:	512 x 512 x 64

  Maximum sizes of each dimension of a grid:	 65535 x 65535 x 1

  Maximum memory pitch:						  262144 bytes

  Texture alignment:							 256 bytes

  Clock rate:									1.35 GHz

  Concurrent copy and execution:				 Yes

  Run time limit on kernels:					 No

  Integrated:									No

  Support host page-locked memory mapping:	   Yes

  Compute mode:								  Default (multiple host threads can use this device simultaneously)


Press ENTER to exit...

master waver # uname -a

Linux master 2.6.28-gentoo-r5 #4 SMP Tue May 26 11:27:16 Local time zone must be set--see zic  x86_64 Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz GenuineIntel GNU/Linux

master waver # 

master waver # nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver

Copyright (c) 2005-2009 NVIDIA Corporation

Built on Thu_Apr__9_05:05:52_PDT_2009

Cuda compilation tools, release 2.2, V0.2.1221

master waver #

Blame the precission of floats:
GPU=0.586093783378601074218750000000 == 0 01111110 00101100000101000111110
CPU=0.586093842983245849609375000000 == 0 01111110 00101100000101000111111

Indeed. In general, single precision floats are good only to 6 significant digits.

then, how do you explain that if you change the MAGIC_NUMBER to … say -0.134275388027487307951446382503

the GPU and CPU exp() functions match ??? (using floats and computing capability of 1.0)

master waver # head bug.cu

#include <cutil_inline.h>

#define _MAGIC_NUMBER_ -0.134275388027487307951446382503

__global__ void bug_reproduce(float *result) { 



void cuda_test()


master waver # nvcc -o bug -L../../lib/lib -lcutil -I../../common/inc/ bug.cu

master waver # ./bug

CPU's exp()=0.874349236488342285156250000000

GPU's exp()=0.874349236488342285156250000000

master waver #

That value is still only good to about 6 figures. Printing lots of digits doesn’t mean they’re all correct. I used your example and used a higher order math package to find the real answer for your example. As expected, the floating point precision is only about 7 decimal digits.

CPU’s exp() =0.874349236488342285156250000000

Correct value=0.87434924613136686628922468071

I don’t see any problem with this. You found a number which happens to give the exact same answer with the GPU and CPU implementations of exp(). There is no requirement that the two implementations have to differ after 6 decimal places. It’s just very, very likely. :)

According to Programming Guide for Cuda 2.2, Page 120, Table C-2
Maximum ulp error for exp(x) function is 1, which means that the computed value may be different (but doesn’t have to be) to what in reality it should be by 1 Unit of Least Precision (hence ULP).
In my first answer I gave you binary representation of your CPU and GPU-computed values. As you can see, they differ only at the last position - the least significant bit. That’s this ulp.

thanks, btw how did you converted it to binary? i did a printf("%X") but it differs a lot. i want to repeat the steps you did, if its possible of course.

master waver # cat bin.c

#include <stdio.h>

main() {

	printf("%2.30f %X\n",0.586093783378601074218750000000f,0.5860937833786010742


	printf("%2.30f %X\n",0.586093842983245849609375000000f,0.5860938429832458496




master waver # gcc -o bin bin.c

master waver # ./bin

0.586093783378601074218750000000 CE57D498

0.586093842983245849609375000000 C657A000


master waver #

I just read bit by bit.


float x=[whatever number];

int y=(int)&x;

for (int i=31; i>=0; --i) {

if (i==30 || i==22)

printf(" ");

printf((*y & (1 << i))?“1”:“0”);


Results of your printfs are for some reason incorrect though! Something weird with printf function there…

it appears that the previous code was printing the addres of the constant , instead of the value. here is the correct code:

master waver # cat bin.c

#include <stdio.h>

#include <sys/types.h>

void float2bin(float x){

	int i;

	int *y=(int*)&x;

	for (i=31; i>=0; --i) {

		if (i==30 || i==22)

		printf(" ");

		printf((*y & (1 << i))?"1":"0");



main() {

	uint *int1,*int2;

	float float1,float2;



	int1=(uint*) &float1;

	int2=(uint*) &float2;

	printf("%2.30f %X\n",float1,*int1);

	printf("%2.30f %X\n",float2,*int2);






master waver # gcc -o bin bin.c

master waver # ./bin

0.586093783378601074218750000000 3F160A3E

0.586093842983245849609375000000 3F160A3F

0 01111110 00101100000101000111110

0 01111110 00101100000101000111111

master waver #

nice little float2bin helper function there! In the past I’ve always just puzzled through the hex, counting digits in my head, but after seeing it in binary, it sure makes it easy to compare…

Are you explicitly compiling for a 64bit device. Otherwise it would be quite reasonable for the compiler to implicitly cast a 64 bit exp() to a 32bit expf()