Hi,
the exp() function using floats , gives me a precision up to 6th decimal place. Is this normal?
master waver # nvcc -o bug -L../../lib/lib -lcutil -I../../common/inc/ bug.cu
master waver # ./bug
CPU's exp() != GPU's exp()!
CPU's exp()=0.586093842983245849609375000000
GPU's exp()=0.586093783378601074218750000000
master waver #
I know that float are imprecise, but that’s too much imprecise for 24 bits, or isn’t it? up to 6 decimal places???
the same code, compiled on device emu gives:
master waver # nvcc -o bug -deviceemu -L../../lib/lib -lcutil -I../../common/inc/ bug.cu
master waver # ./bug
CPU's exp()=0.586093842983245849609375000000
GPU's exp()=0.586093842983245849609375000000
master waver #
Well, now it is acceptable for me. I would like to make GPU to give the same result as deviceemu on single precision. How is this done? Any help is greatly apriciated.
And here is the code:
Main code:
master waver # cat bug.cu
#include <cutil_inline.h>
#define _MAGIC_NUMBER_ -0.534275388027487307951446382503
__global__ void bug_reproduce(float *result) {
result[0]=exp(_MAGIC_NUMBER_);;
}
void cuda_test()
{
float *d_result,h_result,cpu_result;
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilSafeCall(cudaMalloc((void**)&d_result,sizeof(float)));
bug_reproduce<<< 1 , 1 >>>(d_result);
cutilCheckMsg("Kernel execution failed");
cutilSafeCall(cudaMemcpy( &h_result,d_result,sizeof(float),cudaMemcpyDeviceToHost));
cpu_result=exp(_MAGIC_NUMBER_);
if (cpu_result!=h_result) {
printf("CPU's exp() != GPU's exp()!\n");
}
printf("CPU's exp()=%2.30f\nGPU's exp()=%2.30f\n",cpu_result,h_result);
}
int main() {
cuda_test();
}
master waver #
Actually, the PTX code shows a compilation of 32 bits. And gives 6 decimal places precision??? :mellow: :mellow:
master waver # nvcc -ptx -L../../lib/lib -lcutil -I../../common/inc/ bug.cu
master waver # cat bug.ptx
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc built on 2009-04-09
.reg .u32 %ra<17>;
.reg .u64 %rda<17>;
.reg .f32 %fa<17>;
.reg .f64 %fda<17>;
.reg .u32 %rv<5>;
.reg .u64 %rdv<5>;
.reg .f32 %fv<5>;
.reg .f64 %fdv<5>;
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000178d_00000000-7_bug.cpp3.i (/tmp/ccBI#.8XcZWT)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000178d_00000000-6_bug.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-pc-linux-gnu/4.3.2/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/texture_types.h"
.file 10 "/usr/local/cuda/bin/../include/vector_types.h"
.file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 13 "/usr/include/bits/types.h"
.file 14 "/usr/include/time.h"
.file 15 "bug.cu"
.file 16 "/usr/local/cuda/bin/../include/common_functions.h"
.file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
.file 18 "/usr/local/cuda/bin/../include/math_functions.h"
.file 19 "/usr/local/cuda/bin/../include/device_functions.h"
.file 20 "/usr/local/cuda/bin/../include/math_constants.h"
.file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/bin/../include/common_types.h"
.file 25 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 26 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.entry _Z13bug_reproducePf (
.param .u64 __cudaparm__Z13bug_reproducePf_result)
{
.reg .u64 %rd<3>;
.reg .f32 %f<14>;
.loc 15 5 0
$LBB1__Z13bug_reproducePf:
.loc 15 6 0
mov.f32 %f1, 0fbf4552eb; // -0.770796
cvt.rzi.f32.f32 %f2, %f1; //
ex2.approx.f32 %f3, %f2; //
mov.f32 %f4, 0fbf08c646; // -0.534275
mov.f32 %f5, 0fbf317200; // -0.693146
mad.f32 %f6, %f2, %f5, %f4; //
mov.f32 %f7, 0fb5bfbe8e; // -1.42861e-06
mad.f32 %f8, %f2, %f7, %f6; //
mov.f32 %f9, 0f3fb8aa3b; // 1.4427
mul.f32 %f10, %f8, %f9; //
ex2.approx.f32 %f11, %f10; //
mul.f32 %f12, %f3, %f11; //
ld.param.u64 %rd1, [__cudaparm__Z13bug_reproducePf_result]; // id:11 __cudaparm__Z13bug_reproducePf_result+0x0
st.global.f32 [%rd1+0], %f12; // id:12
.loc 15 7 0
exit; //
$LDWend__Z13bug_reproducePf:
} // _Z13bug_reproducePf
master waver #
System info (in case it matters):
master waver # ../../bin/linux/release/deviceQuery
CUDA Device Query (Runtime API) version (CUDART static linking)
There is 1 device supporting CUDA
Device 0: "GeForce GTX 280"
CUDA Capability Major revision number: 1
CUDA Capability Minor revision number: 3
Total amount of global memory: 1073479680 bytes
Number of multiprocessors: 30
Number of cores: 240
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 1.35 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: No
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads can use this device simultaneously)
Test PASSED
Press ENTER to exit...
master waver # uname -a
Linux master 2.6.28-gentoo-r5 #4 SMP Tue May 26 11:27:16 Local time zone must be set--see zic x86_64 Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz GenuineIntel GNU/Linux
master waver #
master waver # nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2009 NVIDIA Corporation
Built on Thu_Apr__9_05:05:52_PDT_2009
Cuda compilation tools, release 2.2, V0.2.1221
master waver #