Float vs double precision on Titan V

Hi guys,

I created a simple kernel that performs an FMA per thread in both double and float precision. The objective of the code is to compare the outputs of the two precisions: I divide the float result by the double result (the relative variable in the kernel) and print it on the following line.

#include <stdio.h>
#include <stdlib.h>

#define SIZE 23

__device__ double AD[SIZE] = { -9866.65154024647, 6414.46876973767,
		-6197.84490099783, 6789.84388520848, 4656.50749453597, 8673.96542916471,
		-7484.26208635956, -8827.02013096759, -9246.49610468186,
		-9145.44518523173, 0.414549484352603, 0.746388289656307,
		-0.442820648912679, -0.896772811392324, 0.842169980331744,
		-0.484362075486823, -0.83155268183415, 0.714747720464419,
		-0.771707731198488, 0.646946330857033, 0.863236048450003,
		-0.798130117702055, -0.430501483923713 };

__device__ double BD[SIZE] = { 6462.81956791467, -9614.43122832483,
		2627.99030829307, -8804.41155355035, 9136.95004515449,
		-8949.57876155713, -2295.45814158193, 3872.3107703565,
		-9840.28861553185, 9869.00108700317, 0.406473509129537,
		-0.857046574318794, -0.146306072762992, -0.611596349597419,
		0.638568865045352, 0.219496315503166, 0.602026915824652,
		0.703855774660112, 0.924344803562848, -0.916884736128181,
		-0.712434193013716, -0.407960108885158, 0.901888383562757 };

__device__ double ACCD[SIZE] = { 63766383.8085833, 61671440.4377171,
		16287870.8760915, 59780592.956295, -42546287.0409403, 77628265.357354,
		-17179802.7901642, 34180947.3007734, -90988147.9799452,
		90256394.9276104, -0.16850339976509, 0.639689718248289,
		-0.06478730229465, -0.548462940182346, -0.53778349265337,
		0.106315709189289, 0.500616628799301, -0.503079148090306,
		0.713324068603207, 0.593175121473782, 0.614998879506171,
		-0.325605271045656, 0.388264100186288 };

__global__ void f() {
	double ad = AD[threadIdx.x];
	double bd = BD[threadIdx.x];
	double accd = ACCD[threadIdx.x];

	float af = float(ad);
	float bf = float(bd);
	float accf = float(accd);

	accd += ad * bd;
	accf += af * bf;

	double relative = fabs(double(accf) / accd);
	printf("%d ACC double %.15e ACC float %.15e float/double %lf\n",
			threadIdx.x, accd, accf, relative);
}

void __checkFrameworkErrors(cudaError_t error) {
	if (error != cudaSuccess) {
		printf("CUDA Framework error: %s. Bailing.", cudaGetErrorString(error));
		exit (EXIT_FAILURE);
	}
}

int main() {
	f<<<1, SIZE>>>();
	__checkFrameworkErrors(cudaDeviceSynchronize());
	__checkFrameworkErrors(cudaPeekAtLastError());
}

Then I build it with NVCC 10.1 to run on a Titan V:
nvcc -Xptxas -v -ccbin g++ -gencode arch=compute_70,code=[sm_70,compute_70] --std=c++11 -o test_cuda test_cuda.cu

However, when I execute the code, the results of the division look strange. I get the following output.

0 ACC double -4.835517010640686e+00 ACC float -1.236813545227051e+00 float/double 0.255777
1 ACC double -2.841516310257429e+01 ACC float -3.130780029296875e+01 float/double 1.101799
2 ACC double -5.456034419496163e+00 ACC float -4.415578842163086e+00 float/double 0.809302
3 ACC double 1.300656226226866e+01 ACC float 1.609744262695312e+01 float/double 1.237640
4 ACC double -1.067847764578921e+01 ACC float -1.249450492858887e+01 float/double 1.170064
5 ACC double -7.142597925749672e+01 ACC float -7.923313903808594e+01 float/double 1.109304
6 ACC double 7.549702814049470e+00 ACC float 9.442911148071289e+00 float/double 1.250766
7 ACC double -1.782252604758365e+01 ACC float -1.876491928100586e+01 float/double 1.052877
8 ACC double 4.237251531515085e+01 ACC float 5.037777709960938e+01 float/double 1.188926
9 ACC double -1.354656945280619e+01 ACC float -1.672016143798828e+01 float/double 1.234273
10 ACC double -1.615244734652407e-08 ACC float -2.247021768653212e-08 float/double 1.391134
11 ACC double 1.914866873269971e-07 ACC float 2.162647660952643e-07 float/double 1.129398
12 ACC double 4.778612375355737e-08 ACC float 4.041413959043894e-08 float/double 0.845730
13 ACC double 3.768341414346704e-08 ACC float 4.138978226819745e-08 float/double 1.098355
14 ACC double 3.586233818643276e-08 ACC float 4.864796210313216e-08 float/double 1.356520
15 ACC double 1.825046499057195e-08 ACC float 2.362276418921283e-08 float/double 1.294365
16 ACC double -4.675910303797702e-07 ACC float -5.074264777249482e-07 float/double 1.085193
17 ACC double 1.623837268526149e-07 ACC float 1.235459450299459e-07 float/double 0.760827
18 ACC double 3.740060946708186e-08 ACC float 7.471764718047780e-08 float/double 1.997765
19 ACC double -9.438316355498169e-08 ACC float -7.422294601155954e-08 float/double 0.786400
20 ACC double 1.948344086105084e-09 ACC float 1.961907969416643e-09 float/double 1.006962
21 ACC double -2.132340168873631e-08 ACC float -2.641643348511025e-08 float/double 1.238847
22 ACC double -1.872710377724454e-07 ACC float -1.982469512995522e-07 float/double 1.058610

The division outputs range from 0.25 to 1.99. I was expecting values closer to 1.0. Is this supposed to happen?

You’re exploring the difference at the edge of float precision.
The difference of two large numbers that are almost the same may come out quite differently in double precision vs. float precision: the leading digits cancel, and what remains lives in digits that float never carried in the first place.
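
A minimal host-side sketch of that cancellation, assuming nothing beyond standard C++ (the first value is ACCD[0] from your kernel; the second is just an illustrative nearby value):

#include <stdio.h>

int main() {
	// Two large values that agree in their leading digits; they differ
	// only beyond float's ~7 significant decimal digits.
	double xd = 63766383.8085833;   // ACCD[0] from the kernel above
	double yd = 63766379.0;         // illustrative nearby value

	// Near 6.4e7 a float can only resolve steps of 4, so both values
	// get rounded before the subtraction even happens.
	float xf = float(xd);
	float yf = float(yd);

	printf("double difference: %.15e\n", xd - yd);                  // ~4.8085833
	printf("float  difference: %.15e\n", double(xf) - double(yf)); // a multiple of 4
	return 0;
}

The double result keeps the fractional part; the float result can only land on a multiple of 4, so the two answers already disagree in the first digit.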

When I run your example as ordinary CPU code, I get even larger discrepancies.

$ cat t1558.cpp
#include <stdio.h>
#include <math.h>
#define SIZE 23
#ifdef USE_CUDA
#define _DEV __device__
#define _GLO __global__
#else
#define _DEV
#define _GLO
#endif

_DEV  double AD[SIZE] = { -9866.65154024647, 6414.46876973767,
                -6197.84490099783, 6789.84388520848, 4656.50749453597, 8673.96542916471,
                -7484.26208635956, -8827.02013096759, -9246.49610468186,
                -9145.44518523173, 0.414549484352603, 0.746388289656307,
                -0.442820648912679, -0.896772811392324, 0.842169980331744,
                -0.484362075486823, -0.83155268183415, 0.714747720464419,
                -0.771707731198488, 0.646946330857033, 0.863236048450003,
                -0.798130117702055, -0.430501483923713 };

_DEV double BD[SIZE] = { 6462.81956791467, -9614.43122832483,
                2627.99030829307, -8804.41155355035, 9136.95004515449,
                -8949.57876155713, -2295.45814158193, 3872.3107703565,
                -9840.28861553185, 9869.00108700317, 0.406473509129537,
                -0.857046574318794, -0.146306072762992, -0.611596349597419,
                0.638568865045352, 0.219496315503166, 0.602026915824652,
                0.703855774660112, 0.924344803562848, -0.916884736128181,
                -0.712434193013716, -0.407960108885158, 0.901888383562757 };

_DEV double ACCD[SIZE] = { 63766383.8085833, 61671440.4377171,
                16287870.8760915, 59780592.956295, -42546287.0409403, 77628265.357354,
                -17179802.7901642, 34180947.3007734, -90988147.9799452,
                90256394.9276104, -0.16850339976509, 0.639689718248289,
                -0.06478730229465, -0.548462940182346, -0.53778349265337,
                0.106315709189289, 0.500616628799301, -0.503079148090306,
                0.713324068603207, 0.593175121473782, 0.614998879506171,
                -0.325605271045656, 0.388264100186288 };

_GLO void f() {
#ifndef USE_CUDA
#define _T int(0)
#else
#define _T threadIdx.x
#endif
        double ad = AD[_T];
        double bd = BD[_T];
        double accd = ACCD[_T];

        float af = float(ad);
        float bf = float(bd);
        float accf = float(accd);

        accd += ad * bd;
        accf += af * bf;

        double relative = fabs(double(accf) / accd);
        printf("%d ACC double %.15f ACC float %.15f float/double %f\n",
                        _T, accd, accf, relative);
}
#ifdef USE_CUDA
void __checkFrameworkErrors(cudaError_t error) {
        if (error != cudaSuccess) {
                printf("CUDA Framework error: %s. Bailing.", cudaGetErrorString(error));
                exit (EXIT_FAILURE);
        }
}
#endif
int main() {
#ifndef USE_CUDA
        f();
#else
        f<<<1, 1>>>();
        __checkFrameworkErrors(cudaDeviceSynchronize());
        __checkFrameworkErrors(cudaPeekAtLastError());
#endif
}
$ nvcc -x cu t1558.cpp -o t1558
$ ./t1558
0 ACC double -4.835517011582851 ACC float 0.000000000000000 float/double 0.000000
$ nvcc -x cu t1558.cpp -o t1558 -DUSE_CUDA
$ ./t1558
0 ACC double -4.835517010640686 ACC float -1.236813545227051 float/double 0.255777
$

To a first order approximation, this has nothing to do with CUDA. You’ve only got about 6 decimal digits of precision with float representation. If you start asking why there are differences in the 7th or 8th digit, you’re outside of realistic expectations of the utility of float.

63766383.8085833
******
float digits

63766383.8085833
************
double digits
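
A quick stand-alone way to see the same thing with the first accumulator value (not part of your program, just a sketch):

#include <stdio.h>
#include <float.h>

int main() {
	double d = 63766383.8085833;   // ACCD[0] from the kernel
	float  f = (float) d;          // keeps only ~24 significand bits

	printf("as double: %.15f\n", d);            // 63766383.808583...
	printf("as float : %.15f\n", (double) f);   // rounds to 63766384.000000...
	printf("FLT_EPSILON = %e, DBL_EPSILON = %e\n", FLT_EPSILON, DBL_EPSILON);
	return 0;
}

Everything past roughly the 7th digit of the float value is rounding, and that is exactly where your float/double ratios start drifting away from 1.0.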

Hi guys,

I’m trying to evaluate the correctness of the output results, as ffsantos is doing, but using half precision with tensor cores. At first I compared the results of cuBLAS (with tensor cores) against the tensor core codes provided by the CUDA samples (compute_gemm and simple_wmma_gemm) and noticed some differences, so I decided to compare against the output of cuBLAS without tensor cores. I’m running everything on a Titan V.

I’m using random inputs in the range (0.0, 1.0) and dividing the outputs for comparison. The results are kind of strange because they differ a lot (the division outputs range from 0.40 to 1.90), and I don’t know whether this is expected (since in this case I’m not comparing two different precisions). Is this supposed to happen?
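
For context, here is a minimal sketch showing how much a single value in (0.0, 1.0) gets perturbed just by the round trip through half precision; the input value is arbitrary, and the only assumption is the standard cuda_fp16.h conversion intrinsics:

#include <stdio.h>
#include <cuda_fp16.h>

// Convert a float to half and back, to see how many digits survive.
// Half has a 10+1-bit significand, i.e. roughly 3 decimal digits.
__global__ void roundtrip(float x) {
	__half h = __float2half(x);
	printf("float %.7f  ->  half->float %.7f\n", x, __half2float(h));
}

int main() {
	roundtrip<<<1, 1>>>(0.7146821f);   // arbitrary value in (0.0, 1.0)
	cudaDeviceSynchronize();
	return 0;
}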