CUDA math produces a slightly different result to C++ math

Hi everyone, I have a function that computes the exact band distance field, and I have attempted to remake it in CUDA. While it is working, it’s producing slightly different results, which in my case is undesirable.

The original CPU code looks something like this:

Array3D<float> Phi;

glm::ivec3 t;
float invDX = 1.0f / DX;

for (size_t tIdx = 0; tIdx < MeshTriangleCount; tIdx++) {
    t = MeshTriangles[tIdx];

    glm::vec3 p = MeshVertices[t.x];
    glm::vec3 q = MeshVertices[t.y];
    glm::vec3 r = MeshVertices[t.z];

    float fip = p.x * invDX;
    float fjp = p.y * invDX;
    float fkp = p.z * invDX;

    float fiq = q.x * invDX;
    float fjq = q.y * invDX;
    float fkq = q.z * invDX;

    float fir = r.x * invDX;
    float fjr = r.y * invDX;
    float fkr = r.z * invDX;

    int i0 = glm::clamp(int(fmin(fip, fmin(fiq, fir))) - bandWidth, 0, size.x - 1);
    int j0 = glm::clamp(int(fmin(fjp, fmin(fjq, fjr))) - bandWidth, 0, size.y - 1);
    int k0 = glm::clamp(int(fmin(fkp, fmin(fkq, fkr))) - bandWidth, 0, size.z - 1);

    int i1 = glm::clamp(int(fmax(fip, fmax(fiq, fir))) + bandWidth + 1, 0, size.x - 1);
    int j1 = glm::clamp(int(fmax(fjp, fmax(fjq, fjr))) + bandWidth + 1, 0, size.y - 1);
    int k1 = glm::clamp(int(fmax(fkp, fmax(fkq, fkr))) + bandWidth + 1, 0, size.z - 1);

    for (int k = k0; k <= k1; k++) {
        for (int j = j0; j <= j1; j++) {
            for (int i = i0; i <= i1; i++) {
                glm::vec3 gPos = GridIndexToPosition(i, j, k, DX);
                float d = PointToTriangleDistance(gPos, p, q, r);
                if (d < Phi(i, j, k)) {
                    Phi.Set(i, j, k, d);
                }
            }
        }
    }
}

And this is the new GPU version:

__device__ Array3D<float> Phi;

static __global__ void ComputeExactBandDistanceFieldKernel(int bandWidth, float DX, glm::ivec3 size, const glm::vec3* vertices, int vertexCount, const glm::ivec3* triangles, int triangleCount) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= triangleCount) {
        return; // guard against threads past the last triangle
    }

    float invDX = 1.0f / DX;

    glm::ivec3 t = triangles[index];

    glm::vec3 p = vertices[t.x];
    glm::vec3 q = vertices[t.y];
    glm::vec3 r = vertices[t.z];

    float fip = p.x * invDX;
    float fjp = p.y * invDX;
    float fkp = p.z * invDX;

    float fiq = q.x * invDX;
    float fjq = q.y * invDX;
    float fkq = q.z * invDX;

    float fir = r.x * invDX;
    float fjr = r.y * invDX;
    float fkr = r.z * invDX;

    int i0 = glm::clamp(int(min(fip, min(fiq, fir))) - bandWidth, 0, size.x - 1);
    int j0 = glm::clamp(int(min(fjp, min(fjq, fjr))) - bandWidth, 0, size.y - 1);
    int k0 = glm::clamp(int(min(fkp, min(fkq, fkr))) - bandWidth, 0, size.z - 1);

    int i1 = glm::clamp(int(max(fip, max(fiq, fir))) + bandWidth + 1, 0, size.x - 1);
    int j1 = glm::clamp(int(max(fjp, max(fjq, fjr))) + bandWidth + 1, 0, size.y - 1);
    int k1 = glm::clamp(int(max(fkp, max(fkq, fkr))) + bandWidth + 1, 0, size.z - 1);

    for (int k = k0; k <= k1; k++) {
        for (int j = j0; j <= j1; j++) {
            for (int i = i0; i <= i1; i++) {
                glm::vec3 gPos = GridIndexToPosition(i, j, k, DX);
                float d = PointToTriangleDistance(gPos, p, q, r);
                if (d < Phi(i, j, k)) {
                    Phi.Set(i, j, k, d);
                }
            }
        }
    }
}
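
For reference, the kernel is launched with one thread per triangle, roughly like this (a minimal sketch; the device pointer names and block size here are illustrative, not my exact code):

// Sketch: one thread per triangle. d_vertices / d_triangles are assumed to be
// the device-side copies of MeshVertices / MeshTriangles.
int threadsPerBlock = 256;
int numBlocks = (MeshTriangleCount + threadsPerBlock - 1) / threadsPerBlock;
ComputeExactBandDistanceFieldKernel<<<numBlocks, threadsPerBlock>>>(
    bandWidth, DX, size, d_vertices, MeshVertexCount, d_triangles, MeshTriangleCount);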

And these are the functions that were used:

static __device__ __host__ glm::vec3 GridIndexToPosition(int i, int j, int k, float dx) {
	return { i * dx, j * dx, k * dx };
}

__device__ __host__ float PointToTriangleDistance(const glm::vec3& x0, const glm::vec3& x1, const glm::vec3& x2, const glm::vec3& x3) {
	glm::vec3 x13 = x1 - x3;
	glm::vec3 x23 = x2 - x3;
	glm::vec3 x03 = x0 - x3;

	float m13 = glm::length2(x13);
	float m23 = glm::length2(x23);
	float d = glm::dot(x13, x23);
	float invdet = 1.0f / fmax(m13 * m23 - d * d, 1e-30f);
	float a = glm::dot(x13, x03);
	float b = glm::dot(x23, x03);

	float w23 = invdet * (m23 * a - d * b);
	float w31 = invdet * (m13 * b - d * a);
	float w12 = 1 - w23 - w31;

	if (w23 >= 0 && w31 >= 0 && w12 >= 0) {
		return glm::length(x0 - (w23 * x1 + w31 * x2 + w12 * x3));
	}
	else {
		if (w23 > 0) {
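			// this rules out edge 2-3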
			float d1 = PointToSegmentDistance(x0, x1, x2);
			float d2 = PointToSegmentDistance(x0, x1, x3);
			return fmin(d1, d2);
		}
		else if (w31 > 0) {
			// this rules out edge 1-3
			float d1 = PointToSegmentDistance(x0, x1, x2);
			float d2 = PointToSegmentDistance(x0, x2, x3);
			return fmin(d1, d2);
		}
		else {
			// w12 must be >0, ruling out edge 1-2
			float d1 = PointToSegmentDistance(x0, x1, x3);
			float d2 = PointToSegmentDistance(x0, x2, x3);
			return fmin(d1, d2);
		}
	}
}

__device__ __host__ float PointToSegmentDistance(const glm::vec3& x0, const glm::vec3& x1, const glm::vec3& x2) {
	glm::vec3 dx = x2 - x1;
	float m2 = glm::length2(dx);
	float s12 = glm::dot(x2 - x0, dx) / m2;
	if (s12 < 0) {
		s12 = 0;
	}
	else if (s12 > 1) {
		s12 = 1;
	}
	return glm::length(x0 - (s12 * x1 + (1 - s12) * x2));
}

When I input the same values into both the CPU and the GPU version and then log the values in Phi, I get slightly different results that look something like this:

CPU        GPU        Same?
0          0          TRUE
0.257427   0.257427   TRUE
0.964125   0.964125   TRUE
0.468518   0.468518   TRUE
0.672586   0.672586   TRUE
1.08635    1.08635    TRUE
0.968339   0.968339   TRUE
0.257427   1.08102    FALSE
1.37804    1.37804    TRUE
0.463216   0.463216   TRUE
0.67111    0.673154   FALSE
0.257574   0.257574   TRUE
0.672819   0.672995   FALSE

I’m wondering if this is normal on my GPU/across CUDA (my GPU is a GTX 1070 with compute capability 6.1). Note that I’m not passing the --use_fast_math flag.

Thanks in advance.

The function glm::length likely uses sqrt(), and there will be differences in precision between implementations.

Also note that CUDA may decide to contract some operations into a fused multiply-add (FMA), which also affects precision. Try passing -fmad=false to prevent this.

By default the CUDA compiler uses accurate sqrt(), i.e. correctly rounded square root as specified by IEEE-754. So this is unlikely to be the source of differences, unless the compiler used for the CPU implementation does something funky (some compilers default to “fast-math” mode).

As @cbuchner1 says, the most likely source of differences is the CUDA compiler contracting FMUL followed by dependent FADD to FMA, and this can be turned off by specifying -fmad=false on the nvcc command line.

However, I will question the wisdom of such a move. In general, the use of FMA will improve both performance and accuracy of floating-point computation. In floating-point computation, mismatches between different platforms are to be expected unless one compiles with the strictest IEEE-754 compliance and sticks to elementary math operations. If FMA merging is indeed identified as the root cause of the differences, and it is important to keep the results as similar as possible, what I would try if this were my code is to ensure that FMA is used on the CPU as well (all host CPU architectures supported by CUDA include FMA instructions).
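
To sketch the idea (hypothetical helper names; this is an illustration, not code from the thread):

#include <cmath>

// a*b + c computed two ways. The GPU may contract the plain expression into a
// single FMA (one rounding); calling std::fma explicitly on the host matches
// that, while the plain expression corresponds to the -fmad=false behavior.
// (Host compilers may contract the plain expression too, depending on their
// floating-point contraction settings.)
float fused(float a, float b, float c)   { return std::fma(a, b, c); } // one rounding
float unfused(float a, float b, float c) { return a * b + c; }         // two roundings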

I don’t know what glm is (is that an NVIDIA-provided library?), but it seems plausible that glm::length() and glm::length2() are based on hypot() and/or norm3d(), and with such math functions there could well be differences, i.e. there is no guarantee of bit-wise matches between any two platforms.

I do not immediately recognize the computation here (pointers welcome) but am vaguely aware that there are different ways of computing point to triangle distance. It seems to me this computation may not be using the numerically most robust way, which is often important in graphical primitives to avoid artifacts.

Side-remark: Whenever there is computation (a * b - c * d) one would probably want to employ a robust difference-of-products primitive, which can be constructed elegantly using FMAs at minor performance cost, see: algorithm - Accurate floating-point computation of the sum and difference of two products - Stack Overflow
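
To make that concrete, the idiom looks like this (a sketch; note that m13 * m23 - d * d in the PointToTriangleDistance code above is exactly this a*b - c*d pattern):

// Kahan-style difference of products: computes a*b - c*d accurately by using
// FMA to recover the rounding error of one of the products.
__device__ __host__ float DiffOfProducts(float a, float b, float c, float d) {
    float w = c * d;
    float e = fmaf(c, d, -w); // rounding error of w, captured exactly
    float f = fmaf(a, b, -w); // a*b - w, with a single rounding
    return f - e;             // (a*b - w) - (c*d - w) = a*b - c*d
}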

GLM appears to be the OpenGL Mathematics library. It provides primitives and math operators similar to those used in the OpenGL shading language.

Firstly, thank you both for replying so fast, I really appreciate it. Secondly, I’ve tried using the suggested -fmad=false flag, but the inaccuracy still persists, which makes me believe the error is being produced by glm. I’ll try replacing the functions myself and will let you know how I end up doing. Furthermore, I don’t think using FMA on the CPU will work: the code above is used for calculating SDFs, and I’m using the CPU implementation as a baseline that will get removed later.

In case I don’t manage to get it working I’ll also check out the suggestion by @njuffa and find a different triangle-point distance algorithm.

Alright, so I’ve tried implementing my own functions for length and length2. I’ve tried using sqrt and sqrtf (I also enabled these compiler flags just to be sure I was getting the best possible results: -prec-sqrt=true -prec-div=true), but the results were even worse than when I was using glm’s length.

(particle samples of an SDF generated using sqrt - contains strong artifacts)


After that I tried using sqrtl, which, without any compiler flags, produces the desired results.

(particle samples of an SDF generated using sqrtl - no artifacts)

I will still have to profile how much of a performance hit using sqrtl is, but I believe the CUDA implementation is still around 5x faster than the original, CPU-based one.

Thank you all for your comments and advice.

Those are the settings the CUDA compiler uses by default. For what it is worth, they only apply to float computation. Differences in floating-point results between two platforms are just that: differences. Only comparison with a higher-precision reference can determine whether any one version is more or less accurate.

Sorry, you lost me there. sqrtl is the square root variant for long double, which is not supported in CUDA device code. GPUs only offer IEEE-754 binary32 format (exposed as float) and IEEE-754 binary64 format (exposed as double). So I am not clear where you applied the sqrtl.

My recommendation would be to review both the algorithm and the implementation for robust numerical properties. It might be instructive to follow the computation step by step in both target precision and a higher precision reference.

The main source of numerical issues tends to be subtractive cancellation, sometimes called catastrophic cancellation, a term I dislike because the effect is not always catastrophic, but degrades the accuracy of computation to varying degrees.

I already mentioned the difference-of-products idiom. Hand-code FMAs where you want them, using the fma() or fmaf() functions as appropriate. CPUs produced in the last ten years generally support FMA in hardware. I do not know how glm::dot is implemented, but it should be based on FMAs. I would claim that at this time, programmers should always think of arranging numerical computation in terms of FMAs; it is the most useful building block for accurate computation since IBM introduced it in the 1990s.
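
For illustration, a dot product arranged around FMAs could look like this (a sketch, not glm’s actual implementation):

// Hypothetical 3D dot product written as a chain of FMAs.
__device__ __host__ float DotFma(const glm::vec3& a, const glm::vec3& b) {
    return fmaf(a.z, b.z, fmaf(a.y, b.y, a.x * b.x));
}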

Note that functions like hypot() and norm3d(), which are typically used to compute vector length, are generally more accurate than what a naive manual implementation via sqrt() would achieve.


I honestly do not know why the sqrtl method works. If anyone is wondering, I only used it in this method:

__device__ __host__ float AccurateLength(const glm::vec3 v) {
	return sqrtl(v.x * v.x + v.y * v.y + v.z * v.z);
}

Curiously, this method actually improves the performance of my code by around 30%, but I’ll also check out the hypot() and norm3d() implementations and see how that changes things.
Edit: Added the Z component into the AccurateLength code example. Big thanks to @njuffa.

On some platforms long double is treated as double (this is allowed by the C++ standard), and one might hypothesize that sqrtl(x) would then be treated the same as sqrt((double)x). I am surprised that sqrtl() is accepted in device code, though; that is news to me.

If my hypothesis is correct, that would mean that by using sqrtl() you wind up with a very accurate computation of the length, but this will be slow because it uses a double-precision square root. I have no idea why you would observe improved performance.
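
Under that hypothesis, your AccurateLength would behave like this explicit version (a sketch for testing the hypothesis, not a recommendation):

// Sum of squares still evaluated in float, as before; only the square root is
// carried out in double precision.
__device__ __host__ float LengthViaDoubleSqrt(const glm::vec3 v) {
    float s = v.x * v.x + v.y * v.y + v.z * v.z;
    return (float)sqrt((double)s);
}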

Using hypotf(v.x, v.y) should be similarly accurate (maximum error of about 1 ulp), is also robust (avoids underflow and overflow in intermediate computation), and very fast.

BTW, is the above your actual code? Argument v is of type glm::vec3, so shouldn’t the length computation incorporate v.x, v.y, and v.z, rather than just v.x and v.y? In other words, it seems that for a vec3 you would want norm3df (v.x, v.y, v.z) in CUDA. Note that norm3df() does not exist in standard C++, but I think C++17 added an overload for hypot() that allows three arguments.
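
Along those lines, a device/host variant of your AccurateLength might look like this (a sketch, assuming a C++17 host compiler for the three-argument hypot):

#include <cmath>

__device__ __host__ float AccurateLength(const glm::vec3 v) {
#ifdef __CUDA_ARCH__
    return norm3df(v.x, v.y, v.z);    // CUDA device intrinsic: accurate and robust
#else
    return std::hypot(v.x, v.y, v.z); // C++17 three-argument hypot on the host
#endif
}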

I’ve tried this, but the performance is, for some reason, really poor (around 540 ms vs 7 ms); the results appear to be the same or very similar.

No, my actual version does have the z component included in it; I must’ve missed it when copying it over to my browser. Good catch (I’ll edit it in case someone else stumbles upon it)!

We are talking about timing a release build for the GPU here, correct? If so, I do not have a hypothesis that would explain your observation. You may want to examine the generated machine code (cuobjdump --dump-sass) and use the CUDA profiler to do some digging.

Correct.

I’ll check the options you’ve suggested, but I won’t be able to do it immediately, since I currently don’t have the time.

Regards