Hi everyone, I have a function that computes the exact band distance field of a triangle mesh, and I have attempted to port it to CUDA. The port runs, but it produces slightly different results from the CPU version, which in my case is undesirable.
The original CPU code looks something like this:
```cpp
Array3D<float> Phi;
glm::ivec3 t;
float invDX = 1.0f / DX;
for (size_t tIdx = 0; tIdx < MeshTriangleCount; tIdx++) {
    t = MeshTriangles[tIdx];
    glm::vec3 p = MeshVertices[t.x];
    glm::vec3 q = MeshVertices[t.y];
    glm::vec3 r = MeshVertices[t.z];

    // Triangle vertices in grid-index space.
    float fip = p.x * invDX;
    float fjp = p.y * invDX;
    float fkp = p.z * invDX;
    float fiq = q.x * invDX;
    float fjq = q.y * invDX;
    float fkq = q.z * invDX;
    float fir = r.x * invDX;
    float fjr = r.y * invDX;
    float fkr = r.z * invDX;

    // Bounding box of the triangle, expanded by bandWidth and clamped to the grid.
    int i0 = glm::clamp(int(fmin(fip, fmin(fiq, fir))) - bandWidth, 0, size.x - 1);
    int j0 = glm::clamp(int(fmin(fjp, fmin(fjq, fjr))) - bandWidth, 0, size.y - 1);
    int k0 = glm::clamp(int(fmin(fkp, fmin(fkq, fkr))) - bandWidth, 0, size.z - 1);
    int i1 = glm::clamp(int(fmax(fip, fmax(fiq, fir))) + bandWidth + 1, 0, size.x - 1);
    int j1 = glm::clamp(int(fmax(fjp, fmax(fjq, fjr))) + bandWidth + 1, 0, size.y - 1);
    int k1 = glm::clamp(int(fmax(fkp, fmax(fkq, fkr))) + bandWidth + 1, 0, size.z - 1);

    // Keep the minimum distance over all triangles touching each cell.
    for (int k = k0; k <= k1; k++) {
        for (int j = j0; j <= j1; j++) {
            for (int i = i0; i <= i1; i++) {
                glm::vec3 gPos = GridIndexToPosition(i, j, k, DX);
                float d = PointToTriangleDistance(gPos, p, q, r);
                if (d < Phi(i, j, k)) {
                    Phi.Set(i, j, k, d);
                }
            }
        }
    }
}
```
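For context, Phi is filled with a large upper-bound distance before this loop, and the same initial field is used on the GPU path. Roughly like this, where `Fill` is just a stand-in for however my Array3D initializes itself, and the bound itself is arbitrary:

```cpp
// Every cell starts above any distance the loop can produce, so the first
// computed distance always wins the min-comparison.
Phi.Fill((size.x + size.y + size.z) * DX);
```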
And this is the new GPU version:
```cpp
__device__ Array3D<float> Phi;

static __global__ void ComputeExactBandDistanceFieldKernel(int bandWidth, float DX, glm::ivec3 size, const glm::vec3* vertices, int vertexCount, const glm::ivec3* triangles, int triangleCount) {
    // One thread per triangle; guard against the rounded-up launch size.
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= triangleCount) {
        return;
    }

    float invDX = 1.0f / DX;
    glm::ivec3 t = triangles[index];
    glm::vec3 p = vertices[t.x];
    glm::vec3 q = vertices[t.y];
    glm::vec3 r = vertices[t.z];

    // Triangle vertices in grid-index space.
    float fip = p.x * invDX;
    float fjp = p.y * invDX;
    float fkp = p.z * invDX;
    float fiq = q.x * invDX;
    float fjq = q.y * invDX;
    float fkq = q.z * invDX;
    float fir = r.x * invDX;
    float fjr = r.y * invDX;
    float fkr = r.z * invDX;

    // Bounding box of the triangle, expanded by bandWidth and clamped to the grid.
    int i0 = glm::clamp(int(min(fip, min(fiq, fir))) - bandWidth, 0, size.x - 1);
    int j0 = glm::clamp(int(min(fjp, min(fjq, fjr))) - bandWidth, 0, size.y - 1);
    int k0 = glm::clamp(int(min(fkp, min(fkq, fkr))) - bandWidth, 0, size.z - 1);
    int i1 = glm::clamp(int(max(fip, max(fiq, fir))) + bandWidth + 1, 0, size.x - 1);
    int j1 = glm::clamp(int(max(fjp, max(fjq, fjr))) + bandWidth + 1, 0, size.y - 1);
    int k1 = glm::clamp(int(max(fkp, max(fkq, fkr))) + bandWidth + 1, 0, size.z - 1);

    // Same min-update over the banded cells as in the CPU loop.
    for (int k = k0; k <= k1; k++) {
        for (int j = j0; j <= j1; j++) {
            for (int i = i0; i <= i1; i++) {
                glm::vec3 gPos = GridIndexToPosition(i, j, k, DX);
                float d = PointToTriangleDistance(gPos, p, q, r);
                if (d < Phi(i, j, k)) {
                    Phi.Set(i, j, k, d);
                }
            }
        }
    }
}
```
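For completeness, I launch one thread per triangle, roughly like this; the block size of 256 and the `d_*` buffer names are just placeholders for my actual setup:

```cpp
// The Array3D header (pointer + dimensions) was copied into the __device__
// Phi symbol beforehand, e.g. via cudaMemcpyToSymbol (setup not shown).
// Round the block count up so every triangle gets a thread; the kernel's
// early return guards the excess threads.
int blockSize = 256;
int numBlocks = (triangleCount + blockSize - 1) / blockSize;
ComputeExactBandDistanceFieldKernel<<<numBlocks, blockSize>>>(
    bandWidth, DX, size, d_vertices, vertexCount, d_triangles, triangleCount);
cudaDeviceSynchronize();
```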
And these are the helper functions used by both versions:
```cpp
// Forward declaration: PointToTriangleDistance calls this below.
__device__ __host__ float PointToSegmentDistance(const glm::vec3& x0, const glm::vec3& x1, const glm::vec3& x2);

static __device__ __host__ glm::vec3 GridIndexToPosition(int i, int j, int k, float dx) {
    return { i * dx, j * dx, k * dx };
}

__device__ __host__ float PointToTriangleDistance(const glm::vec3& x0, const glm::vec3& x1, const glm::vec3& x2, const glm::vec3& x3) {
    // Barycentric coordinates of x0 projected onto the triangle's plane.
    glm::vec3 x13 = x1 - x3;
    glm::vec3 x23 = x2 - x3;
    glm::vec3 x03 = x0 - x3;
    float m13 = glm::length2(x13);
    float m23 = glm::length2(x23);
    float d = glm::dot(x13, x23);
    float invdet = 1.0f / fmax(m13 * m23 - d * d, 1e-30f);
    float a = glm::dot(x13, x03);
    float b = glm::dot(x23, x03);
    float w23 = invdet * (m23 * a - d * b);
    float w31 = invdet * (m13 * b - d * a);
    float w12 = 1.0f - w23 - w31;
    if (w23 >= 0 && w31 >= 0 && w12 >= 0) {
        // The projection lies inside the triangle.
        return glm::length(x0 - (w23 * x1 + w31 * x2 + w12 * x3));
    }
    else {
        // The projection lies outside; the closest point is on an edge.
        if (w23 > 0) {
            // this rules out edge 2-3
            float d1 = PointToSegmentDistance(x0, x1, x2);
            float d2 = PointToSegmentDistance(x0, x1, x3);
            return fmin(d1, d2);
        }
        else if (w31 > 0) {
            // this rules out edge 1-3
            float d1 = PointToSegmentDistance(x0, x1, x2);
            float d2 = PointToSegmentDistance(x0, x2, x3);
            return fmin(d1, d2);
        }
        else {
            // w12 must be > 0, ruling out edge 1-2
            float d1 = PointToSegmentDistance(x0, x1, x3);
            float d2 = PointToSegmentDistance(x0, x2, x3);
            return fmin(d1, d2);
        }
    }
}

__device__ __host__ float PointToSegmentDistance(const glm::vec3& x0, const glm::vec3& x1, const glm::vec3& x2) {
    glm::vec3 dx = x2 - x1;
    float m2 = glm::length2(dx);
    // Parameter of the projection of x0 onto the segment, clamped to [0, 1].
    float s12 = glm::dot(x2 - x0, dx) / m2;
    if (s12 < 0) {
        s12 = 0;
    }
    else if (s12 > 1) {
        s12 = 1;
    }
    return glm::length(x0 - (s12 * x1 + (1.0f - s12) * x2));
}
```
When I feed the same input into both the CPU and the GPU version and then log the values in Phi, I get slightly different results that look something like this (I've sketched the logging loop right after the table):
| CPU      | GPU      | Same? |
|----------|----------|-------|
| 0        | 0        | TRUE  |
| 0.257427 | 0.257427 | TRUE  |
| 0.964125 | 0.964125 | TRUE  |
| 0.468518 | 0.468518 | TRUE  |
| 0.672586 | 0.672586 | TRUE  |
| 1.08635  | 1.08635  | TRUE  |
| 0.968339 | 0.968339 | TRUE  |
| 0.257427 | 1.08102  | FALSE |
| 1.37804  | 1.37804  | TRUE  |
| 0.463216 | 0.463216 | TRUE  |
| 0.67111  | 0.673154 | FALSE |
| 0.257574 | 0.257574 | TRUE  |
| 0.672819 | 0.672995 | FALSE |
| …        |          |       |
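The logging itself is just an element-wise diff after copying the device field back to the host, roughly like this; `PhiCPU` and `PhiGPU` are placeholders for the two host-side copies:

```cpp
// Compare the two fields cell by cell; the exact float comparison is
// intentional, since I want the GPU to reproduce the CPU values bit for bit.
for (int k = 0; k < size.z; k++) {
    for (int j = 0; j < size.y; j++) {
        for (int i = 0; i < size.x; i++) {
            float a = PhiCPU(i, j, k);
            float b = PhiGPU(i, j, k);
            printf("%g | %g | %s |\n", a, b, a == b ? "TRUE" : "FALSE");
        }
    }
}
```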
I'm wondering whether differences like this are expected between the CPU and CUDA (my GPU is a GTX 1070, compute capability 6.1). Note that I'm not passing the `--use_fast_math` flag to nvcc.
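For reference, I build with something like the following (the file name is just a placeholder); as far as I know `--fmad` defaults to true and I haven't changed it:

```sh
nvcc -arch=sm_61 -O2 -c DistanceField.cu
```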
Thanks in advance.