Hi everyone.
I’m implementing a function very similar to the nearest neighbour.
The difference to the common nearest neighbour or 3D range search, is that I’m computing the minimum distance of a point to a set of triangles (usually is a set of points).
Due to this, I cannot use the existing GPU implementations of nearest neighbour.
My problem, is that the code I’m using (http://www.geometrictools.com/Documentation/DistancePoint3Triangle3.pdf) has lots of branches, which results in a lot of path divergence. I believe that in a warp, only 10 threads follow the same path.
The excerpt below shows a C implementation of these branches. Does anyone knows a way to avoid them, or at least, to reduce the divergence? Basically I have a set of points, which I map in a one dimension grid divided in several blocks.
dim3 dimBlock(256);
dim3 dimGrid((totalPoints/dimBlock.x) + (totalPoints % dimBlock.x == 0?0:1));
The kernel uses this variables:
vector kDiff, kEdge0, kEdge1; //Each vector is a structure with 3 floats
vector m_kClosestPoint1;
float fA00, fA01, fA11, fB0, fB1, fC, fDet, fS, fT, fSqrDistance;
My graphic is an old Geforce 8500GT.
The code excerpt is the following:
[codebox] if (fS + fT <= fDet)
{
if (fS < 0.0f)
{
if (fT < 0.0f) // region 4
{
if (fB0 < 0.0f)
{
fT = 0.0f;
if (-fB0 >= fA00)
{
fS = 1.0f;
fSqrDistance = fA00 + 2.0f * fB0 + fC;
}
else
{
fS = -fB0/fA00;
fSqrDistance = fB0 * fS + fC;
}
}
else
{
fS = 0.0f;
if (fB1 >= 0.0f)
{
fT = 0.0f;
fSqrDistance = fC;
}
else if (-fB1 >= fA11)
{
fT = 1.0f;
fSqrDistance = fA11 + 2.0f * fB1 + fC;
}
else
{
fT = -fB1/fA11;
fSqrDistance = fB1 *fT + fC;
}
}
}
else // region 3
{
fS = 0.0f;
if (fB1 >= 0.0f)
{
fT = 0.0f;
fSqrDistance = fC;
}
else if (-fB1 >= fA11)
{
fT = 1.0f;
fSqrDistance = fA11 + 2.0f * fB1 + fC;
}
else
{
fT = -fB1/fA11;
fSqrDistance = fB1 * fT + fC;
}
}
}
else if (fT < 0.0f) // Region 5
{
fT = 0.0f;
if (fB0 >= 0.0f)
{
fS = 0.0f;
fSqrDistance = fC;
}
else if (-fB0 >= fA00)
{
fS = 1.0f;
fSqrDistance = fA00 + 2.0f * fB0 + fC;
}
else
{
fS = -fB0/fA00;
fSqrDistance = fB0 * fS + fC;
}
}
else // region 0
{
// minimum at interior point
float fInvDet = 1.0f/fDet;
fS *= fInvDet;
fT *= fInvDet;
fSqrDistance = fS*(fA00*fS+fA01*fT+2.0f*fB0) + fT*(fA01*fS+fA11*fT+2.0f*fB1)+fC;
}
}
else
{
float fTmp0, fTmp1, fNumer, fDenom;
if (fS < 0.0f) // region 2
{
fTmp0 = fA01 + fB0;
fTmp1 = fA11 + fB1;
if (fTmp1 > fTmp0)
{
fNumer = fTmp1 - fTmp0;
fDenom = fA00 - 2.0f * fA01 + fA11;
if (fNumer >= fDenom)
{
fS = 1.0f;
fT = 0.0f;
fSqrDistance = fA00 + 2.0f * fB0 + fC;
}
else
{
fS = fNumer/fDenom;
fT = 1.0f - fS;
fSqrDistance = fS * (fA00*fS+fA01*fT+2.0f*fB0) + fT*(fA01*fS+fA11*fT+2.0f*fB1)+fC;
}
}
else
{
fS = 0.0f;
if (fTmp1 <= 0.0f)
{
fT = 1.0f;
fSqrDistance = fA11 + 2.0f * fB1 + fC;
}
else if (fB1 >= 0.0f)
{
fT = 0.0f;
fSqrDistance = fC;
}
else
{
fT = -fB1/fA11;
fSqrDistance = fB1*fT+fC;
}
}
}
else if (fT < 0.0f) // region 6
{
fTmp0 = fA01 + fB1;
fTmp1 = fA00 + fB0;
if (fTmp1 > fTmp0)
{
fNumer = fTmp1 - fTmp0;
fDenom = fA00 - 2.0f * fA01 + fA11;
if (fNumer >= fDenom)
{
fT = 1.0f;
fS = 0.0f;
fSqrDistance = fA11 + 2.0f * fB1 + fC;
}
else
{
fT = fNumer/fDenom;
fS = 1.0f - fT;
fSqrDistance = fS*(fA00*fS+fA01*fT+2.0f*fB0) + fT*(fA01*fS+fA11*fT+2.0f*fB1)+fC;
}
}
else
{
fT = 0.0f;
if (fTmp1 <= 0.0f)
{
fS = 1.0f;
fSqrDistance = fA00 + 2.0f * fB0 + fC;
}
else if (fB0 >= 0.0f)
{
fS = 0.0f;
fSqrDistance = fC;
}
else
{
fS = -fB0/fA00;
fSqrDistance = fB0 * fS + fC;
}
}
}
else // region 1
{
fNumer = fA11 + fB1 - fA01 - fB0;
if (fNumer <= 0.0f)
{
fS = 0.0f;
fT = 1.0f;
fSqrDistance = fA11 + 2.0f * fB1 + fC;
}
else
{
fDenom = fA00 - 2.0f * fA01 + fA11;
if (fNumer >= fDenom)
{
fS = 1.0f;
fT = 0.0f;
fSqrDistance = fA00 + 2.0f * fB0 + fC;
}
else
{
fS = fNumer/fDenom;
fT = 1.0f - fS;
fSqrDistance = fS*(fA00*fS+fA01*fT+2.0f*fB0) + fT*(fA01*fS+fA11*fT+2.0f*fB1)+fC;
}
}
}
}
// account for numerical round-off error
if (fSqrDistance < 0.0f)
{
fSqrDistance = 0.0f;
}
[/codebox]
Thanks for your help.