Hey guys, I have some code that seems to be exhibiting behavior of a race condition…

Here are snippets of my code :

```
// Here's how we call the kernels in my host function...
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start );
is_degen<<< blocks, tpb >>>( points, mesh, dist, np, preds );
compare_min<<< blocks, tpb >>>( mesh, points, dist, np );
cudaDeviceSynchronize();
cudaEventRecord( stop );
cudaEventSynchronize( stop );
// These are the kernels. Basically, this first one isn't printing all the time.
__global__
void compare_min( tetra *tetras, point *points, float *dist, int n )
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
{
tetra *t = tetras + ( points + 4 + i )->host;
t->min_dist = IFloatFlip( t->min_dist );
union
{
float flt_dist;
unsigned uint_dist;
};
uint_dist = t->min_dist;
if ( dist[i] == flt_dist)
{
printf( "%d : %f\n", i, dist[i] );
}
}
}
__global__
void is_degen( point *points, tetra *mesh, float *dist, unsigned n, PredicateInfo *preds )
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
{
tetra *t = mesh + ( points + 4 + i )->host;
float *a = &( ( points + t->p[0] )->x );
float *b = &( ( points + t->p[1] )->x );
float *c = &( ( points + t->p[2] )->x );
float *d = &( ( points + t->p[3] )->x );
float *e = &( ( points + i + 4 )->x );
float u[3] = { b[0] - a[0], b[1] - a[1], b[2] - a[2] };
float v[3] = { c[0] - a[0], c[1] - a[1], c[2] - a[2] };
float w[3] = { d[0] - a[0], d[1] - a[1], d[2] - a[2] };
float A[9];
#pragma unroll
for ( int j = 0; j < 3; ++j )
{
int offset = 3 * j;
A[offset + 0] = u[j];
A[offset + 1] = v[j];
A[offset + 2] = w[j];
}
three_by_three_inverse( A );
float B[3] = { e[0] - a[0], e[1] - a[1], e[2] - a[2] };
float x[3] = { A[0] * B[0] + A[1] * B[1] + A[2] * B[2],
A[3] * B[0] + A[4] * B[1] + A[5] * B[2],
A[6] * B[0] + A[7] * B[1] + A[8] * B[2]
};
if ( x[0] > 0 && x[1] > 0 && x[2] > 0 &&
x[0] + x[1] + x[2] < 1 )
{
dist[i] = insphere( *preds, a, b, c, d, e);
}
else
{
dist[i] = FLT_MAX; // degenerate
}
atomicMin( &( t->min_dist ), FloatFlip( ( unsigned& ) dist[i] ) );
}
}
```

So what’s going on is, when I’m running my code on like a million points, the compare_min kernel will sometimes print the actual distance(s). Emphasis is on sometimes. Where in my code could there be a race condition? I even used cuda-memcheck --tool racecheck to no avail!