Okay, here’s a kernel I wrote :

```
struct tetrahedron
{
int v[4];
__host__ __device__
tetrahedron(void)
{
v[0] = v[1] = v[2] = v[3] = -1;
};
__host__ __device__
tetrahedron(const int a,
const int b,
const int c,
const int d)
{
v[0] = a;
v[1] = b;
v[2] = c;
v[3] = d;
};
__host__ __device__
void print(void) const
{
printf("%d, %d, %d, %d\n", v[0], v[1], v[2], v[3]);
}
};
__device__ __host__
unsigned long reverse(unsigned x)
{
const int prime_factor = 17;
const int prime_offset = 3;
unsigned long x_copy = x * prime_factor + prime_offset;
assert(x_copy <= UINT_MAX);
x = x_copy;
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
return((x >> 16) | (x << 16));
}
__global__
void hash_faces(const int num_tet,
const int* __restrict__ tetra_index,
const tetrahedron* __restrict__ mesh,
unsigned long* __restrict__ hf,
int* __restrict__ ht,
int* __restrict__ po)
{
const int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
for (int tid = thread_id; tid < num_tet; tid += grid_size)
{
// load in tetrahedron
const int tet_id = __ldg(tetra_index + tid);
const tetrahedron t = mesh[tet_id];
// load in vertices
/*const int v0 = __ldg(&t.v[0]);
const int v1 = __ldg(&t.v[1]);
const int v2 = __ldg(&t.v[2]);
const int v3 = __ldg(&t.v[3]);*/
const int v0 = t.v[0];
const int v1 = t.v[1];
const int v2 = t.v[2];
const int v3 = t.v[3];
// hash faces
unsigned long hash[4] = { 0 };
hash[0] = reverse(v3) + reverse(v2) + reverse(v1);
hash[1] = reverse(v0) + reverse(v2) + reverse(v3);
hash[2] = reverse(v0) + reverse(v3) + reverse(v1);
hash[3] = reverse(v0) + reverse(v1) + reverse(v2);
// write back
const int offset = 4 * tid; // 4 faces per tetrahedron
// hashed faces
hf[offset + 0] = hash[0];
hf[offset + 1] = hash[1];
hf[offset + 2] = hash[2];
hf[offset + 3] = hash[3];
// index of hashed tetrahedra
ht[offset + 0] =
ht[offset + 1] =
ht[offset + 2] =
ht[offset + 3] = tet_id;
// index of points opposite to each hashed face
po[offset + 0] = 0;
po[offset + 1] = 1;
po[offset + 2] = 2;
po[offset + 3] = 3;
}
}
```

Mesh accesses are accurate. I did a host-side print and it worked fine. Writes seem accurate.

Basically, my code only crashes if I load in v0 through v3 with ldg instead of just a normal dereference. Am I doing this wrong? The compiler didn’t give me any syntax errors so I’m imagining that I’m using __ldg() was it was intended.

Or am I overloading my cache which subsequently creates lines being tossed out?