I have a kernel as follow: (KDCONSTRUCT_THREADS == 128)

I need to be 1.0 capable (no atomic add on shared memory ;((

Is this code optimal ? or is there any way to make it better ?

BTW. There is a guarantee that number of triangles that is inside the box is < TRI_CACHE_SIZE.

```
__global__ void KDSomething(const unsigned int FirstTrianglesIndex, const unsigned int LastTrianglesIndex, const float3 bbmin, const float3 bbmax)
{
__shared__ unsigned int TrianglesCache[TRI_CACHE_SIZE];
__shared__ unsigned int TrinaglesCacheCnt;
if (threadIdx.x == 0)
Â Â TrinaglesCacheCnt = 0;
__syncthreads();
for (unsigned int i = FirstTrianglesIndex; i < LastTrianglesIndex; i+=KDCONSTRUCT_THREADS)
Â {
Â Â const unsigned int idx Â = i + threadIdx.x;
Â Â Â Â Â unsigned int HIdx = 0xFFFFFFFF;
Â Â if (idx < gNumberOfTriangles)
Â Â {
Â Â float4 tmin = tex1Dfetch(PolygonsMinMax, (idx<<1)+0);
Â Â float4 tmax = tex1Dfetch(PolygonsMinMax, (idx<<1)+1);
Â Â if (tmax.x >= bbmin.x && tmax.y >= bbmin.y && tmax.z >= bbmin.z &&
Â Â Â tmin.x <= bbmax.x && tmin.y <= bbmax.y && tmin.z <= bbmax.z Â )
Â Â {
Â Â Â HIdx = idx;
Â Â }
Â Â }
Â Â Â Â Â Â Â Â Â Â Â Â //
Â Â Â Â Â Â Â Â Â Â Â Â // is there any construct that's better replaces this:
Â Â Â Â Â Â Â Â Â Â Â Â //
Â Â Â Â Â Â Â Â Â Â Â Â // if (HIdx != 0xFFFFFFFF) TrianglesCache[atomicAdd(&TrinaglesCacheCnt, 1)] = HIdx;
Â Â Â Â Â Â Â Â Â Â Â Â // ??
Â Â __syncthreads();
Â Â for (int k = 0; k < KDCONSTRUCT_THREADS; k++)
Â Â {
Â Â if (k == threadIdx.x && HIdx != 0xFFFFFFFF)
Â Â Â TrianglesCache[TrinaglesCacheCnt++] = HIdx;
Â Â __syncthreads();
Â Â }
Â }
//
// Rest of kernel that uses indices from TrianglesCache follows for other calculations follows
//
}
```