What I want to do is identify which triangles in a set of 2D triangles have a bounding box that overlaps the bounding box of a circle. A triangle whose box does not overlap is assigned -1 in the output; otherwise it is assigned its own index.

The triangles' vertices are stored in a thrust::device_vector; d_pTriXys is the raw pointer to that vector. Each triangle's three indices into the vertex array are stored in another device vector, and d_pTriIndices is the raw pointer to it. triNum is the number of triangles.

The code is built with Visual Studio C++ and it works, but performance is an issue: it takes about 3.5 seconds to check 2225 circles against 5.5 million triangles (2.8 million vertices), which is not as fast as I had hoped. Does anybody have any ideas for improving the algorithm's performance?

I tried changing the register count and threadsPerBlock; the current settings seem to be the best.

```
// RangeAlgo.h
// Plain 2D point with single-precision coordinates.
struct tagXy_f
{
    float x; // x coordinate
    float y; // y coordinate
};
// Bounding-box rejection test between a triangle and a circle.
//
// triVertices : the three corners of the triangle.
// circleCen   : centre of the circle.
// circleRad   : radius of the circle.
//
// Returns true when the circle centre lies at least circleRad outside the
// triangle's axis-aligned bounding box on the x or the y axis, i.e. the two
// boxes do not overlap. The comparisons are inclusive, so boxes that only
// touch along an edge are reported as separated.
CUDA_HOST_DEV __forceinline bool IsCircleAndTriangleBoxesSeparated(const tagXy_f triVertices[3],
    const tagXy_f &circleCen, float circleRad)
{
    const tagXy_f &a = triVertices[0];
    const tagXy_f &b = triVertices[1];
    const tagXy_f &c = triVertices[2];

    // Extents of the triangle's bounding box.
    const float xLo = min(a.x, min(b.x, c.x));
    const float xHi = max(a.x, max(b.x, c.x));
    const float yLo = min(a.y, min(b.y, c.y));
    const float yHi = max(a.y, max(b.y, c.y));

    // Separated as soon as the centre is a full radius beyond any one side.
    return circleCen.x <= xLo - circleRad
        || circleCen.x >= xHi + circleRad
        || circleCen.y <= yLo - circleRad
        || circleCen.y >= yHi + circleRad;
}
// Main.cu
// Marks which triangles pass the bounding-box overlap test against one circle.
//
// Each thread with a global x-index below triNum handles exactly one triangle:
// it loads the triangle's three vertex indices, fetches the three vertices,
// runs IsCircleAndTriangleBoxesSeparated and writes one output element.
// Only the x components of blockIdx/blockDim/threadIdx are used, so the launch
// is expected to be one-dimensional with at least triNum threads in total.
//
// d_pTriIndices      : per-triangle vertex indices (x, y, z index d_pTriXys).
// triNum             : number of triangles; threads beyond it do nothing.
// d_pTriXys          : vertex coordinates.
// d_circleCen        : circle centre.
// circleRad          : circle radius.
// d_pValidTriIndices : output, one int per triangle: -1 when the boxes are
//                      separated, otherwise the triangle's own index.
//
// NOTE(review): d_circleCen is a reference parameter, so the kernel receives an
// address and dereferences it on the device. Presumably the caller binds it to
// device-accessible memory; a plain host variable would not be a valid device
// address. Confirm against the call site, or consider passing it by value.
__global__ void MarkOutTrianglesKernel(const uint3 *d_pTriIndices, unsigned triNum,
    const float2 *d_pTriXys, const tagXy_f &d_circleCen, float circleRad,
    int *d_pValidTriIndices)
{
    const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= triNum)
        return; // tail threads of the last block have no triangle

    const uint3 triInds = d_pTriIndices[tid];

    // Load each vertex once into a local copy.
    const float2 v0 = d_pTriXys[triInds.x];
    const float2 v1 = d_pTriXys[triInds.y];
    const float2 v2 = d_pTriXys[triInds.z];

    // Build genuine tagXy_f objects rather than reinterpret_cast-ing a float2
    // array: the two structs are unrelated types, so reading one through a
    // pointer to the other is type punning the compiler need not honour.
    const tagXy_f triVertices[3] = { { v0.x, v0.y }, { v1.x, v1.y }, { v2.x, v2.y } };

    const bool separated = IsCircleAndTriangleBoxesSeparated(triVertices, d_circleCen, circleRad);
    d_pValidTriIndices[tid] = separated ? -1 : (int)tid;
}
// Sample driver: checks a single circle against every triangle with one
// kernel launch. The "..." lines stand for setup and teardown code that is
// elided in this listing.
int main()
{
    // here is the sample code to check one circle with all triangles
    uint3 *d_pTriIndices; // indices of all triangles
    unsigned triNum; // count of triangles
    const float2 *d_pTriXys; // all vertices of triangles
    // NOTE(review): the kernel takes the circle centre by reference and reads
    // it on the device, so this object presumably has to live in
    // device-accessible memory - confirm how it is set up in the elided code.
    tagXy_f d_circleCen;
    float circleRad;
    int *d_pValidTriIndices; // output: -1 or the triangle's own index
    ...
    const int threadsPerBlock = 256;
    // Ceiling division so the last, partially filled block is still launched.
    const int blocksPerGrid = (int)((triNum + threadsPerBlock - 1) / threadsPerBlock);
    MarkOutTrianglesKernel<<<blocksPerGrid, threadsPerBlock>>>(d_pTriIndices, triNum,
        d_pTriXys, d_circleCen, circleRad, d_pValidTriIndices);
    // A kernel launch returns nothing; an invalid launch configuration is only
    // visible through cudaGetLastError().
    if (cudaGetLastError() != cudaSuccess)
        return 1;
    ...
    return 0;
}
```