On my Titan V, launching 512 blocks of 128 threads (with `count` equal to the total number of threads, i.e. 65,536), this code takes about 480 ms. With 1024 blocks of 128 threads (`count` = 131,072) it takes about 2880 ms — that's 6 times longer!

Any ideas why it scales so poorly?

```
/// Updates the 8 candidate slots owned by each thread's point.
///
/// Thread `i` (global index blockIdx.x * blockDim.x + threadIdx.x) scans every
/// point `j` in [0, count). For each `j` it walks its 8 slots n[i*8 .. i*8+7]
/// and overwrites the first slot whose currently stored point is farther from
/// point `i` (squared Euclidean distance) than point `j` is.
///
/// Cost: every thread performs `count` outer iterations with up to 8 dependent
/// loads each, so a launch with one thread per point does count * count outer
/// iterations in total.
///
/// @param p0    Point array; read at index `i`, at every `j` in [0, count),
///              and at every index currently stored in `n`.
/// @param n     Slot array, 8 ints per thread. Each slot is used as an index
///              into `p0` before it is written, so it must already hold a
///              valid index on entry.
/// @param count Number of points scanned. NOTE(review): the guard below
///              assumes `p0` holds `count` points and `n` holds 8 * count
///              slots — confirm against the launch site.
__global__ void ComputeClosest(float2* p0, int* n, int count){
    const auto i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data (grid rounded up beyond `count`) must
    // not touch p0[i] or n[i*8 + k].
    if(count <= 0 || i >= static_cast<unsigned int>(count)){
        return;
    }

    // p0 is never written by this kernel, so this thread's own point is
    // loop-invariant: load it once instead of on every inner iteration.
    const float2 self = p0[i];

    // Base of the 8 slots owned by this thread.
    int* const slots = n + i * 8;

    for(auto j = 0; j < count; ++j){
        const float2 candidate = p0[j];
        const auto cdx = candidate.x - self.x;
        const auto cdy = candidate.y - self.y;
        const auto ds = cdx * cdx + cdy * cdy;
        #pragma unroll
        for(auto k = 0; k < 8; ++k){
            // The slot is re-read on every pass because an earlier `j` may
            // have overwritten it.
            const float2 stored = p0[slots[k]];
            const auto sdx = stored.x - self.x;
            const auto sdy = stored.y - self.y;
            if(ds < sdx * sdx + sdy * sdy){
                // Replace only the first farther slot, then go to the next j.
                slots[k] = j;
                break;
            }
        }
    }
}
```