Sure. The problem ask for the k-nearest neighbors for a group of N sites in R^2 space. “data” is a pointer which stores N floats2 (x and y position), and res is a N*K array where we store the K nearest neighbors for each site. Each one stores a float2 since in x we store the distance from the actuallu site and in y the position into the array.

All values of sites are in the [0,1]^2 space.

K, BLOCK_X, BLOCK_Y are constants with values 300, 32 and 16 respectively. N is the number of sites.

```
float2 * runGPU(float2 * data, uint N, QTextStream& stream)
{
//timer
unsigned int timer = 0;
cutCreateTimer(&timer);
cutStartTimer(timer);
//array where we store the positions of the site
int dataSize=sizeof(float2)*N;
float2 * dataDevice;
cudaMalloc((void**)&dataDevice,dataSize); //allocate sizeof(float2)*N space
//copy from data (contains the site info) to GPU data
cudaMemcpy(dataDevice,data,dataSize,cudaMemcpyHostToDevice);
//The result is the K nearest sites for each site (N*K)
int resSize=sizeof(float2)*N*K;
float2 * resDevice;
cudaMalloc((void**)&resDevice,resSize);
dim3 dimBlock(BLOCK_X,BLOCK_Y);
//each thread computes the K nearest neighbors for one site
//The grid is defined as a 1 dimension array
dim3 dimGrid(N/(dimBlock.x*dimBlock.y));
kNearest<<<dimGrid,dimBlock>>>(dataDevice,resDevice,N);
float2 * GPURes = new float2[N*K];
cudaMemcpy(GPURes,resDevice,resSize,cudaMemcpyDeviceToHost);
cutStopTimer(timer);
stream<<"GPU total Time: "<<cutGetTimerValue(timer)/1000.<<"(s)"<<endl;
cutDeleteTimer(timer);
cudaFree(dataDevice);
cudaFree(resDevice);
return GPURes;
}
```

The kernel code:

```
__global__ void kNearest(float2 * data, float2 * res, float N)
{
int n=blockIdx.x*(blockDim.x*blockDim.y) + (threadIdx.y*blockDim.x + threadIdx.x);
//initialize all distances (x) to maximum value
for(uint k=0;k<K;k++)
res[(n*K)+k].x=1.5; // >sqrt(2.)
//loop for every site
for(uint j=0;j<N;j++)
{
if(n!=j) //do not compate with himself
{
float2 a = data[n];
float2 b = data[j];
//compute the distance between site n and j
float dist = sqrt(pow(a.x-b.x,2) + pow(a.y-b.y,2));
//look for the k position into the array (increassing order)
uint pos=0;
while(pos<K && dist>res[(n*K)+pos].x)
pos++;
//if is smaller than one value already stored in the array
if(pos<K)
{
//move all values to the rigth
for(uint w=K-1;w>pos;w--)
res[(n*K)+w] = res[(n*K)+(w-1)];
//finally store in the position the rigth value
res[(n*K)+pos].x = dist;
res[(n*K)+pos].y = j;
}
}
}
}
```

I assume that is not optimized for best performance but I just need that to work fine.

May be helps you to know that for small values of N it works fine and for great values of N returns all values to 0.

Thanks for your time.