I’m learning CUDA C architecture for my course and this is my task.

Cluster is constructed by structure a simple structure.

```
struct Cluster {
float* x;
float* y;
float* z;
};
```

Two clusters are then filled with **N** points and their similarity is calculated on the CPU with

```
float CPU_result(Cluster A, Cluster B, int n) {
float difference = 0.0f;
for (int i = 0; i < n-1; i++) {
float tmp = 0.0f;
for (int j = i+1; j < n; j++) {
float diff_a = sqrt((A.x[i]-A.x[j])*(A.x[i]-A.x[j])
+ (A.y[i]-A.y[j])*(A.y[i]-A.y[j])
+ (A.z[i]-A.z[j])*(A.z[i]-A.z[j]));
float diff_b = sqrt((B.x[i]-B.x[j])*(B.x[i]-B.x[j])
+ (B.y[i]-B.y[j])*(B.y[i]-B.y[j])
+ (B.z[i]-B.z[j])*(B.z[i]-B.z[j]));
tmp += (diff_a-db) * (diff_a-diff_b);
}
diff += tmp;
}
return sqrt(1/((float)n*((float)n-1)) * difference);
}
```

My first (very poorly) written implemenation looks like this

```
__global__ void compute_gpu(ClusterA, ClusterB, int n ,float* result){
//determine unique index within grid, miximizing block usage
int i = threadIdx.y + blockIdx.y * blockDim.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n && j < n && j > i && i != j){
float tmp = 0.0f;
float diff_a = sqrt((A.x[i]-A.x[j])*(A.x[i]-A.x[j])
+ (A.y[i]-A.y[j])*(A.y[i]-A.y[j])
+ (A.z[i]-A.z[j])*(A.z[i]-A.z[j]));
float diff_b = sqrt((B.x[i]-B.x[j])*(B.x[i]-B.x[j])
+ (B.y[i]-B.y[j])*(B.y[i]-B.y[j])
+ (B.z[i]-B.z[j])*(B.z[i]-B.z[j]));
tmp = (diff_a-diff_b) * (diff_a-diff_b);
atomicAdd(result, tmp);
return ;
}
}
float GPU_result(Cluster A, Cluster B, int n) {
float h_result[n];
float *d_result;
//allocate space on GPU for d_result variable
cudaMalloc((void**)&d_result, sizeof(float));
//define grid and block sizes
dim3 block_size(16,16);
dim3 grid_size((n+15)/16, (n+15)/16);
//Launch kernel with pass by referennce attribute and N size
compute_gpu <<<grid_size, block_size>>>(A, B, n,d_result);
//copy data from GPU memory to CPU memory via PCIe bus
cudaMemcpy(h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
//use overall square root out of GPU kernel, to avoid race condition
*h_result = sqrt(1/((float)n*((float)n-1)) * (* h_result));
return *h_result;
}
```

My problem is, that **GPU** is computing rather presisely for **N** = 3000(with really bad performance), but as the number got higher, it surpasses accepted error of 1%.

I know this is not very bad performance-wise implementation and I *would welcome any performance advices*, but my primary goal now is get kernel computing precisely for much higher **N** (tens of thousands) values. Any advices what might be going wrong here?

Thanks in advance.