This code on GTX 1050Ti gives ~48 fps (KernelSlow) and ~68 fps (KernelFast) [N = 15K]. On CUDA 9 I have

~25 and ~22 fps. On my GTX 1080 the problem is similar.

After compilation .exe file have size 1323 KB on CUDA 9 and ~330 KB on CUDA 8.

All settings in defaults.

```
//CUDA_Force_Calc
__global__ void KernelSlow(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N)
{
float AX_M, AY_M, AZ_M, det_X, det_Y, det_Z, den;
int first = threadIdx.x + blockIdx.x * blockDim.x;
AX_M = 0;
AY_M = 0;
AZ_M = 0;
for (int next = 0; next < N; next++) {
if ((first != next) && ((POSX[next] != POSX[first]) || (POSY[next] != POSY[first]) || (POSZ[next] != POSZ[first]))) {
det_X = POSX[next] - POSX[first];
det_Y = POSY[next] - POSY[first];
det_Z = POSZ[next] - POSZ[first];
den = POSM[next] / (30000 * pow((det_X*det_X + det_Y*det_Y + det_Z*det_Z), 3 / 2));
AX_M += det_X * den;
AY_M += det_Y * den;
AZ_M += det_Z * den;
}
}
POSR[first] += AX_M;
POSR[first + N] += AY_M;
POSR[first + N + N] += AZ_M;
POSX[first] += POSR[first];
POSY[first] += POSR[first + N];
POSZ[first] += POSR[first + N + N];
}
//CUDA_Tile_Calc
__global__ void KernelFast(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N)
{
__shared__ float LPOSX[blocksize], LPOSY[blocksize], LPOSZ[blocksize], LPOSM[blocksize];
float AX_M = 0, AY_M = 0, AZ_M = 0, den;
int ia = blockDim.x * blockIdx.x + threadIdx.x;
float POSX0 = POSX[ia], POSY0 = POSY[ia], POSZ0 = POSZ[ia], POSM0 = POSM[ia];
for (unsigned int ib = 0; ib < N; ib += blocksize) {
LPOSX[threadIdx.x] = POSX[ib + threadIdx.x];
LPOSY[threadIdx.x] = POSY[ib + threadIdx.x];
LPOSZ[threadIdx.x] = POSZ[ib + threadIdx.x];
LPOSM[threadIdx.x] = POSM[ib + threadIdx.x];
__syncthreads();
#pragma unroll
for (unsigned int ic = 0; ic < blocksize; ic++) {
if ((ic != ia) && (LPOSX[ic] != POSX0) || (LPOSY[ic] != POSY0) || (LPOSZ[ic] != POSZ0)){
float det_X = (LPOSX[ic] - POSX0);
float det_Y = (LPOSY[ic] - POSY0);
float det_Z = (LPOSZ[ic] - POSZ0);
den = LPOSM[ic] / (30000 * pow((det_X*det_X + det_Y*det_Y + det_Z*det_Z), 3 / 2));
AX_M += det_X * den;
AY_M += det_Y * den;
AZ_M += det_Z * den;
}
}
__syncthreads();
}
POSR[ia] += AX_M;
POSR[ia + N] += AY_M;
POSR[ia + N + N] += AZ_M;
POSX[ia] += POSR[ia];
POSY[ia] += POSR[ia + N];
POSZ[ia] += POSR[ia + N + N];
}
```

FULL Source (VS 2015): https://drive.google.com/file/d/1L2bx6p-LMHleH9JO-ILKeqlO1ByhUOL6/view

Thanks.