I have the following stencil-type kernel that I am running on a Tesla 2070. I have a (dim)x(dim) grid of integers, and for every element I need to sum its 8 nearest neighbors and write the result back to global memory. The grid has periodic boundary conditions, so I actually store a (dim+2)x(dim+2) grid to account for this. I allocate the grid linearly and have found the best performance with a 16x16 block size. While profiling I get a maximum of ~82 GB/s combined global read+write throughput. I thought using shared memory would speed things up, but the second kernel below was slightly slower than the first one. Since this is a memory-bound problem, I was hoping to get closer to 100-120 GB/s. Are there any obvious improvements anyone can see that would help, or is this kind of performance about what I should expect? Thanks!

```
// 8-neighbor stencil, global-memory version: each thread handles one cell of
// a linearly stored grid whose rows are (dim+2) ints wide.
//   dim     - side length of the real grid; the stored grid is (dim+2)x(dim+2)
//   grid    - input cells, read directly through the pointer
//   newGrid - output cells, written at the same linear index as the input
// This snippet is abbreviated: part of the body is elided below.
__global__ void GOL(int dim, int *grid, int *newGrid)
{
// The +1 skips row/column 0, so thread (0,0) of block (0,0) maps to cell (1,1).
int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;
int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;
// Row-major linear index; the row stride is dim+2.
int id = iy * (dim+2) + ix;
int sum;
// Only cells with row and column in 1..dim are summed.
if (iy <= dim && ix <= dim) {
// Offsets of +/-(dim+2) step one row; +/-(dim+1) and +/-(dim+3) are the
// row step combined with a one-column step, i.e. the four diagonals.
sum = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower
+ grid[id+1] + grid[id-1] //right left
+ grid[id+(dim+3)] + grid[id-(dim+3)] + grid[id-(dim+1)] + grid[id+(dim+1)]; //diagonals
}
// Elided by the author; not valid code as written.
...very light calculations in shared mem
// NOTE(review): s_grid, i and j are not declared in this snippet; presumably
// they belong to the elided part - confirm.
// NOTE(review): this store sits outside the bounds check above, so threads
// with ix > dim or iy > dim also execute it - confirm that is intended.
newGrid[id] = s_grid[i][j];
}
```

```
// 8-neighbor stencil, shared-memory version: each block copies a
// BLOCK_SIZE x BLOCK_SIZE tile of the grid into shared memory, then threads
// that are not on the tile edge sum their eight neighbors from that tile.
//   dim     - side length of the real grid; rows are (dim+2) ints wide
//   grid    - input cells
//   newGrid - output cells, written at the same linear index as the input
// BLOCK_SIZE is defined outside this snippet.
// NOTE(review): the indexing assumes blockDim.x == blockDim.y == BLOCK_SIZE - confirm at the launch site.
// This snippet is abbreviated: part of the body is elided below.
__global__ void GOL(int dim, int *grid, int *newGrid)
{
// Blocks advance by blockDim-2 cells, so consecutive tiles overlap by two
// rows/columns. There is no +1 offset here: thread (0,0) of block (0,0)
// maps to cell (0,0).
int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;
int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;
// Row-major linear index; the row stride is dim+2.
int id = iy * (dim+2) + ix;
// Position of this thread inside the shared tile.
int i = threadIdx.y;
int j = threadIdx.x;
// NOTE(review): numNeighbors is never used in the visible code, while `sum`
// below is assigned without a visible declaration - confirm which is intended.
int numNeighbors;
__shared__ int s_grid[BLOCK_SIZE][BLOCK_SIZE];
// Each thread whose cell lies inside the (dim+2)x(dim+2) storage copies it
// into the tile; other threads leave their tile entry unwritten.
if (ix <= dim+1 && iy <= dim+1)
s_grid[threadIdx.y][threadIdx.x] = grid[id];
// Barrier is outside the conditional, so every thread in the block reaches it
// before any thread reads a neighbor's tile entry.
__syncthreads();
// Sum only for cells inside rows/columns <= dim ...
if (iy <= dim && ix <= dim) {
// ... and only for threads not on the tile edge, whose eight neighbors all
// lie inside s_grid.
if(i != 0 && i !=BLOCK_SIZE-1 && j != 0 && j !=BLOCK_SIZE-1) {
sum = s_grid[i+1][j] + s_grid[i-1][j] //upper lower
+ s_grid[i][j+1] + s_grid[i][j-1] //right left
+ s_grid[i+1][j+1] + s_grid[i-1][j-1] + s_grid[i-1][j+1] + s_grid[i+1][j-1]; //diagonals
}
}
// Elided by the author; not valid code as written.
...very light calculations in shared mem
// NOTE(review): this store is outside every guard above. Tile-edge threads
// and threads past the grid edge execute it too: the former target cells that
// an overlapping tile also writes, and the latter may index outside the
// allocation or read a tile entry that was never loaded - confirm against the
// elided code.
newGrid[id] = s_grid[i][j];
}
```