optimization question

I have the following stencil-type kernel I am running on a Tesla 2070. I have a (dim)x(dim) grid of integers, and for every element I need to sum up its 8 nearest neighbors and write the result back to global memory. The grid has periodic boundary conditions, so I actually allocate a (dim+2)x(dim+2) grid (with a halo ring) to account for this. I allocate the grid linearly and have found the best performance with a 16x16 block size. While profiling I can reach at most ~82 GB/s combined global read+write throughput. I thought using shared memory would speed things up, but the second kernel below was slightly slower than the first one. Since this is a memory-bound problem I was hoping to get closer to 100-120 GB/s. Is there any obvious improvement anyone can see that would help, or is this kind of performance about what I should expect? Thanks!

// 8-neighbor stencil ("Game of Life" style), global-memory-only version.
// grid/newGrid are (dim+2)x(dim+2) row-major arrays; the outer ring of
// cells is the periodic-boundary halo, so interior cells occupy indices
// 1..dim in each dimension. Launched on a 2D grid of 2D blocks (16x16).
__global__ void GOL(int dim, int *grid, int *newGrid)

{

    // +1 skips the halo ring: thread (0,0) maps to interior cell (1,1).
    int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;

    int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;

    // Flat row-major index; row stride of the padded array is (dim+2).
    int id = iy * (dim+2) + ix;

int sum;

if (iy <= dim && ix <= dim) {

        // +/-(dim+2) is one full padded row: the cells directly above/below.
        sum = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower

                      + grid[id+1] + grid[id-1] //right left

                      // (dim+3) = (dim+2)+1 and (dim+1) = (dim+2)-1,
                      // i.e. the four diagonal neighbors.
                      + grid[id+(dim+3)] + grid[id-(dim+3)] + grid[id-(dim+1)] + grid[id+(dim+1)]; //diagonals

    }

// NOTE(review): the body was elided in the post. As written, `s_grid`, `i`
// and `j` are not declared anywhere in this kernel, so the line below is
// pseudocode; presumably the elided code declares a shared tile and the
// write should also be guarded by the (iy <= dim && ix <= dim) check.
...very light calculations in shared mem

newGrid[id] = s_grid[i][j];

}
// Shared-memory version of the same stencil. Each BLOCK_SIZE x BLOCK_SIZE
// block loads a full tile (including a one-cell halo) from the padded
// (dim+2)x(dim+2) grid; because the halo is loaded by the block's edge
// threads, the effective output stride per block is (blockDim - 2), so
// neighboring blocks overlap by two rows/columns.
__global__ void GOL(int dim, int *grid, int *newGrid)

{ 

       // No +1 here: edge threads intentionally land on the halo cells.
       int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;

       int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;

       int id = iy * (dim+2) + ix;

int i = threadIdx.y;

       int j = threadIdx.x;

       // NOTE(review): declared but never used below — the sum is written to
       // an undeclared `sum` instead; looks like a rename leftover in the post.
       int numNeighbors;

__shared__ int s_grid[BLOCK_SIZE][BLOCK_SIZE];

// Stage one tile (interior + halo) into shared memory.
if (ix <= dim+1 && iy <= dim+1)

          s_grid[threadIdx.y][threadIdx.x] = grid[id];

// Barrier is correctly outside the divergent branch: every thread in the
// block reaches it before any thread reads the tile.
__syncthreads();

if (iy <= dim && ix <= dim) {

          // Only interior threads of the tile compute, so i+/-1 and j+/-1
          // stay within [0, BLOCK_SIZE-1] — no out-of-bounds tile access.
          if(i != 0 && i !=BLOCK_SIZE-1 && j != 0 && j !=BLOCK_SIZE-1) {

sum = s_grid[i+1][j] + s_grid[i-1][j] //upper lower

                     + s_grid[i][j+1] + s_grid[i][j-1] //right left

                     + s_grid[i+1][j+1] + s_grid[i-1][j-1] + s_grid[i-1][j+1] + s_grid[i+1][j-1]; //diagonals

       }

    }

// NOTE(review): elided in the post. As written the store below is
// unguarded, so halo/out-of-range threads would also write — presumably
// the elided code restricts it to interior threads.
...very light calculations in shared mem

newGrid[id] = s_grid[i][j];

}

Try the following version:

// Suggested variant: same halo-tile scheme as above, but the interior-thread
// guard is moved from the neighbor-sum computation down to the global store,
// so all in-range threads compute a sum and only interior tile threads write.
__global__ void GOL(int dim, int *grid, int *newGrid){ 

    // Effective per-block output stride is (blockDim - 2); edge threads of
    // each block load the halo cells of the tile.
    int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;   

    int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;       

    int id = iy * (dim+2) + ix;       

    int i = threadIdx.y;       

    int j = threadIdx.x;       

    // NOTE(review): unused — the code below writes to an undeclared `sum`
    // instead (rename leftover carried over from the question's code).
    int numNeighbors;       

    __shared__ int s_grid[BLOCK_SIZE][BLOCK_SIZE];      

    // Stage the tile (interior + halo) into shared memory.
    if (ix <= dim+1 && iy <= dim+1)          

        s_grid[threadIdx.y][threadIdx.x] = grid[id];        

    // All threads reach this barrier before any tile reads — correct.
    __syncthreads();      

    if (iy <= dim && ix <= dim) {          

// NOTE(review): with no interior-thread guard here, threads with i == 0 or
// j == 0 index s_grid[-1][..]/s_grid[..][-1], and threads at BLOCK_SIZE-1
// index row/column BLOCK_SIZE — both out of bounds for the tile as declared.
// That is what the note after this kernel (resize the shared array and
// shift the indices) is meant to fix.
sum = s_grid[i+1][j] + s_grid[i-1][j] //upper lower                     

                + s_grid[i][j+1] + s_grid[i][j-1] //right left                    

                + s_grid[i+1][j+1] + s_grid[i-1][j-1] + s_grid[i-1][j+1] + s_grid[i+1][j-1]; //diagonals      \

}    

    ...very light calculations in shared mem

    // Only interior tile threads publish a result; halo threads exist solely
    // to load their cell into shared memory.
    if(i != 0 && i !=BLOCK_SIZE-1 && j != 0 && j !=BLOCK_SIZE-1) {      

        newGrid[id] = s_grid[i][j];

    }

}

But remember to adjust the size of the shared-memory array and the corresponding indices in the computation of `sum`: with the interior guard removed, threads at the tile edges read one row/column past the array bounds in both directions. Enlarging the array to `[BLOCK_SIZE+1][BLOCK_SIZE+1]` only covers the overflow at `i+1`/`j+1`; the reads at `i-1`/`j-1` still underflow when `i == 0` or `j == 0`, so you must also shift the store to `s_grid[threadIdx.y+1][threadIdx.x+1]` (making the array `[BLOCK_SIZE+2][BLOCK_SIZE+2]`) and offset the neighbor indices to match.