# optimization question

I have the following stencil-type kernel that I am running on a Tesla 2070. I have a (dim)x(dim) grid of integers, and for every element I need to sum up its 8 nearest neighbors and write the result back to global memory. The grid has periodic boundary conditions, so I actually have a (dim+2)x(dim+2) grid to account for this. I allocate the grid linearly and have found the best performance with a 16x16 block size. While profiling I can get a maximum of ~82 GB/s global read+write throughput. I thought using shared memory would speed things up, but the second kernel below was slightly slower than the first one. Since this is a memory-bound problem I was hoping to get closer to 100-120 GB/s. Are there any obvious improvements anyone can see that would help, or is this kind of performance about what I should expect? Thanks!

``````__global__ void GOL(int dim, int *grid, int *newGrid)
// Global-memory version: each thread handles one cell of a linearly stored grid
// whose rows are (dim+2) ints wide, and sums that cell's eight neighbours
// directly from `grid`.
// NOTE(review): this is an excerpt. `s_grid`, `i` and `j` used at the end are not
// declared in the visible code; presumably the elided section defines them.

{

// The +1 shifts every thread past row 0 / column 0 (presumably the ghost
// row/column that implements the periodic boundary; confirm with the host code).
int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;

int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;

// Flattened index: the row stride is dim+2, so ix is the contiguous direction.
int id = iy * (dim+2) + ix;

int sum;

// Only cells with 1 <= ix,iy <= dim are summed; threads beyond that leave `sum` unset.
if (iy <= dim && ix <= dim) {

// +/-(dim+2) moves one row, +/-1 moves one column, and +/-(dim+1) and +/-(dim+3)
// reach the four diagonal cells.
sum = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower

+ grid[id+1] + grid[id-1] //right left

+ grid[id+(dim+3)] + grid[id-(dim+3)] + grid[id-(dim+1)] + grid[id+(dim+1)]; //diagonals

}

...very light calculations in shared mem

// NOTE(review): this store is outside the bounds guard above, so threads with
// ix > dim or iy > dim also write newGrid[id]; confirm that this is intended and
// that id stays inside the allocation for the launch configuration used.
newGrid[id] = s_grid[i][j];

}
``````
``````__global__ void GOL(int dim, int *grid, int *newGrid)
// Shared-memory version: the neighbour sum is read from a per-block
// BLOCK_SIZE x BLOCK_SIZE tile (`s_grid`) instead of from global memory.
// NOTE(review): this is an excerpt. `i`, `j` and `sum` are not declared in the
// visible code (presumably i/j are threadIdx.y/threadIdx.x; confirm), and
// `numNeighbors` is declared but never used in the visible code.

{

// Blocks advance by (blockDim - 2) cells, so neighbouring blocks overlap by two
// rows/columns, and there is no +1 offset: threads start at row/column 0.
int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;

int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;

// Flattened index into the linear grid, whose rows are (dim+2) ints wide.
int id = iy * (dim+2) + ix;

int numNeighbors;

__shared__ int s_grid[BLOCK_SIZE][BLOCK_SIZE];

// NOTE(review): as written this `if` has no statement of its own, so the next
// `if` becomes its body. Presumably the load of grid[id] into s_grid (and a
// following __syncthreads()) was elided here. No write to s_grid and no barrier
// appear in the visible code; confirm against the real source.
if (ix <= dim+1 && iy <= dim+1)

if (iy <= dim && ix <= dim) {

// Only threads that are not on the outer ring of the tile compute a sum, which
// keeps the i+/-1 and j+/-1 reads inside s_grid (assuming 0 <= i,j < BLOCK_SIZE).
if(i != 0 && i !=BLOCK_SIZE-1 && j != 0 && j !=BLOCK_SIZE-1) {

sum = s_grid[i+1][j] + s_grid[i-1][j] //upper lower

+ s_grid[i][j+1] + s_grid[i][j-1] //right left

+ s_grid[i+1][j+1] + s_grid[i-1][j-1] + s_grid[i-1][j+1] + s_grid[i+1][j-1]; //diagonals

}

}

...very light calculations in shared mem

// NOTE(review): this store is executed by every thread, including the tile-edge
// threads that computed no sum and threads that failed the bounds tests above.
newGrid[id] = s_grid[i][j];

}
``````

Try the following one:

``````__global__ void GOL(int dim, int *grid, int *newGrid){
// Shared-memory variant in which every in-range thread computes the neighbour
// sum and only the threads that are not on the tile edge store a result.
// NOTE(review): this is an excerpt. `i`, `j` and `sum` are not declared in the
// visible code (presumably i/j are threadIdx.y/threadIdx.x; confirm), and
// `numNeighbors` is declared but never used in the visible code.

// Blocks advance by (blockDim - 2) cells, so neighbouring blocks overlap by two
// rows/columns.
int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;

int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;

// Flattened index into the linear grid, whose rows are (dim+2) ints wide.
int id = iy * (dim+2) + ix;

int numNeighbors;

__shared__ int s_grid[BLOCK_SIZE][BLOCK_SIZE];

// NOTE(review): as written this `if` has no statement of its own, so the next
// `if` becomes its body. Presumably the load of grid[id] into s_grid (and a
// following __syncthreads()) was elided here; confirm against the real source.
if (ix <= dim+1 && iy <= dim+1)

if (iy <= dim && ix <= dim) {

// There is no tile-edge test here: if i or j is 0 or BLOCK_SIZE-1, the i+/-1 and
// j+/-1 subscripts fall outside the declared [BLOCK_SIZE][BLOCK_SIZE] extent.
sum = s_grid[i+1][j] + s_grid[i-1][j] //upper lower

+ s_grid[i][j+1] + s_grid[i][j-1] //right left

+ s_grid[i+1][j+1] + s_grid[i-1][j-1] + s_grid[i-1][j+1] + s_grid[i+1][j-1]; //diagonals      \

}

...very light calculations in shared mem

// Only threads that are not on the outer ring of the tile write their result
// back to global memory.
if(i != 0 && i !=BLOCK_SIZE-1 && j != 0 && j !=BLOCK_SIZE-1) {

newGrid[id] = s_grid[i][j];

}

}
``````

But remember to adjust the size of the shared memory array to [BLOCK_SIZE+1][BLOCK_SIZE+1]

and the corresponding indices in the computation of sum.