Using shared memory in a generic 2D stencil

Hi, I am new to CUDA and would like to use shared memory in a generic 2D stencil. I have written the code below, which doesn't use this feature, and I would like to adapt it into a shared-memory version. Can anyone suggest where to start in order to build a shared-memory version from my line 121 onwards?

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <random>
#include <stdio.h>

#include <cuda.h>

// ==============================================
// Empty tag type: base for stencil-shape selectors, dispatched at compile
// time via template specialization (see apply<> below).
struct StencilType {};
// Star (plus-shaped) stencil: cells above/below/left/right of the center.
struct Star : StencilType {};
//--------------------------------------
// Empty tag type: base for thread-to-output mapping selectors (see home<>).
struct AccessPattern {};
// Each thread's home position is its global (x, y) thread index.
struct XY_Reuse : AccessPattern {};
//-----------------------------------------
using DataType = float;
// Interior (un-padded) input / output dimensions.
const unsigned int IN_H = 16;
const unsigned int IN_W = 16;
// Thread-block tile dimensions: blocks are launched as (M, N) = (x, y).
// NOTE(review): N is also used as the stencil arm length in apply<Star>;
// it happens to equal STENCIL_RADIUS here — confirm which one is intended.
const unsigned int N = 4;
const unsigned int M = 4;
// Halo width: the input buffers are padded by this many cells on every side.
const unsigned int STENCIL_RADIUS = 4;
using STENCIL_PATTERN = Star;
using HOME_ACCESS_PATTERN = XY_Reuse;

// INDEX FUNCTIONS
// ==============================================

// Flat offset of interior coordinate (y, x) in the halo-padded input, whose
// rows are (IN_W + 2 * STENCIL_RADIUS) elements wide.
__device__ unsigned int indexOf(unsigned int y, unsigned int x) {
  const unsigned int pitch = IN_W + STENCIL_RADIUS * 2;
  const unsigned int row = y + STENCIL_RADIUS;
  const unsigned int col = x + STENCIL_RADIUS;
  return pitch * row + col;
}

// Flat offset of (y, x) in the un-padded IN_H x IN_W output buffer.
__device__ unsigned int indexOfOut(unsigned int y, unsigned int x) {
  return x + y * IN_W;
}

// Intended flat offset of global coordinate (y, x) inside a shared-memory
// tile whose interior origin is (S_Y, S_X).  Currently unused: the
// cooperative shared-memory load in pattern() is still a TODO.
//
// NOTE(review): two things look suspicious for a per-block tile and should
// be confirmed before wiring this in: the row pitch is the *global* padded
// width (IN_W + 2 * STENCIL_RADIUS) rather than a per-block tile width, and
// the y term lacks the +STENCIL_RADIUS halo offset that the x term has.
__device__ unsigned int sharedIndexOf(
    unsigned int y,
    unsigned int x,
    unsigned int S_Y,
    unsigned int S_X) {
  return ((IN_W + STENCIL_RADIUS * 2) * (y - S_Y)) + ((x - S_X) + STENCIL_RADIUS);
}

// COMPUTE KERNEL
// ==============================================

// Primary template: computes one output element of the stencil centered at
// home position (H_X, H_Y) from the two padded inputs.  Deliberately left
// undefined — only explicit specializations (apply<Star> below) exist, so
// using an unsupported stencil tag fails at link time.
template <typename SType>
__device__
DataType apply(
    const DataType* __restrict__,
    const DataType* __restrict__,
    unsigned int,
    unsigned int);

// Star-shaped stencil: sum of in[p] * in2[p] over the cross made of the
// center cell, N cells to each side, and N cells above/below (H_X, H_Y).
//
// Fixes over the original:
//  * loop bounds are now signed: with unsigned N, "int i = -N; i <= N"
//    never iterated, because the comparison converts i back to a huge
//    unsigned value — the whole horizontal arm (center included) was skipped;
//  * the third loop walks *down* (H_Y + i); the original repeated the upward
//    arm (H_Y - i) twice and never read the cells below the center.
template <>
__device__
DataType apply<Star>(
    const DataType* __restrict__ in,
    const DataType* __restrict__ in2,
    unsigned int H_X,
    unsigned int H_Y) {
  const int R = static_cast<int>(N);  // stencil arm length, signed on purpose
  DataType acc = 0;

  // Arm above the center: rows H_Y - R .. H_Y - 1.
  for (int i = R; i > 0; --i) {
    acc += in[indexOf(H_Y - i, H_X)] * in2[indexOf(H_Y - i, H_X)];
  }
  // Horizontal arm, including the center cell itself (i == 0).
  for (int i = -R; i <= R; ++i) {
    acc += in[indexOf(H_Y, H_X + i)] * in2[indexOf(H_Y, H_X + i)];
  }
  // Arm below the center: rows H_Y + 1 .. H_Y + R.
  for (int i = 1; i <= R; ++i) {
    acc += in[indexOf(H_Y + i, H_X)] * in2[indexOf(H_Y + i, H_X)];
  }
  return acc;
}
// END COMPUTE KERNEL

// HOME ACCESS PATTERN
// ==============================================

// Primary template: maps a work-unit iteration index (first arg) and the
// work-unit count (second arg) to this thread's "home" output coordinate.
// Deliberately left undefined — only specializations exist (home<XY_Reuse>).
template <typename Pattern>
__device__
dim3 home(dim3, dim3);

// XY_Reuse mapping: the home position is simply the thread's global (x, y)
// index.  Both parameters (iteration index, work-unit count) are ignored.
template <>
__device__
dim3 home<XY_Reuse>(dim3, dim3) {
  const unsigned int gx = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int gy = blockIdx.y * blockDim.y + threadIdx.y;
  return dim3(gx, gy);
}
// END HOME ACCESS PATTERN

// GLOBAL KERNEL
// ==============================================

// Stencil kernel: one thread per output element.  Expects a 2D grid/block
// launch; when the total thread extent is smaller than the input, the WU
// loops below tile over the remainder (NUM_WUS is a ceiling division).
// The cooperative shared-memory load is still a TODO; the two barriers
// bracket where it will go.
//
// Fix over the original: apply<> takes (H_X, H_Y) in that order, but the
// original call passed (POS.y, POS.x), transposing the stencil — masked
// here only because the input is square (16x16).
__global__
void pattern(
    const DataType* __restrict__ in,
    const DataType* __restrict__ in2,
    DataType* __restrict__ out) {
  const dim3 IN (IN_W, IN_H);

  // Work units per thread in each dimension (ceiling division by the total
  // number of launched threads in that dimension).
  const dim3 NUM_WUS (
      (IN.x + (gridDim.x * blockDim.x) - 1) / (gridDim.x * blockDim.x),
      (IN.y + (gridDim.y * blockDim.y) - 1) / (gridDim.y * blockDim.y));

  // This thread's global (x, y) position.
  const dim3 POS (
      (blockDim.x * blockIdx.x + threadIdx.x),
      (blockDim.y * blockIdx.y + threadIdx.y));

  dim3 iter;
  for (iter.x = 0; iter.x < NUM_WUS.x; ++iter.x) {
    for (iter.y = 0; iter.y < NUM_WUS.y; ++iter.y) {
      // NOTE(review): WU is not consumed yet — the compute step below
      // indexes with POS, so every WU iteration redoes the same element.
      const dim3 WU = home<HOME_ACCESS_PATTERN>(iter, NUM_WUS);
      // NUM_WUS is identical for every thread in the block, so these
      // barriers are reached uniformly and are safe inside the loop.
      __syncthreads();
      // co-op load -- shared memory -- TODO
      __syncthreads();
      // Guard threads past the right/bottom edge of the output.
      if (POS.x < IN_W && POS.y < IN_H) {
        out[indexOfOut(POS.y, POS.x)] = apply<STENCIL_PATTERN>(in, in2, POS.x, POS.y);
      }
    }
  }
}

// Host driver: fills two halo-padded managed buffers with uniform random
// floats, runs the stencil kernel, and writes the raw output to the file
// named by argv[1].
//
// Fixes over the original:
//  * allocation sizes are in *bytes* — the original passed element counts,
//    under-allocating every buffer by a factor of sizeof(DataType);
//  * every CUDA call and the kernel launch are error-checked;
//  * argv[1] is validated before use;
//  * the grid is a plain 2D ceiling division — the original launched an
//    oversized y extent plus N redundant z layers the kernel never reads;
//  * the output is written as raw bytes, matching std::ios::binary (the
//    original streamed formatted floats with no separator between values).
int main(int argc, char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: " << argv[0] << " <output-file>\n";
    return 1;
  }

  // Padded extents: halo of STENCIL_RADIUS cells on every side.
  const unsigned int WIDTH = (IN_W + 2 * STENCIL_RADIUS);
  const unsigned int HEIGHT = (IN_H + 2 * STENCIL_RADIUS);
  const unsigned int DATA_SIZE = WIDTH * HEIGHT;

  // Minimal error-check helper: report and bail on any CUDA failure.
  auto check = [](cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
      std::cerr << what << ": " << cudaGetErrorString(err) << "\n";
      std::exit(1);
    }
  };

  DataType* in1 = nullptr;
  DataType* in2 = nullptr;
  DataType* out = nullptr;
  check(cudaMallocManaged(reinterpret_cast<void**>(&in1),
                          DATA_SIZE * sizeof(DataType)), "alloc in1");
  check(cudaMallocManaged(reinterpret_cast<void**>(&in2),
                          DATA_SIZE * sizeof(DataType)), "alloc in2");
  check(cudaMallocManaged(reinterpret_cast<void**>(&out),
                          IN_W * IN_H * sizeof(DataType)), "alloc out");

  std::random_device rd;
  std::uniform_real_distribution<DataType> dist;
  std::generate_n (in1, DATA_SIZE, [&rd,&dist] { return dist(rd); });
  std::generate_n (in2, DATA_SIZE, [&rd,&dist] { return dist(rd); });

  printf ("in1 and in2 are : %f %f \n", *in1, *in2);

  // One thread per output element; ceiling division covers non-multiples.
  dim3 threads(M, N);
  dim3 blocks((IN_W + M - 1) / M, (IN_H + N - 1) / N);
  pattern <<<blocks, threads>>> (in1, in2, out);
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "kernel execution");

  std::ofstream outFile(argv[1], std::ios::binary);
  outFile.write(reinterpret_cast<const char*>(out),
                IN_W * IN_H * sizeof(DataType));
  outFile.close();

  cudaFree(in1);
  cudaFree(in2);
  cudaFree(out);

  return 0;
}