Rewrite algorithm with shared memory

Hello! I’m writing a program that is given an N x N matrix (“mat”) and calculates an (N-2R) x (N-2R) matrix (“res”) filled with the element sums of the (2R+1) x (2R+1) submatrices of “mat”. My code is somewhat similar to convolution kernels (filters). Each thread in the 2D grid of 2D thread blocks should calculate K results (K elements of the matrix “res”). Currently I have a basic implementation, but I’m struggling to figure out how to improve it (or at least rewrite it) using shared memory. I tried to re-use the shared-memory code from this repo: https://github.com/rpgolshan/CUDA-image-processing/blob/master/convolution.cu, but it seems to be designed only for K=1.
Below I provide my current implementation that does not use shared memory yet.

// Calculate unique thread ID
// Flatten the 2D block/grid coordinates into a unique global thread index.
// BS is the side length of a (square) thread block, GS the side length of
// the (square) grid; assumes blockDim.x == blockDim.y == BS and
// gridDim.x == gridDim.y == GS.
__device__ int getThreadID_cacheonly(int BS, int GS) {
    int localId = threadIdx.y * BS + threadIdx.x;   // index within the block
    int blockId = blockIdx.y * GS + blockIdx.x;     // index of the block in the grid
    return blockId * (BS * BS) + localId;           // global flat thread index
}

// Computes "res" (N_OUT x N_OUT, with N_OUT = N - 2R): each output element is
// the sum of the (2R+1) x (2R+1) submatrix of "mat" (N x N) centred on the
// corresponding source element. Each thread produces K consecutive output
// elements. Expects a square 2D block (blockDim.x == blockDim.y) and a square
// 2D grid (gridDim.x == gridDim.y), matching getThreadID_cacheonly.
__global__ void sum_gpu_cacheonly_kernel(int* res, const int* mat, int N, int R, int K)
{
    // Output array size (N_OUT x N_OUT)
    int N_OUT = N - 2 * R;
    int OUT_SIZE = SIZE(N_OUT);
    // Block size (BS x BS) and grid size (GS x GS)
    int BS = blockDim.x, GS = gridDim.x;
    // Number of threads that have at least one element to produce.
    // Integer ceil-division is exact for all sizes, unlike ceil() on a float
    // quotient, which can round wrongly once OUT_SIZE exceeds float precision.
    int max_thread = (OUT_SIZE + K - 1) / K;
    // Get unique thread (and output) index
    int thread_index = getThreadID_cacheonly(BS, GS);
    // Valid worker indices are 0 .. max_thread-1. (The original compared with
    // `> max_thread`, an off-by-one that let one extra thread through; its
    // work was only discarded later by the dest_loc bounds check.)
    if (thread_index >= max_thread) {
        return;
    }
    // Calculate the k-th result for this thread
    for (int k = 0; k < K; k++)
    {
        // Flat index of the k-th result for this thread
        int dest_loc = thread_index * K + k;
        if (dest_loc >= OUT_SIZE) {
            return;  // the last thread may own fewer than K elements
        }
        // Source submatrix "center" position for the k-th result
        int src_i = (dest_loc / N_OUT) + R;
        int src_j = (dest_loc % N_OUT) + R;
        // Accumulate the (2R+1) x (2R+1) window around (src_i, src_j)
        int sum = 0;
        for (int i = -R; i <= R; i++)
        {
            int row_base = (i + src_i) * N + src_j;
            for (int j = -R; j <= R; j++)
            {
                sum += mat[row_base + j];
            }
        }
        // Write the k-th result to the output array
        res[dest_loc] = sum;
    }
}  // NOTE: this closing brace was missing in the original (it would not compile)

I call the kernel as follows:

// Square 2D thread block of BS x BS threads.
dim3 threads(BS, BS);
// Ceil-divide so the grid covers all N_OUT outputs per axis even when N_OUT
// is not a multiple of BS.
// NOTE(review): with K > 1 each thread already produces K outputs, so this
// grid launches roughly K times more threads than needed — confirm whether
// the grid should instead be shrunk by a factor of K.
dim3 grid(ceil((float)N_OUT / threads.x), ceil((float)N_OUT / threads.y));

where BS is the side length of a single (square) 2D thread block.