Hello! I’m writing a program that is given an N x N matrix (“mat”) and computes an (N-2R) x (N-2R) matrix (“res”) filled with the element sums of the (2R+1) x (2R+1) submatrices of “mat”. My code is similar to convolution (filter) kernels. Each thread in the 2D grid of 2D thread blocks should calculate K results (K elements of the matrix “res”). Currently I have a basic implementation, but I’m struggling to work out how to improve it (or at least rewrite it) using shared memory. I tried to reuse the shared-memory code from this repo: https://github.com/rpgolshan/CUDA-image-processing/blob/master/convolution.cu, but it seems to be designed only for the K=1 case.

Below I provide my current implementation that does not use shared memory yet.

```
// Calculate unique thread ID
// Compute this thread's unique linear index in a square 2D grid of square
// 2D thread blocks. BS is the block side length (blockDim.x == blockDim.y
// assumed), GS is the grid side length (gridDim.x == gridDim.y assumed).
__device__ int getThreadID_cacheonly(int BS, int GS) {
    int idxInBlock = threadIdx.y * BS + threadIdx.x;   // row-major within the block
    int blockIndex = blockIdx.y * GS + blockIdx.x;     // row-major within the grid
    return blockIndex * (BS * BS) + idxInBlock;
}
// Box-sum kernel: for every output cell, sum the (2R+1) x (2R+1) window of
// "mat" centered on the corresponding input cell.
//   res: (N-2R) x (N-2R) output, row-major.
//   mat: N x N input, row-major (read-only).
//   K:   number of consecutive output elements each thread computes.
// Assumes square blocks (blockDim.x == blockDim.y) and a square grid
// (gridDim.x == gridDim.y), as required by getThreadID_cacheonly.
__global__ void sum_gpu_cacheonly_kernel(int* res, const int* mat, int N, int R, int K)
{
    // Output side length and total element count.
    int N_OUT = N - 2 * R;
    // Spelled out instead of SIZE(N_OUT): the /N_OUT and %N_OUT indexing
    // below requires OUT_SIZE == N_OUT * N_OUT, so drop the macro dependency.
    int OUT_SIZE = N_OUT * N_OUT;
    // Block side (BS x BS) and grid side (GS x GS).
    int BS = blockDim.x, GS = gridDim.x;
    // Number of threads that actually have work. Integer ceil-div replaces
    // ceil(OUT_SIZE / (float)K): float cannot represent integers above 2^24
    // exactly, so the float version silently miscounts for large outputs.
    int max_thread = (OUT_SIZE + K - 1) / K;
    // Unique linear thread (and output chunk) index.
    int thread_index = getThreadID_cacheonly(BS, GS);
    // Valid worker indices are 0 .. max_thread-1. The original used '>',
    // letting thread max_thread fall through (harmless only because of the
    // dest_loc bound check below); '>=' is the correct guard.
    if (thread_index >= max_thread) {
        return;
    }
    // Compute the K consecutive results owned by this thread.
    for (int k = 0; k < K; k++)
    {
        // Linear index of the k-th result in "res".
        int dest_loc = thread_index * K + k;
        // Past the last output element; later k only increase dest_loc,
        // so the whole thread is done.
        if (dest_loc >= OUT_SIZE) {
            return;
        }
        // Center of the (2R+1) x (2R+1) source window in "mat".
        int src_i = (dest_loc / N_OUT) + R, src_j = (dest_loc % N_OUT) + R;
        int sum = 0;
        // Accumulate all window elements within "radius" R of the center.
        for (int i = -R; i <= R; i++)
        {
            // Row base address, hoisted out of the inner loop.
            int part_loc = (i + src_i) * N + src_j;
            for (int j = -R; j <= R; j++)
            {
                sum += mat[part_loc + j];
            }
        }
        // Write the k-th result to the output array.
        res[dest_loc] = sum;
    }
}
```

I call the kernel as follows:

```
// One 2D block of BS x BS threads; enough blocks to tile the N_OUT x N_OUT
// output (ceil-div in each dimension, so the grid covers a ragged edge).
// NOTE(review): this launches roughly one thread per output element, but the
// kernel assigns K outputs per thread — with K > 1 about (K-1)/K of the
// launched threads fail the max_thread guard and exit immediately. Consider
// shrinking the grid by a factor of K to match.
dim3 threads(BS, BS);
dim3 grid(ceil((float)N_OUT / threads.x), ceil((float)N_OUT / threads.y));
```

where BS is the side length of a single 2D thread block.