How can I store intermediate results in shared memory?

My teacher wrote:

Implement a CUDA kernel that computes the matrix product as a sum of outer products of vectors. In this version, each block of K threads computes a square piece of the result matrix of size KxK by applying the outer-product formula. The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B and to store the corresponding fragment of the result array.

According to the outer product formula, I wrote the following CUDA program:

#include <cstdio>

const int ROWS1 = 4;
const int COLS1 = 4;

const int ROWS2 = 4;
const int COLS2 = 4;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;

// Maps a (block index, offset within the block) pair to a flat index.
#define IDX(block_size, block_i, relative_i) (block_size * block_i + relative_i)

// Each block computes one BLOCK_ROW_SIZE x BLOCK_COL_SIZE tile of C.
// Note that cols1, rows1 and cols2 receive the per-block extents
// (COLS1 / BLOCK_COL_SIZE, etc.), not the full matrix dimensions, and that
// block_row_size and block_col_size are currently unused.
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                                   int block_row_size, int block_col_size,
                                                   int cols1, int rows1, int cols2) {
  int block_i = blockIdx.x;   // row index of this block's tile
  int block_j = blockIdx.y;   // column index of this block's tile
  int block_r = threadIdx.x;  // this thread's slice of the r axis
  int cell_r, cell_i, cell_j;
  int r, i, j;

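  // Each thread walks over its slice of the r axis and over every cell of
  // this block's tile, accumulating outer-product terms directly in global
  // memory (this is the part that shared memory is supposed to replace).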
  for (cell_r = 0; cell_r < cols1; cell_r++) {
    for (cell_i = 0; cell_i < rows1; cell_i++) {
      for (cell_j = 0; cell_j < cols2; cell_j++) {
        r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
        i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
        j = IDX(BLOCK_COL_SIZE, block_j, cell_j);

        // Accumulate this outer-product term; atomicAdd is needed because
        // threads with different block_r update the same element of C.
        atomicAdd(&C[i * COLS3 + j], A[i * COLS1 + r] * B[r * COLS2 + j]);
      }
    }
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  cudaMallocManaged(&a, rows * cols * sizeof(int));
}

void freeMatrix(int *a) {
  cudaFree(a);
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // A 2x2 grid of blocks with BLOCK_COL_SIZE threads each; the last three
  // arguments are the per-block extents, not the full matrix dimensions.
  dim3 blocks(BLOCK_COL_SIZE, BLOCK_ROW_SIZE);
  MultiplyAsSumOuterProductOfVectors<<<blocks, BLOCK_COL_SIZE>>>(
      A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
      COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE, COLS2 / BLOCK_COL_SIZE);
  cudaDeviceSynchronize();

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

The output looks fine to me.

However, I am not able to see how I can use shared memory to store the intermediate results.

My listing doesn't need a separate 2D array of size BLOCK_ROW_SIZE x BLOCK_COL_SIZE, so I can't see how to adjust my algorithm/listing to match the problem definition.

How can I store intermediate results in shared memory?


Additional source code:

The following is the host (CPU-only) version of the listing above.

#include <cstdio>

const int ROWS1 = 4;
const int COLS1 = 4;

const int ROWS2 = 4;
const int COLS2 = 4;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;

#define IDX(block_size, block_i, relative_i) (block_size * block_i + relative_i)

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

// Host version of the kernel above: the three outer loops play the role of
// the CUDA grid (block_i, block_j) and of the threads within a block
// (block_r). For the 4x4 matrices and 2x2 blocks used here, every one of
// these bounds equals 2.
void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                        int block_row_size, int block_col_size,
                                        int cols1, int rows1, int cols2) {
  for (int block_i = 0; block_i < block_col_size; block_i++) {
    for (int block_j = 0; block_j < block_row_size; block_j++) {
      for (int block_r = 0; block_r < block_col_size; block_r++) {
        for (int cell_r = 0; cell_r < cols1; cell_r++) {
          for (int cell_i = 0; cell_i < rows1; cell_i++) {
            for (int cell_j = 0; cell_j < cols2; cell_j++) {
              int r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
              int i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
              int j = IDX(BLOCK_COL_SIZE, block_j, cell_j);

              C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
            }
          }
        }
      }
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  MultiplyAsSumOuterProductOfVectors(A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
                                     COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE,
                                     COLS2 / BLOCK_COL_SIZE);

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

I think your teacher wants the kernel to load values from global memory into
shared memory, work on them there (placing the intermediate results in shared
memory as well), and then store the finished results from shared memory back
to global memory:

“The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B and to store the corresponding fragment of the result array.”

So, as far as I understand, the kernel will need additional steps to load data from global memory into shared memory and, at the end, to copy the results from shared memory back to global memory.

However, these additional iterations or steps are compensated for by the superior speed of shared memory.

Am I correct?

Yes, though not necessarily for a kernel that only does a little work.

Your teacher may just want you to demonstrate that you know how to:

  • set up shared memory
  • copy between it and global memory
  • provide the required synchronization (see the sketch below and the linked article)
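
For illustration, here is a minimal sketch of such a kernel for your program. This is my reading of the assignment, not necessarily the solution your teacher has in mind: it assumes the same 4x4 matrices and 2x2 tiles as your code, uses K threads per block for a KxK tile, and the kernel name and the column-per-thread work split are my own choices. It is meant as a drop-in replacement for your kernel, reusing your COLS1, COLS2 and COLS3 constants:

const int K = 2;  // tile size; also the number of threads per block

__global__ void MultiplyAsSumOuterProductOfVectorsShared(const int *A, const int *B,
                                                         int *C, int cols1) {
  // Shared memory: a fragment of column r of A, a fragment of row r of B,
  // and the K x K fragment of C that this block is responsible for.
  __shared__ int a_col[K];
  __shared__ int b_row[K];
  __shared__ int c_tile[K][K];

  int t = threadIdx.x;        // one of the K threads of this block
  int row0 = blockIdx.y * K;  // first row of this block's tile of C
  int col0 = blockIdx.x * K;  // first column of this block's tile of C

  // Thread t owns column t of the tile, so zeroing needs no synchronization.
  for (int i = 0; i < K; i++) c_tile[i][t] = 0;

  // The tile of C is the sum over r of the outer products a_col * b_row.
  for (int r = 0; r < cols1; r++) {
    // Cooperative load: thread t fetches one element of each vector.
    a_col[t] = A[(row0 + t) * COLS1 + r];
    b_row[t] = B[r * COLS2 + col0 + t];
    __syncthreads();  // all loads must be visible before anyone reads them

    // Thread t accumulates column t of this rank-1 update.
    for (int i = 0; i < K; i++) c_tile[i][t] += a_col[i] * b_row[t];
    __syncthreads();  // finish the update before the vectors are overwritten
  }

  // Store the finished fragment from shared memory back to global memory.
  for (int i = 0; i < K; i++) C[(row0 + i) * COLS3 + col0 + t] = c_tile[i][t];
}

You would launch it with one block per tile of C:

  dim3 grid(COLS3 / K, ROWS3 / K);
  MultiplyAsSumOuterProductOfVectorsShared<<<grid, K>>>(A, B, C, COLS1);

Because each block accumulates its whole fragment in c_tile and writes it out only once at the end, the atomicAdd from your version is no longer needed.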

This article does a great job of explaining how using shared memory can speed things up, and it will likely help you with using shared memory in your assignment:
https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/