How can I store intermediate results in shared memory?

My teacher wrote:

Implement a CUDA kernel that computes the matrix product as a sum of outer products of vectors. In this version, each block of K threads computes a square piece of the result matrix of size KxK by applying the outer-product formula. The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B and to store the corresponding fragment of the result array.

According to the outer product formula, I wrote the following CUDA program:

#include <cstdio>

const int ROWS1 = 4;
const int COLS1 = 4;

const int ROWS2 = 4;
const int COLS2 = 4;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;

// Maps a (block index, offset within the block) pair to a flat index.
#define IDX(block_size, block_i, relative_i) (block_size * block_i + relative_i)

// Each block computes one BLOCK_ROW_SIZE x BLOCK_COL_SIZE tile of C.
// Note that cols1, rows1 and cols2 receive the per-block extents
// (COLS1 / BLOCK_COL_SIZE, etc.), not the full matrix dimensions, and that
// block_row_size and block_col_size are currently unused.
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                                   int block_row_size, int block_col_size,
                                                   int cols1, int rows1, int cols2) {
  int block_i = blockIdx.x;   // row index of this block's tile
  int block_j = blockIdx.y;   // column index of this block's tile
  int block_r = threadIdx.x;  // this thread's slice of the r axis
  int cell_r, cell_i, cell_j;
  int r, i, j;

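  // Each thread walks over its slice of the r axis and over every cell of
  // this block's tile, accumulating outer-product terms directly in global
  // memory (this is the part that shared memory is supposed to replace).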
  for (cell_r = 0; cell_r < cols1; cell_r++) {
    for (cell_i = 0; cell_i < rows1; cell_i++) {
      for (cell_j = 0; cell_j < cols2; cell_j++) {
        r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
        i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
        j = IDX(BLOCK_COL_SIZE, block_j, cell_j);

        // Accumulate this outer-product term; atomicAdd is needed because
        // threads with different block_r update the same element of C.
        atomicAdd(&C[i * COLS3 + j], A[i * COLS1 + r] * B[r * COLS2 + j]);
      }
    }
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  cudaMallocManaged(&a, rows * cols * sizeof(int));
}

void freeMatrix(int *a) {
  cudaFree(a);
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // A 2x2 grid of blocks with BLOCK_COL_SIZE threads each; the last three
  // arguments are the per-block extents, not the full matrix dimensions.
  dim3 blocks(BLOCK_COL_SIZE, BLOCK_ROW_SIZE);
  MultiplyAsSumOuterProductOfVectors<<<blocks, BLOCK_COL_SIZE>>>(
      A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
      COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE, COLS2 / BLOCK_COL_SIZE);
  cudaDeviceSynchronize();

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

The output looks fine to me.

However, I am not able to see how I can use shared memory to store the intermediate results.

My listing doesn't need a separate 2D array of size BLOCK_ROW_SIZE x BLOCK_COL_SIZE, so I can't see how to adjust my algorithm/listing to match the problem definition.

How can I store intermediate results in shared memory?


Additional source code:

The following is the host (CPU-only) version of the listing above.

#include <cstdio>

const int ROWS1 = 4;
const int COLS1 = 4;

const int ROWS2 = 4;
const int COLS2 = 4;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;

#define IDX(block_size, block_i, relative_i) (block_size * block_i + relative_i)

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

// Host version of the kernel above: the three outer loops play the role of
// the CUDA grid (block_i, block_j) and of the threads within a block
// (block_r). For the 4x4 matrices and 2x2 blocks used here, every one of
// these bounds equals 2.
void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                        int block_row_size, int block_col_size,
                                        int cols1, int rows1, int cols2) {
  for (int block_i = 0; block_i < block_col_size; block_i++) {
    for (int block_j = 0; block_j < block_row_size; block_j++) {
      for (int block_r = 0; block_r < block_col_size; block_r++) {
        for (int cell_r = 0; cell_r < cols1; cell_r++) {
          for (int cell_i = 0; cell_i < rows1; cell_i++) {
            for (int cell_j = 0; cell_j < cols2; cell_j++) {
              int r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
              int i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
              int j = IDX(BLOCK_COL_SIZE, block_j, cell_j);

              C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
            }
          }
        }
      }
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  MultiplyAsSumOuterProductOfVectors(A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
                                     COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE,
                                     COLS2 / BLOCK_COL_SIZE);

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

I think your teacher wants the kernel to load values from global memory into
shared memory, work on them there (placing the intermediate results in shared
memory as well), and then store the finished results from shared memory back
to global memory:

“The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B and to store the corresponding fragment of the result array.”

So, as far as I understand, the kernel will need additional steps to load data from global memory into shared memory and, at the end, to copy the results from shared memory back to global memory.

However, these additional iterations or steps are compensated for by the superior speed of shared memory.

Am I correct?

Yes, though not necessarily for a kernel that only does a little work.

Your teacher may just want you to demonstrate that you know how to:

  • set up shared memory
  • copy between it and global memory
  • provide the required synchronization (see the sketch below and the linked article)
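
For illustration, here is a minimal sketch of such a kernel for your program. This is my reading of the assignment, not necessarily the solution your teacher has in mind: it assumes the same 4x4 matrices and 2x2 tiles as your code, uses K threads per block for a KxK tile, and the kernel name and the column-per-thread work split are my own choices. It is meant as a drop-in replacement for your kernel, reusing your COLS1, COLS2 and COLS3 constants:

const int K = 2;  // tile size; also the number of threads per block

__global__ void MultiplyAsSumOuterProductOfVectorsShared(const int *A, const int *B,
                                                         int *C, int cols1) {
  // Shared memory: a fragment of column r of A, a fragment of row r of B,
  // and the K x K fragment of C that this block is responsible for.
  __shared__ int a_col[K];
  __shared__ int b_row[K];
  __shared__ int c_tile[K][K];

  int t = threadIdx.x;        // one of the K threads of this block
  int row0 = blockIdx.y * K;  // first row of this block's tile of C
  int col0 = blockIdx.x * K;  // first column of this block's tile of C

  // Thread t owns column t of the tile, so zeroing needs no synchronization.
  for (int i = 0; i < K; i++) c_tile[i][t] = 0;

  // The tile of C is the sum over r of the outer products a_col * b_row.
  for (int r = 0; r < cols1; r++) {
    // Cooperative load: thread t fetches one element of each vector.
    a_col[t] = A[(row0 + t) * COLS1 + r];
    b_row[t] = B[r * COLS2 + col0 + t];
    __syncthreads();  // all loads must be visible before anyone reads them

    // Thread t accumulates column t of this rank-1 update.
    for (int i = 0; i < K; i++) c_tile[i][t] += a_col[i] * b_row[t];
    __syncthreads();  // finish the update before the vectors are overwritten
  }

  // Store the finished fragment from shared memory back to global memory.
  for (int i = 0; i < K; i++) C[(row0 + i) * COLS3 + col0 + t] = c_tile[i][t];
}

You would launch it with one block per tile of C:

  dim3 grid(COLS3 / K, ROWS3 / K);
  MultiplyAsSumOuterProductOfVectorsShared<<<grid, K>>>(A, B, C, COLS1);

Because each block accumulates its whole fragment in c_tile and writes it out only once at the end, the atomicAdd from your version is no longer needed.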

This article does a great job of explaining how using shared memory can speed things up, and it will likely help you with using shared memory in your assignment:
https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/