My teacher wrote:
Implement a CUDA kernel that computes a matrix product as a sum of outer products of vectors. In this version, each block of K threads computes a square K x K piece of the result matrix by applying the matrix outer product formula. The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B, and to store the corresponding fragment of the result matrix.
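(For clarity, by "the matrix outer product formula" I understand the decomposition of the product into a sum of outer products of the columns of A with the rows of B:

    C = sum over r of a_r * b_r^T,   i.e.   C[i][j] = sum over r of A[i][r] * B[r][j],

where a_r is column r of A and b_r^T is row r of B.)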
According to the outer product formula, I wrote the following CUDA program:
#include <cstdio>
const int ROWS1 = 4;
const int COLS1 = 4;
const int ROWS2 = 4;
const int COLS2 = 4;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;
#define IDX(block_size, block_i, relative_i) ((block_size) * (block_i) + (relative_i))
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                                   int block_row_size, int block_col_size,
                                                   int cols1, int rows1, int cols2) {
    int block_i = blockIdx.x;   // row-block index of the C tile this block works on
    int block_j = blockIdx.y;   // column-block index of the C tile
    int block_r = threadIdx.x;  // block index along the inner (shared) dimension
    int cell_r, cell_i, cell_j;
    int r, i, j;
    // cols1, rows1 and cols2 are passed in already divided by the block sizes,
    // so each loop runs over the cells of a single block in that dimension.
    for (cell_r = 0; cell_r < cols1; cell_r++) {
        for (cell_i = 0; cell_i < rows1; cell_i++) {
            for (cell_j = 0; cell_j < cols2; cell_j++) {
                r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
                i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
                j = IDX(BLOCK_COL_SIZE, block_j, cell_j);
                // threads of the same block accumulate into the same C elements,
                // hence the atomic add
                atomicAdd(&C[i * COLS3 + j], A[i * COLS1 + r] * B[r * COLS2 + j]);
            }
        }
    }
}
void printMatrix(int *mat, int rr, int cc) {
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            printf("%d ", mat[i * cc + j]);
        }
        printf("\n");
    }
    printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
    cudaMallocManaged(&a, rows * cols * sizeof(int));
}

void freeMatrix(int *a) {
    cudaFree(a);
}

void initMatrix(int *mat, int rr, int cc) {
    int init = 1;
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            mat[i * cc + j] = init++;
        }
    }
}

void initMatrixZero(int *mat, int rr, int cc) {
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            mat[i * cc + j] = 0;
        }
    }
}
int main() {
    int *A, *B, *C;
    allocateMatrix(A, ROWS1, COLS1);
    initMatrix(A, ROWS1, COLS1);
    allocateMatrix(B, ROWS2, COLS2);
    initMatrix(B, ROWS2, COLS2);
    allocateMatrix(C, ROWS3, COLS3);
    initMatrixZero(C, ROWS3, COLS3);

    dim3 blocks(BLOCK_COL_SIZE, BLOCK_ROW_SIZE);
    MultiplyAsSumOuterProductOfVectors<<<blocks, BLOCK_COL_SIZE>>>(
        A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
        COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE, COLS2 / BLOCK_COL_SIZE);
    cudaDeviceSynchronize();

    printMatrix(C, ROWS3, COLS3);

    freeMatrix(A);
    freeMatrix(B);
    freeMatrix(C);
    return 0;
}
The output looks correct to me.
However, I cannot see how to use shared memory to store the intermediate results: as written, my listing does not need a separate 2D array of size BLOCK_ROW_SIZE x BLOCK_COL_SIZE at all, so I am unable to adjust my algorithm/listing to match the problem definition.
How can I store the intermediate results in shared memory?
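To make the question more concrete, below is a rough, untested sketch of the kind of kernel I imagine the assignment is asking for, with __shared__ arrays for the fragment of a column of A, the fragment of a row of B, and the K x K fragment of C (here K = BLOCK_ROW_SIZE = BLOCK_COL_SIZE = 2). The kernel name and parameters are just placeholders: unlike my kernel above, cols1/cols2/cols3 here would be the full matrix widths, not the per-block counts, and the launch would be one block of K threads per K x K tile of C, e.g. <<<dim3(ROWS3 / K, COLS3 / K), K>>>.

__global__ void MultiplyTileWithSharedMemory(const int *A, const int *B, int *C,
                                             int cols1, int cols2, int cols3) {
    const int K = BLOCK_ROW_SIZE;      // tile size; the block has K threads
    __shared__ int aCol[K];            // fragment of column r of A for this tile's rows
    __shared__ int bRow[K];            // fragment of row r of B for this tile's columns
    __shared__ int cTile[K][K];        // K x K fragment of C accumulated in shared memory

    int block_i = blockIdx.x;          // tile row index
    int block_j = blockIdx.y;          // tile column index
    int t = threadIdx.x;               // 0 .. K-1

    for (int j = 0; j < K; j++)        // clear this thread's row of the tile
        cTile[t][j] = 0;
    __syncthreads();

    for (int r = 0; r < cols1; r++) {
        // each thread loads one element of the A-column fragment and the B-row fragment
        aCol[t] = A[(block_i * K + t) * cols1 + r];
        bRow[t] = B[r * cols2 + block_j * K + t];
        __syncthreads();

        // add the outer product aCol * bRow^T to the tile; thread t owns row t
        for (int j = 0; j < K; j++)
            cTile[t][j] += aCol[t] * bRow[j];
        __syncthreads();               // finish before the fragments are overwritten
    }

    // copy the finished tile back to global memory
    for (int j = 0; j < K; j++)
        C[(block_i * K + t) * cols3 + block_j * K + j] = cTile[t][j];
}

As I understand it, the point of the shared arrays would be that the partial sums live in cTile instead of being atomically added to global C, and C is written only once at the end. If that is the right idea, I still do not see how to map my current loop structure onto it.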
Additional source code:
The following is the host-only (CPU) version of the listing above.
#include <cstdio>
const int ROWS1 = 4;
const int COLS1 = 4;
const int ROWS2 = 4;
const int COLS2 = 4;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int BLOCK_ROW_SIZE = 2;
const int BLOCK_COL_SIZE = 2;
#define IDX(block_size, block_i, relative_i) ((block_size) * (block_i) + (relative_i))
void printMatrix(int *mat, int rr, int cc) {
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            printf("%d ", mat[i * cc + j]);
        }
        printf("\n");
    }
    printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
    a = new int[rows * cols];
}

void freeMatrix(int *a) {
    delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
    int init = 1;
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            mat[i * cc + j] = init++;
        }
    }
}

void initMatrixZero(int *mat, int rr, int cc) {
    for (int i = 0; i < rr; i++) {
        for (int j = 0; j < cc; j++) {
            mat[i * cc + j] = 0;
        }
    }
}
void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
                                        int block_row_size, int block_col_size,
                                        int cols1, int rows1, int cols2) {
    // The three outer loops play the role of the grid/thread indices in the
    // CUDA version; the three inner loops walk over the cells of one block.
    for (int block_i = 0; block_i < block_col_size; block_i++) {
        for (int block_j = 0; block_j < block_row_size; block_j++) {
            for (int block_r = 0; block_r < block_col_size; block_r++) {
                for (int cell_r = 0; cell_r < cols1; cell_r++) {
                    for (int cell_i = 0; cell_i < rows1; cell_i++) {
                        for (int cell_j = 0; cell_j < cols2; cell_j++) {
                            int r = IDX(BLOCK_COL_SIZE, block_r, cell_r);
                            int i = IDX(BLOCK_ROW_SIZE, block_i, cell_i);
                            int j = IDX(BLOCK_COL_SIZE, block_j, cell_j);
                            C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
                        }
                    }
                }
            }
        }
    }
}
int main() {
    int *A, *B, *C;
    allocateMatrix(A, ROWS1, COLS1);
    initMatrix(A, ROWS1, COLS1);
    allocateMatrix(B, ROWS2, COLS2);
    initMatrix(B, ROWS2, COLS2);
    allocateMatrix(C, ROWS3, COLS3);
    initMatrixZero(C, ROWS3, COLS3);

    MultiplyAsSumOuterProductOfVectors(A, B, C, BLOCK_ROW_SIZE, BLOCK_COL_SIZE,
                                       COLS1 / BLOCK_COL_SIZE, ROWS1 / BLOCK_ROW_SIZE,
                                       COLS2 / BLOCK_COL_SIZE);
    printMatrix(C, ROWS3, COLS3);

    freeMatrix(A);
    freeMatrix(B);
    freeMatrix(C);
    return 0;
}