I took the source code from the following link:
and, refactored it using registers and shuffle instructions.
However, the output is not correct:
Result Matrix:
28 0 36 0
38 0 50 0
108 0 148 0
118 0 162 0
What am I doing incorrectly?
#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS1 = 4;
const int COLS1 = 4;
const int ROWS2 = 4;
const int COLS2 = 4;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;
#define IDX(tile_size, tile_i, relative_i) ((tile_size) * (tile_i) + (relative_i))
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2)
{
int tile_i = blockIdx.y;
int tile_j = blockIdx.x;
int cell_i = threadIdx.y;
int cell_j = threadIdx.x;
// column vectors of matrices A and B are stored in registers before multiplication
int regA[TILE_ROW_SIZE];
int regB[TILE_ROW_SIZE];
// the multiplication result is stored in shared memory
__shared__ int shrdC[TILE_ROW_SIZE][TILE_COL_SIZE+1];
shrdC[cell_i][cell_j] = 0;
__syncthreads();
for (int tile_r = 0; tile_r < cols2/tile_col_size; tile_r++)
{
regA[cell_i] = A[IDX(cols1, tile_i*TILE_ROW_SIZE+cell_i, tile_r*TILE_COL_SIZE+cell_j)];
regB[cell_j] = B[IDX(cols2, tile_r*TILE_ROW_SIZE+cell_i, tile_j*TILE_COL_SIZE+cell_j)];
__syncthreads();
for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
{
// the kernel uses shuffle instructions to replace the contents of registers
int valA = __shfl_sync(0xffffffff, regA[cell_r], cell_i);
int valB = __shfl_sync(0xffffffff, regB[cell_r], cell_j);
shrdC[cell_i][cell_j] += valA * valB;
}
__syncthreads();
}
int c_i = tile_i * TILE_ROW_SIZE + cell_i;
int c_j = tile_j * TILE_COL_SIZE + cell_j;
if (c_i < rows1 && c_j < cols2)
{
C[IDX(COLS3, c_i, c_j)] = shrdC[cell_i][cell_j];
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
// the kernel should use 32 thread blocks
// however for this experiment we are using 2.
int main() {
int *A, *B, *C;
int *d_A, *d_B, *d_C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));
cudaMemset(d_C, 0, ROWS3 * COLS3 * sizeof(int));
cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyHostToDevice);
dim3 blocks(COLS3/TILE_COL_SIZE, ROWS3/TILE_ROW_SIZE);
dim3 threads(TILE_COL_SIZE, TILE_ROW_SIZE);
MultiplyAsSumOuterProductOfVectors<<<blocks, threads>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);
cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);
cout << "Result Matrix:" << endl;
printMatrix(C, ROWS3, COLS3);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}