#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS_Y = 3;//1024;
const int COLS_X = 3;//1024;
const int TILE_ROWS_Y = 1;//32;
const int TILE_COLS_X = 1;//32;
#define GX(tx, lx) ((tx) * (TILE_COLS_X) + (lx))
#define GY(ty, ly) ((ty) * (TILE_ROWS_Y) + (ly))
#define GID2(gx, gy) ((gy) * (COLS_X) + (gx))
#define GID4(tx, ty, lx, ly) ((GY(ty, ly)) * (COLS_X) + (GX(tx, lx)))
#define MOSAIC_ROWS_Y ((ROWS_Y) / (TILE_ROWS_Y))
#define MOSAIC_COLS_X ((COLS_X) / (TILE_COLS_X))
#define LID(lx, ly) ((ly)*(TILE_COLS_X)+(lx))
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C)
{
int ty = blockIdx.y;
int tx = blockIdx.x;
int ly = threadIdx.y;
int lx = threadIdx.x;
for (int tr = 0; tr < MOSAIC_COLS_X; tr++)
{
__shared__ int subA[TILE_ROWS_Y*TILE_COLS_X];
__shared__ int subB[TILE_ROWS_Y*TILE_COLS_X];
subA[LID(lx,ly)] = A[GID4(tr,ty,lx,ly)];
subB[LID(lx,ly)] = B[GID4(tx,tr,lx,ly)];
__syncthreads();
for (int lr = 0; lr < TILE_ROWS_Y; lr++)
{
int gy = GY(ty, ly);
int gx = GX(tx, lx);
if (gy < ROWS_Y && gx < COLS_X)
{
C[GID2(gx, gy)] += subA[LID(lr,ly)] * subB[LID(lx,lr)];
}
}
__syncthreads();
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
int *d_A, *d_B, *d_C;
allocateMatrix(A, ROWS_Y, COLS_X);
initMatrix(A, ROWS_Y, COLS_X);
allocateMatrix(B, ROWS_Y, COLS_X);
initMatrix(B, ROWS_Y, COLS_X);
allocateMatrix(C, ROWS_Y, COLS_X);
initMatrixZero(C, ROWS_Y, COLS_X);
// Allocate device memory
cudaMalloc((void **)&d_A, ROWS_Y * COLS_X * sizeof(int));
cudaMalloc((void **)&d_B, ROWS_Y * COLS_X * sizeof(int));
cudaMalloc((void **)&d_C, ROWS_Y * COLS_X * sizeof(int));
// Copy input matrices from host to device
cudaMemcpy(d_A, A, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);
// Set grid and block dimensions
dim3 gridSize(MOSAIC_COLS_X, MOSAIC_ROWS_Y);
dim3 blockSize(TILE_COLS_X, TILE_ROWS_Y);
// Launch the kernel
MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C);
// Copy result matrix from device to host
cudaMemcpy(C, d_C, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result matrix
cout << "Result Matrix:" << endl;
printMatrix(C, ROWS_Y, COLS_X);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}