First I wrote the following listing which multiplies two matrices.
A =B= [[1 2 3 4],
[5 6 7 8],
[9 10 11 12],
[13 14 15 16]]
A*B = [[90 100 110 120],
[202 228 254 280],
[314 356 398 440],
[426 484 542 600]]
#include <cstdarg>
#include <fstream>
#include <iomanip>
#include <iostream>
using namespace std;
const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;
const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;
#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)
void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2) {
for (int tile_i = 0; tile_i < tile_col_size; tile_i++) {//x
for (int tile_j = 0; tile_j < tile_row_size; tile_j++) {//y
for (int tile_r = 0; tile_r < tile_col_size; tile_r++) {//x
for (int cell_r = 0; cell_r < cols1; cell_r++) {//x
for (int cell_i = 0; cell_i < rows1; cell_i++) {//y
for (int cell_j = 0; cell_j < cols2; cell_j++) {//x
int r = IDX(TILE_COL_SIZE, tile_r, cell_r);
int i = IDX(TILE_ROW_SIZE, tile_i, cell_i);
int j = IDX(TILE_COL_SIZE, tile_j, cell_j);
C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
}
}
}
}
}
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
printf("%d ", mat[i * cc + j]);
}
printf("\n");
}
printf("\n");
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
MultiplyAsSumOuterProductOfVectors(A, B, C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1 / TILE_COL_SIZE, ROWS1 / TILE_ROW_SIZE, COLS2 / TILE_COL_SIZE);
printMatrix(C, ROWS3, COLS3);
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}
Then I translated that into the following CUDA program.
However, the output seems to be incorrect:
66 116 86 136
146 276 198 328
226 436 310 520
306 596 422 712
How can I find the bug?
#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;
const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;
const int ROWS3 = ROWS1;
const int COLS3 = COLS2;
const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;
#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2)
{
int tile_i = blockIdx.y;
int tile_j = blockIdx.x;
int cell_i = threadIdx.y;
int cell_j = threadIdx.x;
for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
{
int r = IDX(TILE_COL_SIZE, tile_r, cell_j);
__shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
__shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];
subA[cell_i][cell_j] = A[IDX(cols1, (tile_i * TILE_ROW_SIZE + cell_i), r)];
subB[cell_i][cell_j] = B[IDX(cols2, r, (tile_j * TILE_COL_SIZE + cell_j))];
__syncthreads();
for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++)
{
int c_i = tile_i * TILE_ROW_SIZE + cell_i;
int c_j = tile_j * TILE_COL_SIZE + cell_j;
if (c_i < rows1 && c_j < cols2)
{
C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
}
}
__syncthreads();
}
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void allocateMatrix(int *&a, int rows, int cols) {
a = new int[rows * cols];
}
void freeMatrix(int *a) {
delete[] a;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
int main() {
int *A, *B, *C;
int *d_A, *d_B, *d_C;
allocateMatrix(A, ROWS1, COLS1);
initMatrix(A, ROWS1, COLS1);
allocateMatrix(B, ROWS2, COLS2);
initMatrix(B, ROWS2, COLS2);
allocateMatrix(C, ROWS3, COLS3);
initMatrixZero(C, ROWS3, COLS3);
// Allocate device memory
cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));
// Copy input matrices from host to device
cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);
// Set grid and block dimensions
dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);
// Launch the kernel
MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);
// Copy result matrix from device to host
cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result matrix
cout << "Result Matrix:" << endl;
printMatrix(C, ROWS3, COLS3);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
freeMatrix(A);
freeMatrix(B);
freeMatrix(C);
return 0;
}