Can you help me find the bug?

user366312 · August 13, 2023, 4:28pm

First I wrote the following listing which multiplies two matrices.

A =B= [[1 2 3 4],
       [5 6 7 8],
       [9 10 11 12],
       [13 14 15 16]]

A*B = [[90 100 110 120], 
       [202 228 254 280], 
       [314 356 398 440], 
       [426 484 542 600]]

#include <cstdarg>
#include <fstream>
#include <iomanip>
#include <iostream>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;
const int TILE_COL_SIZE = 2;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)

void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
int tile_row_size, int tile_col_size,
int cols1, int rows1, int cols2) {
  for (int tile_i = 0; tile_i < tile_col_size; tile_i++) {//x
      for (int tile_j = 0; tile_j < tile_row_size; tile_j++) {//y
          for (int tile_r = 0; tile_r < tile_col_size; tile_r++) {//x
              for (int cell_r = 0; cell_r < cols1; cell_r++) {//x
                  for (int cell_i = 0; cell_i < rows1; cell_i++) {//y
                      for (int cell_j = 0; cell_j < cols2; cell_j++) {//x
                          int r = IDX(TILE_COL_SIZE, tile_r, cell_r);
                          int i = IDX(TILE_ROW_SIZE, tile_i, cell_i);
                          int j = IDX(TILE_COL_SIZE, tile_j, cell_j);
                          C[i * COLS3 + j] += A[i * COLS1 + r] * B[r * COLS2 + j];
                      }
                  }
              }
          }
      }
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%d ", mat[i * cc + j]);
    }
    printf("\n");
  }
  printf("\n");
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  MultiplyAsSumOuterProductOfVectors(A, B, C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1 / TILE_COL_SIZE, ROWS1 / TILE_ROW_SIZE, COLS2 / TILE_COL_SIZE);

  printMatrix(C, ROWS3, COLS3);

  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Then I translated that into the following CUDA program.

However, the output seems to be incorrect:

    66    116     86    136
   146    276    198    328
   226    436    310    520
   306    596    422    712

How can I find the bug?

#include <iostream>
#include <iomanip>

using namespace std;

const int ROWS1 = 4;//1024;
const int COLS1 = 4;//1024;

const int ROWS2 = 4;//1024;
const int COLS2 = 4;//1024;

const int ROWS3 = ROWS1;
const int COLS3 = COLS2;

const int TILE_ROW_SIZE = 2;//32;
const int TILE_COL_SIZE = 2;//32;

#define IDX(tile_size, tile_i, relative_i) (tile_size * tile_i + relative_i)


__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C,
  int tile_row_size, int tile_col_size,
  int cols1, int rows1, int cols2) 
{
  int tile_i = blockIdx.y;
  int tile_j = blockIdx.x;

  int cell_i = threadIdx.y;
  int cell_j = threadIdx.x;

  for (int tile_r = 0; tile_r < tile_col_size; tile_r++)
  {
    int r = IDX(TILE_COL_SIZE, tile_r, cell_j);

    __shared__ int subA[TILE_ROW_SIZE][TILE_COL_SIZE];
    __shared__ int subB[TILE_ROW_SIZE][TILE_COL_SIZE];

    subA[cell_i][cell_j] = A[IDX(cols1, (tile_i * TILE_ROW_SIZE + cell_i), r)];
    subB[cell_i][cell_j] = B[IDX(cols2, r, (tile_j * TILE_COL_SIZE + cell_j))];

    __syncthreads();

    for (int cell_r = 0; cell_r < TILE_ROW_SIZE; cell_r++) 
	{
      int c_i = tile_i * TILE_ROW_SIZE + cell_i;
      int c_j = tile_j * TILE_COL_SIZE + cell_j;
	  
      if (c_i < rows1 && c_j < cols2)
	  {
        C[IDX(COLS3, c_i, c_j)] += subA[cell_i][cell_r] * subB[cell_r][cell_j];
      }
    }

    __syncthreads();
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  allocateMatrix(A, ROWS1, COLS1);
  initMatrix(A, ROWS1, COLS1);

  allocateMatrix(B, ROWS2, COLS2);
  initMatrix(B, ROWS2, COLS2);

  allocateMatrix(C, ROWS3, COLS3);
  initMatrixZero(C, ROWS3, COLS3);

  // Allocate device memory
  cudaMalloc((void **)&d_A, ROWS1 * COLS1 * sizeof(int));
  cudaMalloc((void **)&d_B, ROWS2 * COLS2 * sizeof(int));
  cudaMalloc((void **)&d_C, ROWS3 * COLS3 * sizeof(int));

  // Copy input matrices from host to device
  cudaMemcpy(d_A, A, ROWS1 * COLS1 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, ROWS2 * COLS2 * sizeof(int), cudaMemcpyHostToDevice);

  // Set grid and block dimensions
  dim3 gridSize(COLS3 / TILE_COL_SIZE, ROWS3 / TILE_ROW_SIZE);
  dim3 blockSize(TILE_COL_SIZE, TILE_ROW_SIZE);

  // Launch the kernel
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C, TILE_ROW_SIZE, TILE_COL_SIZE, COLS1, ROWS1, COLS2);

  // Copy result matrix from device to host
  cudaMemcpy(C, d_C, ROWS3 * COLS3 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS3, COLS3);

  // Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

Robert_Crovella · August 13, 2023, 7:12pm

Learn some debugging methods. For example you could start with in-kernel printf. Pick a particular result element that doesn’t match between the two realizations. Put a printf statement in-kernel that prints out every value that gets calculated and summed together for that value. For example, if you you found that element 0,0 is incorrect, then put a statement like

  if (c_i < rows1 && c_j < cols2)
  {    int temp =  subA[cell_i][cell_r] * subB[cell_r][cell_j];
       if ((tile_i == 0) && (cell_i == 0) && (tile_j == 0) && (cell_j == 0)) printf("%d\n", temp);
       C[IDX(COLS3, c_i, c_j)] += temp;
  }

You can easily get more fancy with that printf statement if you want. Look at all the individual products - do they make sense? Calculate the individual products (one row of the first matrix multiplied by one column of the 2nd matrix).

Before doing any of that work, I would use the compute-sanitizer debugging described in unit 12 of this online training series. Make sure there are no runtime-detectable errors.

Robert_Crovella · August 13, 2023, 8:03pm

A few things I noticed:

You’re not initializing the C matrix on the device to zero. cudaMalloc doesn’t do that for you. It is necessary because the only thing you ever do to the C matrix is add to it:

This isn’t the source of the problem, however.

This isn’t the correct way to calculate the tile movement, but it is also not the source of the problem because 4/2 gives the correct answer (2). However your calculation would not work correctly if your tile dimension were 2 but the number of rows was 8, for example.

There is a fully worked shared memory matrix multiplication in the programming guide. If you simply compare your code carefully to that one, you may be able to spot errors.

Robert_Crovella · August 13, 2023, 9:23pm

system · August 27, 2023, 9:23pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Subtle problem with matrix-matrix multiplication CUDA Programming and Performance	10	1120	December 12, 2020
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9993	January 18, 2012
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1734	October 5, 2018
Desperate help needed! Sum rows of matrix CUDA Programming and Performance	16	40573	April 24, 2011
Warp shuffle instruction not working as expected CUDA Programming and Performance	7	765	September 6, 2023
How can I store intermediate results in a shared memory? CUDA Programming and Performance	3	443	August 11, 2023
Matrix Multiplication Garbage value :( CUDA Programming and Performance	10	3399	July 25, 2009
matrix multiply reduction CUDA Programming and Performance	41	35525	January 15, 2011
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13323	February 17, 2010
Matrix multiplication CUDA Programming and Performance	7	2150	July 2, 2010

Can you help me find the bug?

Related topics