Why does it gives incorrect result when ROWS_Y=COLS_X=3 and TILE_ROWS_Y=TILE_COLS_X=1?

The supplied source code gives correct results when

  • ROWS_Y=2, COLS_X=2, TILE_ROWS_Y = 1, and TILE_COLS_X=1
  • ROWS_Y=4, COLS_X=4, TILE_ROWS_Y = 2, and TILE_COLS_X=2

However, it gives incorrect result when

  • ROWS_Y=3, COLS_X=3, TILE_ROWS_Y = 1, and TILE_COLS_X=1.

Why is that?

#include <iostream>
#include <iomanip>

using namespace std;

const int ROWS_Y = 3;//1024;
const int COLS_X = 3;//1024;

const int TILE_ROWS_Y = 1;//32;
const int TILE_COLS_X = 1;//32;

#define GX(tx, lx) ((tx) * (TILE_COLS_X) + (lx))
#define GY(ty, ly) ((ty) * (TILE_ROWS_Y) + (ly))

#define GID2(gx, gy) ((gy) * (COLS_X) + (gx))
#define GID4(tx, ty, lx, ly) ((GY(ty, ly)) * (COLS_X) + (GX(tx, lx)))

#define MOSAIC_ROWS_Y ((ROWS_Y) / (TILE_ROWS_Y))
#define MOSAIC_COLS_X ((COLS_X) / (TILE_COLS_X))

#define LID(lx, ly) ((ly)*(TILE_COLS_X)+(lx))

__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C)
{
  int ty = blockIdx.y;
  int tx = blockIdx.x;

  int ly = threadIdx.y;
  int lx = threadIdx.x;

  for (int tr = 0; tr < MOSAIC_COLS_X; tr++)
  {
    __shared__ int subA[TILE_ROWS_Y*TILE_COLS_X];
    __shared__ int subB[TILE_ROWS_Y*TILE_COLS_X];

    subA[LID(lx,ly)] = A[GID4(tr,ty,lx,ly)];
    subB[LID(lx,ly)] = B[GID4(tx,tr,lx,ly)];    
    __syncthreads();
	
    for (int lr = 0; lr < TILE_ROWS_Y; lr++)
    {
      int gy = GY(ty, ly);
      int gx = GX(tx, lx);
	  
      if (gy < ROWS_Y && gx < COLS_X)
      {
        C[GID2(gx, gy)] += subA[LID(lr,ly)] * subB[LID(lx,lr)];
      }
    }
    __syncthreads();
  }
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void allocateMatrix(int *&a, int rows, int cols) {
  a = new int[rows * cols];
}

void freeMatrix(int *a) {
  delete[] a;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  allocateMatrix(A, ROWS_Y, COLS_X);
  initMatrix(A, ROWS_Y, COLS_X);

  allocateMatrix(B, ROWS_Y, COLS_X);
  initMatrix(B, ROWS_Y, COLS_X);

  allocateMatrix(C, ROWS_Y, COLS_X);
  initMatrixZero(C, ROWS_Y, COLS_X);

  // Allocate device memory
  cudaMalloc((void **)&d_A, ROWS_Y * COLS_X * sizeof(int));
  cudaMalloc((void **)&d_B, ROWS_Y * COLS_X * sizeof(int));
  cudaMalloc((void **)&d_C, ROWS_Y * COLS_X * sizeof(int));

  // Copy input matrices from host to device
  cudaMemcpy(d_A, A, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_C, C, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyHostToDevice);

  // Set grid and block dimensions
  dim3 gridSize(MOSAIC_COLS_X, MOSAIC_ROWS_Y);
  dim3 blockSize(TILE_COLS_X, TILE_ROWS_Y);

  // Launch the kernel
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C);

  // Copy result matrix from device to host
  cudaMemcpy(C, d_C, ROWS_Y * COLS_X * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS_Y, COLS_X);

  // Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

When I run your code I get this result:

Result Matrix:
    30     36     42
    66     81     96
   102    126    150

That is the correct result. I checked it here.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.