Why are they padding only one matrix?

The above link says that:

// Computes one TILE_DIM x TILE_DIM tile of C = A * A^T.
// Expects blockDim == (TILE_DIM, TILE_DIM); M is the row stride of C.
__global__ void coalescedMultiply(float *a, float *c, int M)
{
  __shared__ float aTile[TILE_DIM][TILE_DIM],
                   transposedTile[TILE_DIM][TILE_DIM];
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float sum = 0.0f;
  // Coalesced load of a row-tile of A (fixed paste artifact: "= >" -> "=").
  aTile[threadIdx.y][threadIdx.x] = a[row*TILE_DIM+threadIdx.x];
  // Coalesced load of a column-tile of A, stored transposed in shared memory;
  // the transposed store pattern is what causes bank conflicts without padding.
  transposedTile[threadIdx.x][threadIdx.y] =
      a[(blockIdx.x*blockDim.x + threadIdx.y)*TILE_DIM +
      threadIdx.x];
  __syncthreads();  // both tiles must be complete before any thread reads them
  for (int i = 0; i < TILE_DIM; i++) {
      sum += aTile[threadIdx.y][i]* transposedTile[i][threadIdx.x];
  }
  c[row*M+col] = sum;
}

(…)
These many-way bank conflicts are very expensive. The simple remedy is to pad the shared memory array so that it has an extra column, as in the following line of code.

__shared__ float transposedTile[TILE_DIM][TILE_DIM+1];

My question is,

why are they padding only right-hand-side (transposedTile) matrix, not both (transposedTile and aTile)?

There is only one 2D matrix in the code snippet shown, named transposedTile. It is being padded in only one dimension, because that is fully sufficient to avoid shared-memory bank conflicts.

I suggest you draw, using a little box for each matrix element, the layout of this 2D matrix in memory, then annotate each box with the shared-memory bank it maps to. It should become apparent what is going on, i.e. why it is important to pick the correct dimension to pad, why padding the other dimension doesn’t provide any benefit, and how the padding eliminates the bank conflicts.

This sort of manual visualization is generally useful in all kinds of matrix processing. I had to use this a lot when I wrote the initial version of CUDA, because I have very poor sense of spatial orientation.

This is the full routine:

// One thread block computes one TILE_DIM x TILE_DIM output tile of C = A * A^T.
// Expects blockDim == (TILE_DIM, TILE_DIM); M is the row stride of C.
__global__ void coalescedMultiply(float *a, float *c, int M)
{
    // One tile holds a row-block of A as-is; the other holds a column-block
    // of A stored transposed, so the dot-product loop below walks both tiles
    // along their rows.
    __shared__ float aTile[TILE_DIM][TILE_DIM];
    __shared__ float transposedTile[TILE_DIM][TILE_DIM];

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Both global reads are coalesced: consecutive threadIdx.x values touch
    // consecutive addresses. The second store writes shared memory transposed.
    aTile[threadIdx.y][threadIdx.x] = a[row * TILE_DIM + threadIdx.x];
    transposedTile[threadIdx.x][threadIdx.y] =
        a[(blockIdx.x * blockDim.x + threadIdx.y) * TILE_DIM + threadIdx.x];

    __syncthreads();  // tiles fully populated before any thread reads them

    float acc = 0.0f;
    for (int k = 0; k < TILE_DIM; ++k) {
        acc += aTile[threadIdx.y][k] * transposedTile[k][threadIdx.x];
    }

    c[row * M + col] = acc;
}

Here we have two matrices in this routine.

One is transposedTile and the other is aTile.


I see that the matrix is padded in only one dimension.

However, my question was about why they pad only one matrix, not both?

It is best to write questions so they are self contained. I almost never follow links provided in questions, and definitely not if I do not recognize the user name. The question as originally asked showed only one array, and I answered based on the information provided.

Sorry for that.

I edited the original post.

Would you like to post an update?

#include <cstdlib>
#include <iomanip>
#include <iostream>

using namespace std;

const int ROWS_Y = 3;//1024;
const int COLS_X = 3;//1024;

const int TILE_ROWS_Y = 1;//32;
const int TILE_COLS_X = 1;//32;

#define GX(tx, lx) ((tx) * (TILE_COLS_X) + (lx))
#define GY(ty, ly) ((ty) * (TILE_ROWS_Y) + (ly))

#define GID2(gx, gy) ((gy) * (COLS_X) + (gx))
#define GID4(tx, ty, lx, ly) ((GY(ty, ly)) * (COLS_X) + (GX(tx, lx)))

#define MOSAIC_ROWS_Y ((ROWS_Y) / (TILE_ROWS_Y))
#define MOSAIC_COLS_X ((COLS_X) / (TILE_COLS_X))

#define LID(lx, ly) ((ly)*(TILE_COLS_X)+(lx))

// Tiled matrix multiply C += A * B, expressed as a sum of outer products.
// Each block computes one TILE_ROWS_Y x TILE_COLS_X tile of C; one thread
// per output element. Assumes gridDim == (MOSAIC_COLS_X, MOSAIC_ROWS_Y) and
// blockDim == (TILE_COLS_X, TILE_ROWS_Y), and that C is zero-initialized by
// the caller (the kernel accumulates with +=).
// NOTE(review): the tr loop runs over MOSAIC_COLS_X tiles of both A's columns
// and B's rows, which presumes the square ROWS_Y == COLS_X setup used here.
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C)
{
  // Tile (block) coordinates within the mosaic of tiles.
  int ty = blockIdx.y;
  int tx = blockIdx.x;

  // Local (thread) coordinates within the tile.
  int ly = threadIdx.y;
  int lx = threadIdx.x;

  // March across the shared dimension one tile at a time.
  for (int tr = 0; tr < MOSAIC_COLS_X; tr++)
  {
    __shared__ int subA[TILE_ROWS_Y*TILE_COLS_X];
    __shared__ int subB[TILE_ROWS_Y*TILE_COLS_X];

    // Stage one tile of A (tile column tr) and one tile of B (tile row tr);
    // each thread loads one element of each.
    subA[LID(lx,ly)] = A[GID4(tr,ty,lx,ly)];
    subB[LID(lx,ly)] = B[GID4(tx,tr,lx,ly)];    
    __syncthreads();  // both tiles staged before any thread reads them
	
    // Accumulate this tile pair's contribution: for output element (gy, gx),
    // sum over lr of subA[ly][lr] * subB[lr][lx].
    for (int lr = 0; lr < TILE_ROWS_Y; lr++)
    {
      int gy = GY(ty, ly);
      int gx = GX(tx, lx);
	  
      // Guard against out-of-range output elements (tail tiles).
      if (gy < ROWS_Y && gx < COLS_X)
      {
        C[GID2(gx, gy)] += subA[LID(lr,ly)] * subB[LID(lx,lr)];
      }
    }
    // Keep fast threads from overwriting tiles still being read. Barrier is
    // outside the guard above, so all threads in the block reach it.
    __syncthreads();
  }
}

// Print an rr x cc row-major matrix to stdout, each entry right-aligned in a
// 6-character field, one row per line, followed by a trailing blank line.
void printMatrix(int *mat, int rr, int cc) {
  for (int r = 0; r < rr; r++) {
    for (int c = 0; c < cc; c++)
      cout << setw(6) << mat[r * cc + c] << " ";
    cout << endl;
  }
  cout << endl;
}

// Heap-allocate backing storage for a rows x cols matrix, returned through
// the reference parameter. Caller releases it with freeMatrix.
void allocateMatrix(int *&a, int rows, int cols) {
  const int count = rows * cols;
  a = new int[count];
}

// Release a matrix obtained from allocateMatrix (array delete to match new[]).
void freeMatrix(int *a) {
  delete[] a;
}

// Fill an rr x cc row-major matrix with the sequence 1, 2, 3, ... — an
// easy-to-eyeball test pattern.
void initMatrix(int *mat, int rr, int cc) {
  const int total = rr * cc;
  for (int idx = 0; idx < total; idx++)
    mat[idx] = idx + 1;
}

// Zero every entry of an rr x cc row-major matrix.
void initMatrixZero(int *mat, int rr, int cc) {
  const int total = rr * cc;
  for (int idx = 0; idx < total; idx++)
    mat[idx] = 0;
}

// Host driver: builds A and B, runs the tiled outer-product multiply on the
// GPU, and prints C. Every CUDA call is now checked — previously all error
// codes were ignored, so a failed allocation, copy, or launch produced a
// silently wrong (or all-zero) result matrix.
int main() {
  int *A, *B, *C;
  int *d_A, *d_B, *d_C;

  // Abort with a diagnostic on any CUDA runtime failure.
  auto check = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      cerr << what << " failed: " << cudaGetErrorString(err) << endl;
      exit(EXIT_FAILURE);
    }
  };

  const size_t bytes = ROWS_Y * COLS_X * sizeof(int);

  allocateMatrix(A, ROWS_Y, COLS_X);
  initMatrix(A, ROWS_Y, COLS_X);

  allocateMatrix(B, ROWS_Y, COLS_X);
  initMatrix(B, ROWS_Y, COLS_X);

  // C must start as all zeros: the kernel accumulates into it with +=.
  allocateMatrix(C, ROWS_Y, COLS_X);
  initMatrixZero(C, ROWS_Y, COLS_X);

  // Allocate device memory
  check(cudaMalloc((void **)&d_A, bytes), "cudaMalloc d_A");
  check(cudaMalloc((void **)&d_B, bytes), "cudaMalloc d_B");
  check(cudaMalloc((void **)&d_C, bytes), "cudaMalloc d_C");

  // Copy input matrices from host to device
  check(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "H2D A");
  check(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "H2D B");
  check(cudaMemcpy(d_C, C, bytes, cudaMemcpyHostToDevice), "H2D C");

  // Set grid and block dimensions: one block per output tile, one thread
  // per output element within the tile.
  dim3 gridSize(MOSAIC_COLS_X, MOSAIC_ROWS_Y);
  dim3 blockSize(TILE_COLS_X, TILE_ROWS_Y);

  // Launch the kernel. Launch-configuration errors surface via
  // cudaGetLastError(); in-kernel faults surface at the synchronize.
  MultiplyAsSumOuterProductOfVectors<<<gridSize, blockSize>>>(d_A, d_B, d_C);
  check(cudaGetLastError(), "kernel launch");
  check(cudaDeviceSynchronize(), "kernel execution");

  // Copy result matrix from device to host
  check(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "D2H C");

  // Print the result matrix
  cout << "Result Matrix:" << endl;
  printMatrix(C, ROWS_Y, COLS_X);

  // Free device memory
  check(cudaFree(d_A), "cudaFree d_A");
  check(cudaFree(d_B), "cudaFree d_B");
  check(cudaFree(d_C), "cudaFree d_C");

  // Free host memory
  freeMatrix(A);
  freeMatrix(B);
  freeMatrix(C);

  return 0;
}

A threadblock of only 1 thread by definition can never create a bank conflict. A bank conflict only manifests amongst threads in the same warp, for the same instruction issue/cycle.