Hi all,
I am following this sample code from a textbook [Hwu & Kirk].
This is the code for tiled matrix multiplication of square matrices whose width is a multiple of TILE_WIDTH.
I have written my doubts as comments below…
// Tiled matrix multiplication kernel: computes P = M * N for square
// Width x Width matrices stored in row-major order (element [r][c] lives at
// index r * Width + c).
//
// Each thread computes one element P[Row][Col]. The threads of a block
// cooperatively stage one TILE_WIDTH x TILE_WIDTH tile of M and one of N into
// shared memory per phase, then every thread accumulates its partial dot
// product from those tiles.
//
// Launch requirements implied by the indexing below:
//   - blockDim must be (TILE_WIDTH, TILE_WIDTH), because threadIdx.x/y are
//     used directly as tile indices;
//   - the grid must cover the output, i.e. at least
//     ceil(Width / TILE_WIDTH) blocks in both x and y.
//
// Width does not have to be a multiple of TILE_WIDTH: out-of-range tile
// entries are filled with 0 (which contributes nothing to the sum) and
// out-of-range threads do not write to P.
//
// Parameters:
//   M, N   input matrices (Width * Width floats each)
//   P      output matrix  (Width * Width floats)
//   Width  number of rows/columns of M, N and P
__global__ void MatrixMulKernel(float *M, float *N, float *P, int Width)
{
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Identify the row and column of the P element to work on
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0;

    // Round up so that a trailing partial tile is still processed when Width
    // is not a multiple of TILE_WIDTH.
    int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;

    // Loop over the M and N tiles required to compute P element
    for (int ph = 0; ph < numPhases; ++ph)
    {
        int mCol = ph * TILE_WIDTH + tx; // column of M loaded by this thread
        int nRow = ph * TILE_WIDTH + ty; // row of N loaded by this thread

        // Collaborative loading of M and N tiles into shared memory.
        // Loads are guarded instead of returning early, because every thread
        // of the block must reach the __syncthreads() calls below.
        Mds[ty][tx] = (Row < Width && mCol < Width) ? M[Row * Width + mCol] : 0.0f;

        // Q (original post): Are we transposing the columns of matrix N to a
        //    row in the tile here?
        // A: No. Nds[ty][tx] receives the N element at row nRow, column Col,
        //    so the tile keeps N's row/column orientation: the tile's first
        //    index is the N row offset, the second is the N column offset.
        Nds[ty][tx] = (nRow < Width && Col < Width) ? N[nRow * Width + Col] : 0.0f;

        // Wait until the whole tile pair has been written before reading it.
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            // Q (original post): If we have transposed the column to row,
            //    then use Nds[tx][k]?
            // A: Nothing was transposed, so Nds[k][tx] is correct: k walks
            //    along row ty of the M tile and down column tx of the N tile.
            Pvalue += Mds[ty][k] * Nds[k][tx];
        }

        // Wait until every thread is done with this tile pair before the next
        // phase overwrites Mds/Nds.
        __syncthreads();
    }

    // Threads that fall outside the matrix (possible only when Width is not a
    // multiple of TILE_WIDTH) must not write.
    if (Row < Width && Col < Width)
    {
        P[Row * Width + Col] = Pvalue;
    }
}
Thanks in advance…