Hi all,

I am following this sample code from a textbook [Hwu & Kirk].

This is the code for tiled matrix multiplication of square matrices whose width is a multiple of TILE_WIDTH.

I have written my doubts as comments below…

// Tiled matrix multiplication kernel: computes P = M * N for square
// Width x Width matrices stored in row-major order (element (r, c) lives at
// index r * Width + c).
//
// Each thread produces one element of P. The threads of a block cooperate by
// staging one TILE_WIDTH x TILE_WIDTH tile of M and one of N in shared memory
// per phase, then accumulating the partial dot product from those tiles.
//
// Parameters:
//   M, N  - input matrices (read only by this kernel)
//   P     - output matrix; each thread writes exactly one element
//   Width - number of rows/columns of M, N and P
//
// There are no bounds checks and the phase count is Width / TILE_WIDTH
// (integer division), so Width must be a multiple of TILE_WIDTH.
// NOTE(review): presumably launched with TILE_WIDTH x TILE_WIDTH thread blocks
// on a (Width / TILE_WIDTH) x (Width / TILE_WIDTH) grid -- the launch site is
// not shown here; confirm against the caller.
__global__ void MatrixMulKernel(float *M, float *N, float *P, int Width)
{
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Identify the row and column of the P element to work on
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0;

    // Loop over the M and N tiles required to compute P element
    for (int ph = 0; ph < Width / TILE_WIDTH; ++ph)
    {
        // Collaborative loading of M and N tiles into shared memory.
        // Mds[ty][tx] receives M's element at row Row, column ph * TILE_WIDTH + tx.
        Mds[ty][tx] = M[Row * Width + ph * TILE_WIDTH + tx];

        // **Are we transposing the columns of matrix N to a row in the tile here ?**
        // Note: no transpose takes place. Nds[ty][tx] receives N's element at
        // row ph * TILE_WIDTH + ty, column Col, so the tile keeps the same
        // row/column orientation as N (ty selects the row, tx the column).
        Nds[ty][tx] = N[(ph * TILE_WIDTH + ty) * Width + Col];

        // Wait until every thread of the block has stored its tile elements
        // before any thread reads the tiles.
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            // **If we have transposed the column to row, then use Nds[tx][k] ?**
            // Note: since the tile is not transposed, Nds[k][tx] holds N's
            // element at row ph * TILE_WIDTH + k, column Col, which is the
            // element that pairs with Mds[ty][k] in the dot product.
            Pvalue += Mds[ty][k] * Nds[k][tx];
        }

        // Wait until every thread has finished reading the tiles before the
        // next phase overwrites them.
        __syncthreads();
    }

    P[Row * Width + Col] = Pvalue;
}

Thanks in advance…