Hi! I am writing this code for doing matrix multiplication for arbitrary size matrices such that the matrices sizes are not necessarily a multiple of tile dimensions …
Matrices are defined as struct where it has the attributes of height … width and the pointer (elements)
There are no syntax errors too …
Logically my code is correct i guess … don’t know why it doesn’t give correct results when i run in it emulation mode … I have no cuda capable hw … all my runs are in emulation
TILE_WIDTH is defined as 16 in each direction
shared float Mds[TILE_WIDTH][TILE_WIDTH];
shared float Nds[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
// Identify the row and column of the Pd element to work on
int Row = by * TILE_WIDTH + ty;
int Col = bx * TILE_WIDTH + tx;
int limit;
float Pvalue = 0;
// Loop over the Md and Nd tiles required to compute the Pd element
for (int m = 0; m < (ceil((float)M.width/(float)TILE_WIDTH)); ++m)
{
//suppose M.width is 66 then the 1st four tiles are 16 elements wide .. and the last one is 2 .. here we calculate m which is used in calculation to indicate this
(m == (ceil((float)M.width/(float)TILE_WIDTH)-1)) ? limit = (M.width - TILE_WIDTH*(floor((float)M.width/(float)TILE_WIDTH))): limit = TILE_WIDTH ;
//calculate all tiles except the bottom and most right ones (this is because all tiles here are guaranteed to be of tile_width*tile_width size)
if ((bx < (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by <(ceil((float)M.height/(float)TILE_WIDTH)-1)))
{
Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];
Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];
__syncthreads();
for (int k = 0; k < limit; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
}
//calculate the bottom right most corner tile (it is not guaranteed to be tile_width in any direction … can be less)
else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && ((M. height % TILE_WIDTH) != 0))
{
if ((tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH)))) && (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH)))))
{
Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];
Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];
__syncthreads();
for (int k = 0; k < limit; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
}
}
//calculate the right most column except the corner tile .. here the width of the right most column is less than tile_width
else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && (by != (ceil((float)M.height/(float)TILE_WIDTH)-1)))
{
if (tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH))))
{
Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];
Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];
__syncthreads();
for (int k = 0; k < limit; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
}
}
//calculate the bottom line tiles except the right most corner tile ,, this is the case where the height of the tile is less than tile_width
else if ((by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((M. height % TILE_WIDTH) != 0) && (bx != (ceil((float)N.width/(float)TILE_WIDTH)-1)))
{
if (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH))))
{
Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];
Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];
__syncthreads();
for (int k = 0; k < limit; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
}
}
__syncthreads();
}
P.elements[Row*P.width+Col] = Pvalue;