# Tiled Matrix Multiply for Arbitrary Size Matrices Matrix dimensions are not multiples of tile dimens

Hi! I am writing this code for doing matrix multiplication for arbitrary size matrices such that the matrices sizes are not necessarily a multiple of tile dimensions …

Matrices are defined as struct where it has the attributes of height … width and the pointer (elements)

There are no syntax errors too …

Logically my code is correct i guess … don’t know why it doesn’t give correct results when i run in it emulation mode … I have no cuda capable hw … all my runs are in emulation

TILE_WIDTH is defined as 16 in each direction

shared float Mds[TILE_WIDTH][TILE_WIDTH];

shared float Nds[TILE_WIDTH][TILE_WIDTH];

int bx = blockIdx.x; int by = blockIdx.y;

// Identify the row and column of the Pd element to work on

int Row = by * TILE_WIDTH + ty;

int Col = bx * TILE_WIDTH + tx;

int limit;

float Pvalue = 0;

// Loop over the Md and Nd tiles required to compute the Pd element

for (int m = 0; m < (ceil((float)M.width/(float)TILE_WIDTH)); ++m)

{

``````    //suppose M.width is 66 then the 1st four tiles are 16 elements wide .. and the last one is 2 .. here we calculate m which is used in calculation to indicate this

(m == (ceil((float)M.width/(float)TILE_WIDTH)-1)) ? limit = (M.width - TILE_WIDTH*(floor((float)M.width/(float)TILE_WIDTH))): limit = TILE_WIDTH  ;

//calculate all tiles except the bottom and most right ones (this is because all tiles here are guaranteed to be of tile_width*tile_width size)

if ((bx < (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by <(ceil((float)M.height/(float)TILE_WIDTH)-1)))

{

Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

for (int k = 0; k < limit; ++k)

Pvalue += Mds[ty][k] * Nds[k][tx];

}
``````

//calculate the bottom right most corner tile (it is not guaranteed to be tile_width in any direction … can be less)

``````else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && ((M. height % TILE_WIDTH) != 0))

{

if ((tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH)))) && (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH)))))

{

Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

for (int k = 0; k < limit; ++k)

Pvalue += Mds[ty][k] * Nds[k][tx];

}

}

//calculate the right most column except the corner tile .. here the width of the right most column is less than tile_width

else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && (by != (ceil((float)M.height/(float)TILE_WIDTH)-1)))

{

if (tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH))))

{

Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

for (int k = 0; k < limit; ++k)

Pvalue += Mds[ty][k] * Nds[k][tx];

}

}

//calculate the bottom line tiles except the right most corner tile ,, this is the case where the height of the tile is less than tile_width

else if ((by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((M. height % TILE_WIDTH) != 0) && (bx != (ceil((float)N.width/(float)TILE_WIDTH)-1)))

{

if (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH))))

{

Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

for (int k = 0; k < limit; ++k)

Pvalue += Mds[ty][k] * Nds[k][tx];

}

}

``````

}

P.elements[Row*P.width+Col] = Pvalue;

I’ve not tried your code. But it seems a little too big and complex for a simple task.

Here is mine:

void MatrixMulOnDevice(const Matrix M, const Matrix N, Matrix P)

{

.
.
.

``````// Setup the execution configuration

dim3 dimBlock(16,16);

dim3 dimGrid((N.width/dimBlock.x)+1, (M.height/dimBlock.y)+1);
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd);
``````

.
.
.
}

global void MatrixMulKernel(Matrix M, Matrix N, Matrix P)

{

``````//Multiply the two matrices

int row = blockIdx.y * blockDim.y + threadIdx.y;

int col = blockIdx.x * blockDim.x + threadIdx.x;

int i;

// The element with index idx and idy of matrix Pd being processed.,

float PdValue = 0.0f;

if (row < P.height && col < P.width)
``````

{

``````for (i = 0; i < M.width; ++i)

{

PdValue += M.elements[row * M.width + i] * N.elements[i * N.width + col];

}

P.elements[row * P.width + col] = PdValue;
``````

}

}

Hope it helps.

Why don’t you just pad them with zeros?
Memory access and copies work better anyway with suitably pitched memory.

N.

C = A * B;

When the matrix dimensions of A and B are not multiples of the tile dimensions, then it can happen that some tiles cover the matrices only partially. The tile elements falling outside the not-fully overlapping tiles should be properly zero-ed. It seems that Vanio’s approach is not doing that. Below, copied and pasted a possible solution

``````__global__ void MatMul(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {

float CValue = 0;

int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;

__shared__ float As[TILE_DIM][TILE_DIM];
__shared__ float Bs[TILE_DIM][TILE_DIM];

for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {