Tiled Matrix Multiply for Arbitrary-Size Matrices (matrix dimensions are not multiples of the tile dimensions)

Hi! I am writing this code to do matrix multiplication for arbitrary-size matrices, i.e. matrices whose sizes are not necessarily a multiple of the tile dimensions …

The matrices are defined as a struct with the attributes height, width, and a pointer to the data (elements).
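
Roughly, the struct looks like this (only the fields named above matter; row-major storage is assumed, since that is how the kernel indexes it):

struct Matrix
{
    int width;        // number of columns
    int height;       // number of rows
    float* elements;  // data in row-major order
};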

There are no syntax errors either …

Logically my code is correct, I guess … I don't know why it doesn't give correct results when I run it in emulation mode … I have no CUDA-capable hardware, so all my runs are in emulation.

TILE_WIDTH is defined as 16 in each direction

__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];

__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;

// Identify the row and column of the Pd element to work on

int Row = by * TILE_WIDTH + ty;

int Col = bx * TILE_WIDTH + tx;

int limit;

float Pvalue = 0;

// Loop over the Md and Nd tiles required to compute the Pd element

for (int m = 0; m < (ceil((float)M.width/(float)TILE_WIDTH)); ++m)

{

    // Suppose M.width is 66: the first four tiles are 16 elements wide and the last one is only 2 wide. Here we use m to compute 'limit', the width of the current tile.

limit = (m == (ceil((float)M.width/(float)TILE_WIDTH) - 1)) ? (M.width - TILE_WIDTH*(floor((float)M.width/(float)TILE_WIDTH))) : TILE_WIDTH;

// Calculate all tiles except the bottom and right-most ones (all tiles handled here are guaranteed to be TILE_WIDTH x TILE_WIDTH).

if ((bx < (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by <(ceil((float)M.height/(float)TILE_WIDTH)-1)))

	

	{

		Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

	  	Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

		__syncthreads();

 		for (int k = 0; k < limit; ++k)

	 	    		Pvalue += Mds[ty][k] * Nds[k][tx];

	}

// Calculate the bottom-right corner tile (it is not guaranteed to be TILE_WIDTH in either direction; it can be less).

else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && (by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && ((M.height % TILE_WIDTH) != 0))

{

	if ((tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH)))) && (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH)))))

	{

		Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

	  	Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

		__syncthreads();

		for (int k = 0; k < limit; ++k)

    			Pvalue += Mds[ty][k] * Nds[k][tx];

	}

		

}

    // Calculate the right-most column of tiles, except the corner tile; here the width of the column is less than TILE_WIDTH.

else if ((bx == (ceil((float)N.width/(float)TILE_WIDTH)-1)) && ((N.width % TILE_WIDTH) != 0) && (by != (ceil((float)M.height/(float)TILE_WIDTH)-1)))

{

	if (tx < (N.width - TILE_WIDTH*(floor((float)N.width/(float)TILE_WIDTH))))

	{

		Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

	  	Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

		__syncthreads();

 		for (int k = 0; k < limit; ++k)

    			Pvalue += Mds[ty][k] * Nds[k][tx];

	}

		

}

    // Calculate the bottom row of tiles, except the right-most corner tile; this is the case where the height of the tile is less than TILE_WIDTH.

else if ((by == (ceil((float)M.height/(float)TILE_WIDTH)-1)) && ((M.height % TILE_WIDTH) != 0) && (bx != (ceil((float)N.width/(float)TILE_WIDTH)-1)))

{

	if (ty < (M.height - TILE_WIDTH*(floor((float)M.height/(float)TILE_WIDTH))))

	{

		Mds[ty][tx] = M.elements[Row*M.width + (m*TILE_WIDTH + tx)];

	  	Nds[ty][tx] = N.elements[Col + (m*TILE_WIDTH + ty)*N.width];

		__syncthreads();

 		for (int k = 0; k < limit; ++k)

    			Pvalue += Mds[ty][k] * Nds[k][tx];

	}

}	



__syncthreads();

}

P.elements[Row*P.width+Col] = Pvalue;

I’ve not tried your code. But it seems a little too big and complex for a simple task.

Here is mine:

void MatrixMulOnDevice(const Matrix M, const Matrix N, Matrix P)

{

.
.
.

// Setup the execution configuration

dim3 dimBlock(16,16);

dim3 dimGrid((N.width/dimBlock.x)+1, (M.height/dimBlock.y)+1);
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd);

.
.
.
}
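
One note on the grid size above: the +1 launches an extra (entirely idle) block per dimension whenever the matrix size is already a multiple of 16. The bounds check in the kernel makes this harmless, but the usual ceiling-division form avoids it; a sketch with the same block size:

dim3 dimGrid((N.width  + dimBlock.x - 1) / dimBlock.x,
             (M.height + dimBlock.y - 1) / dimBlock.y);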

__global__ void MatrixMulKernel(Matrix M, Matrix N, Matrix P)

{

//Multiply the two matrices



// Thread ids.

int row = blockIdx.y * blockDim.y + threadIdx.y; 

int col = blockIdx.x * blockDim.x + threadIdx.x;



int i;

// The value of the Pd element at (row, col) being computed.

float PdValue = 0.0f;


if (row < P.height && col < P.width)

{

	for (i = 0; i < M.width; ++i)

	{

		PdValue += M.elements[row * M.width + i] * N.elements[i * N.width + col];

	}

	P.elements[row * P.width + col] = PdValue;

}

}

Hope it helps.

Why don’t you just pad them with zeros?
Memory access and copies work better anyway with suitably pitched memory.
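
For example, the device copy of M could be allocated with pitched memory, rounded up to a multiple of TILE_WIDTH, and zeroed before copying the real data in (a sketch, not tested; dM, pitch and the padded sizes are my own names):

// Round the dimensions up to the next multiple of TILE_WIDTH.
int paddedW = ((M.width  + TILE_WIDTH - 1) / TILE_WIDTH) * TILE_WIDTH;
int paddedH = ((M.height + TILE_WIDTH - 1) / TILE_WIDTH) * TILE_WIDTH;

float* dM;
size_t pitch;  // row stride in bytes, chosen by the runtime
cudaMallocPitch((void**)&dM, &pitch, paddedW * sizeof(float), paddedH);

// Zero the whole padded region, then copy the actual matrix into its top-left corner.
cudaMemset2D(dM, pitch, 0, paddedW * sizeof(float), paddedH);
cudaMemcpy2D(dM, pitch, M.elements, M.width * sizeof(float),
             M.width * sizeof(float), M.height, cudaMemcpyHostToDevice);

The kernel would then index rows with pitch / sizeof(float) instead of M.width, and every tile is a full 16x16 with zeros in the padding.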

N.

C = A * B;

When the matrix dimensions of A and B are not multiples of the tile dimensions, it can happen that some tiles cover the matrices only partially. The tile elements that fall outside the matrices should be properly zeroed. It seems that Vanio's approach is not doing that. Below is a possible solution, copied and pasted:

__global__ void MatMul(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {

    float CValue = 0;

    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    __shared__ float As[TILE_DIM][TILE_DIM];
    __shared__ float Bs[TILE_DIM][TILE_DIM];

    for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {

        // Load one tile of A into shared memory, zeroing elements that fall outside the matrix.
        if (k*TILE_DIM + threadIdx.x < ACols && Row < ARows)
            As[threadIdx.y][threadIdx.x] = A[Row*ACols + k*TILE_DIM + threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;

        // Load one tile of B into shared memory, zeroing elements that fall outside the matrix.
        if (k*TILE_DIM + threadIdx.y < BRows && Col < BCols)
            Bs[threadIdx.y][threadIdx.x] = B[(k*TILE_DIM + threadIdx.y)*BCols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;

        __syncthreads();

        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];

        __syncthreads();
    }

    if (Row < CRows && Col < CCols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*CCols) + (blockIdx.x*blockDim.x) + threadIdx.x] = CValue;
}
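
A matching launch configuration would look something like this (dA, dB, dC are the device pointers; for C = A*B one needs ACols == BRows, CRows == ARows, CCols == BCols):

dim3 dimBlock(TILE_DIM, TILE_DIM);
dim3 dimGrid((CCols + TILE_DIM - 1) / TILE_DIM,
             (CRows + TILE_DIM - 1) / TILE_DIM);
MatMul<<<dimGrid, dimBlock>>>(dA, dB, dC, ARows, ACols, BRows, BCols, CRows, CCols);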

Just to add to this, cuBLAS's s/d/c/z gemm functions are very well optimized for most cases. Unless you have a weird corner case (outer dimensions small, inner dimensions huge), which your code doesn't appear to cater for anyway, you will have a hard time outperforming gemm, also because they used custom toolkits to build their kernels.
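
For reference, a minimal sgemm call through the cuBLAS v2 API looks roughly like this (dA, dB, dC are device pointers to row-major A (m x k), B (k x n), C (m x n); since cuBLAS is column-major, the operands are swapped so that C^T = B^T * A^T lands in dC with row-major layout):

#include <cublas_v2.h>

cublasHandle_t handle;
cublasCreate(&handle);

const float alpha = 1.0f, beta = 0.0f;
// Computes C = alpha*A*B + beta*C for the row-major matrices described above.
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            n, m, k,
            &alpha,
            dB, n,   // B viewed as an n x k column-major matrix
            dA, k,   // A viewed as a k x m column-major matrix
            &beta,
            dC, n);  // result is n x m column-major, i.e. m x n row-major

cublasDestroy(handle);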