I forgot to include my code. Here is MatMul.cu…

#include “cuda_runtime.h”

#include “device_launch_parameters.h”

#include “mex.h”

/* Thread block size*/

#define BLOCK_SIZE 16

/* Matrices are stored in row-major order:*/*

/ M(row, col) = *(M.elements + row * M.stride + col)*/

typedef struct {

int width;

int height;

int stride;

double *elements;

} Matrix;

/* Forward declaration of MatMul function */

void MatMul(const Matrix A, const Matrix B, Matrix C);

/* Forward declaration of the matrix multiplication kernel*/

**global** void MatMulKernel(const Matrix, const Matrix, Matrix);

/* mex interface function */

void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)

{

mwSize mA = mxGetM(prhs[0]);

mwSize mB = mxGetM(prhs[1]);

mwSize nA = mxGetN(prhs[0]);

mwSize nB = mxGetN(prhs[1]);

int Stride = nA/BLOCK_SIZE;

Matrix A, B, C;

```
plhs[0] = mxCreateDoubleMatrix(mA,nB,mxREAL);
A.width = nA; A.height = mA; A.stride = Stride; A.elements = mxGetPr(prhs[0]);
B.width = nB; B.height = mB; B.stride = Stride; B.elements = mxGetPr(prhs[1]);
C.width = nB; C.height = mA; C.stride = Stride; C.elements = mxGetPr(plhs[0]);
/* Throw errors if input does not have correct format */
if(nrhs != 2)
mexErrMsgTxt("Two inputs (A and B) are required.");
else if(nlhs > 1)
mexErrMsgTxt("Too many output arguments.");
if(!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]))
mexErrMsgTxt("Input must be noncomplex double.");
if (nA != mB)
mexErrMsgTxt("Number of rows in A must equal number of columns in B.");
MatMul(A, B, C);
```

}

/* Get a matrix element*/

**device** double GetElement(const Matrix A, int row, int col)

{

return A.elements[row * A.stride + col];

}

/* Set a matrix element*/

**device** void SetElement(Matrix A, int row, int col,

double value)

{

A.elements[row * A.stride + col] = value;

}

/* Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is*/

/* located col sub-matrices to the right and row sub-matrices down*/

/* from the upper-left corner of A*/

**device** Matrix GetSubMatrix(Matrix A, int row, int col)

{

Matrix Asub;

Asub.width = BLOCK_SIZE;

Asub.height = BLOCK_SIZE;

Asub.stride = A.stride;

Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row

+ BLOCK_SIZE * col];

return Asub;

}

/* Matrix multiplication - Host code*/

/* Matrix dimensions are assumed to be multiples of BLOCK_SIZE*/

void MatMul(const Matrix A, const Matrix B, Matrix C)

{

/* Load A and B to device memory*/

Matrix d_A;

d_A.width = d_A.stride = A.width; d_A.height = A.height;

size_t size = A.width * A.height * sizeof(double);

cudaMalloc(&d_A.elements, size);

cudaMemcpy(d_A.elements, A.elements, size,

cudaMemcpyHostToDevice);

Matrix d_B;

d_B.width = d_B.stride = B.width; d_B.height = B.height;

size = B.width * B.height * sizeof(double);

cudaMalloc(&d_B.elements, size);

cudaMemcpy(d_B.elements, B.elements, size,

cudaMemcpyHostToDevice);

```
/* Allocate C in device memory*/
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(double);
cudaMalloc(&d_C.elements, size);
/* Invoke kernel*/
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
/* Read C from device memory*/
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
/* Free device memory*/
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
```

}

/* Matrix multiplication kernel called by MatMul()*/*

**global** void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

/ Block row and column*/

int blockRow = blockIdx.y;

int blockCol = blockIdx.x;

```
/* Each thread block computes one sub-matrix Csub of C*/
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
/* Each thread computes one element of Csub*/
/* by accumulating results into Cvalue*/
double Cvalue = 0;
/* Thread row and column within Csub*/
int row = threadIdx.y;
int col = threadIdx.x;
/* Loop over all the sub-matrices of A and B that are*/
/* required to compute Csub*/
/* Multiply each pair of sub-matrices together*/
/* and accumulate the results*/
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
/* Get sub-matrix Asub of A*/
Matrix Asub = GetSubMatrix(A, blockRow, m);
/* Get sub-matrix Bsub of B*/
Matrix Bsub = GetSubMatrix(B, m, blockCol);
/* Shared memory used to store Asub and Bsub respectively*/
__shared__ double As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ double Bs[BLOCK_SIZE][BLOCK_SIZE];
/* Load Asub and Bsub from device memory to shared memory*/
/* Each thread loads one element of each sub-matrix*/
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
/* Synchronize to make sure the sub-matrices are loaded*/
/* before starting the computation*/
__syncthreads();
/* Multiply Asub and Bsub together*/
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
/* Synchronize to make sure that the preceding*/
/* computation is done before loading two new*/
/* sub-matrices of A and B in the next iteration*/
__syncthreads();
}
/* Write Csub to device memory*/
/* Each thread writes one element*/
SetElement(Csub, row, col, Cvalue);
```

}