I implemented a very simple plain matrix multiplication with Matlab mex.

But it is about 30x slower than the MATLAB built-in function (A*B, with A and B both being gpuArrays).

It takes about 14 seconds to multiply two 8000x8000 matrices. The MATLAB built-in takes about 0.5 seconds.

Why is there such a large difference?

There could be two reasons:

(1) I haven’t made full use of the GPU using my code

(2) The fancy algorithm used by the MATLAB built-in function is 30x faster (with 8000x8000 matrices) than my naive matrix multiplication. But my 14 seconds still seems far too long given the power of my GPU (Tesla K80).

Does anybody have any idea?

Thank you very much!

Full code (and script to compile) can be downloaded from:

https://www.dropbox.com/s/cq6tiqcyc5539po/mat_mult.tar?dl=0

Main code:

```
// modified from http://www.mathworks.com/help/distcomp/run-mex-functions-containing-cuda-code.html
#include "mex.h"
#include "gpu/mxGPUArray.h"
/*
* Device code
*/
void __global__ MatMult(float const * const A,
float const * const B,
float * const C,
int const N, int const M, int const K)
{
int const ROW = blockDim.x * blockIdx.x + threadIdx.x;
int const COL = blockDim.y * blockIdx.y + threadIdx.y;
if (ROW < N && COL < M) {
float tmpSum = 0.0f;
for (int i = 0; i < K; i++) {
tmpSum += A[i * N + ROW] * B[COL * K + i];
}
C[COL * N + ROW] = tmpSum;
}
}
/*
* Host code
*/
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *B;
mxGPUArray *C;
float const *d_A;
float const *d_B;
float *d_C;
int N,M,K;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock_sqrt = 16;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
/* Throw an error if the input is not a GPU array. */
if ( (nrhs!=2) || !(mxIsGPUArray(prhs[0])) || !(mxIsGPUArray(prhs[1])) ) {
mexErrMsgIdAndTxt(errId, errMsg);
}
A = mxGPUCreateFromMxArray(prhs[0]);
B = mxGPUCreateFromMxArray(prhs[1]);
/*
* Verify that A really is a float array before extracting the pointer.
*/
if (mxGPUGetClassID(A) != mxSINGLE_CLASS) {
mexErrMsgIdAndTxt(errId, errMsg);
}
if (mxGPUGetClassID(B) != mxSINGLE_CLASS) {
mexErrMsgIdAndTxt(errId, errMsg);
}
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
d_A = (float const *)(mxGPUGetDataReadOnly(A));
d_B = (float const *)(mxGPUGetDataReadOnly(B));
mwSize const * const A_size = mxGPUGetDimensions(A);
mwSize const * const B_size = mxGPUGetDimensions(B);
N = A_size[0];
M = B_size[1];
K = B_size[0];
mwSize * C_size = new mwSize[2];
C_size[0] = N;
C_size[1] = M;
/* Create a GPUArray to hold the result and get its underlying pointer. */
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
C_size,
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_INITIALIZE_VALUES);
//MX_GPU_DO_NOT_INITIALIZE);
d_C = (float *)(mxGPUGetData(C));
//N = (int)(mxGPUGetNumberOfElements(A));
//blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
dim3 blocksPerGrid( (N + threadsPerBlock_sqrt - 1) / threadsPerBlock_sqrt , (M + threadsPerBlock_sqrt - 1) / threadsPerBlock_sqrt , 1);
dim3 threadsPerBlock( threadsPerBlock_sqrt , threadsPerBlock_sqrt , 1);
MatMult<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N, M, K);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(C);
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
}
```