Well I am making some progress but very limited. :|

I tried running a matrix multiplication on 2 512*512 matrices with the following code from matlab (the code was posted here on the forum):

```
#include <stdio.h>
#include <stdlib.h>
#include "mex.h"
#include "cublas.h"
/* Rutine za pretvorbo med single in double natancnostjo */
void convert_double2float( double *input_double, float *output_float,int Ntot){
int i;
for (i = 0; i < Ntot; i++){
output_float[i] = (float) input_double[i];
}
}
void convert_float2double( float *input_float, double *output_double,int Ntot){
int i;
for (i = 0; i < Ntot; i++){
output_double[i] = (double) input_float[i];
}
}
/* vhodna funkcija [C]=cublasMMul(A,B) */
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[]){
int M,K,L,N;
int dims0[2];
float alpha,beta;
double *A,*B,*C;
float *a,*b,*c;
float *ga,*gb,*gc;
cublasStatus retStatus;
if (nrhs != 2) {
mexErrMsgTxt("rabimo 2 vhodna argmenta");
} else if (nlhs != 1) {
mexErrMsgTxt("rabimo 1 izhodni argument");
}
alpha = 1.0;
beta = 0.0;
M = mxGetM(prhs[0]); /* dobimo stevilo vrstic A */
K = mxGetN(prhs[0]); /* dobimo stevilo stolpcev A */
L = mxGetM(prhs[1]); /* dobimo stevilo vrstic B */
N = mxGetN(prhs[1]); /* dobimo stevilo stolpcev B */
/* K in L morata biti iste velikosti */
/* Keiranje matrike */
dims0[0]=M;
dims0[1]=N;
plhs[0] = mxCreateNumericArray(2,dims0,mxDOUBLE_CLASS,mxREAL);
C = mxGetPr(plhs[0]);
/* Matrika 1 */
A = mxGetPr(prhs[0]);
/* Matrika 2 */
B = mxGetPr(prhs[1]);
/* alociranje delovnega polja na gostitelju */
a = (float*) mxMalloc(sizeof(float)*M*K);
b = (float*) mxMalloc(sizeof(float)*N*K);
convert_double2float(A,a,M*K);
convert_double2float(B,b,N*K);
c = (float*) mxMalloc(sizeof(float)*M*N);
/* Zagon Cublas knjiznice */
retStatus = cublasInit();
// testiramo za napake
retStatus = cublasGetError ();
if (retStatus != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS: napaka v cublasInit\n");
} else {
printf("CUBLAS: cublasInit deluje normalno\n");
}
/* Dodeljevanje prostora na GPU enoti in kopiranje podatkov na napravo */
cublasAlloc (M*K, sizeof(float), (void**)&ga);
cublasSetMatrix (M, K, sizeof(float),a, M, (void*)ga, M);
cublasAlloc (N*K, sizeof(float), (void**)&gb);
cublasSetMatrix (K, N, sizeof(float),b, K, (void*)gb, K);
cublasAlloc (N*M, sizeof(float), (void**)&gc);
cublasSetMatrix (M, N, sizeof(float),c, M, (void*)gc, M);
/* Klic cublasSgemm knjiznice */
(void) cublasSgemm ('n','n',M,N,K,alpha,ga,M,gb,L,beta,gc,M);
/* kopiranje rezultata na c */
cublasGetMatrix (M, N, sizeof(float), gc, M, c, M);
/* sprostimo pomnilnik in izklopimo cublas */
cublasFree (ga);
cublasFree (gb);
cublasFree (gc);
cublasShutdown();
convert_float2double(c,C,M*N);
}
```

But results are actually very bad. A CPU gives results six times faster than the GPU (probably because of the data transfer from and to the device.) Also I dont se any usage of parallelism in the above program. Is there any? Is cublasSgemm taking any advantage of multithreading? Any hints on optimizing this code?

And one more question regarding mexFunction( int nlhs, mxArray *plhs,int nrhs, const mxArray *prhs)

nlhs = number of expected mxArrays (Left Hand Side)

plhs = array of pointers to expected outputs

nrhs = number of inputs (Right Hand Side)

prhs = array of pointers to input data. The input data is read-only

Is there any limit on the numer or size of the nlhs, plhs, nrhs and prhs?

And finally did anyone try to implement the standard CUDA C example matrix multiplication to the Matlab environment?