I'm working on a MATLAB CUDA MEX function that initialises two matrices and keeps them in GPU memory on the first run (called from MATLAB as fCUDA(A,C)); on the second run (called from MATLAB as [D]=fCUDA(A,C)) it should perform a matrix multiplication of those two matrices using cublasSgemm.

Everything works well, except that the final result of cublasSgemm comes back as an empty matrix.

I just want to keep one or two matrices in GPU memory in the first run of the function (using static pointers) and in the second run I would like to make a matrix multiplication.

What am I doing wrong? Did I make a mistake in the way I use some of the pointers, so that no result ends up in the output matrix D? Or is the problem that I keep the right pointer values but am no longer in the same memory address space?

Code:

```
#include "mex.h"
#include "cuda.h"
#include "cublas.h"
#include <stdio.h>
#include <stdlib.h>
/* Non-zero once the matrices have been uploaded to the GPU by a first call. */
static int initialized = 0;
/* Device pointer to the copy of the first input matrix (A). */
static float *kazalecA = NULL;
/* Device pointer to the copy of the second input matrix (F0). */
static float *kazalecF0 = NULL;
void cleanup(void) {
Â cudaFree(kazalecA);
Â cudaFree(kazalecF0);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
Â float *kazalecAHost;
Â float *kazalecF0Host;
Â int vrsticaA = mxGetM(prhs[0]);
Â int stolpecA = mxGetN(prhs[0]);
Â int vrsticaF0 = mxGetM(prhs[1]);
Â int stolpecF0 = mxGetN(prhs[1]);
Â if (!initialized) {
Â Â Â cudaMalloc ((void **)&kazalecA, sizeof(float)*vrsticaA*stolpecA);
Â Â Â
Â Â Â kazalecAHost = (float *)mxGetPr(prhs[0]); Â Â
Â Â Â cudaMemcpy(kazalecA, kazalecAHost, sizeof(float)*vrsticaA*stolpecA, cudaMemcpyHostToDevice);
Â Â Â cudaMalloc ((void **)&kazalecF0, sizeof(float)*vrsticaF0*stolpecF0);
Â Â Â kazalecF0Host = (float *)mxGetPr(prhs[0]);
Â Â Â cudaMemcpy(kazalecF0, kazalecF0Host, sizeof(float)*vrsticaF0*stolpecF0, cudaMemcpyHostToDevice);
Â Â Â
initialized = 1;
Â Â Â mexAtExit(cleanup);
Â } else {
Â Â
Â Â int dimenzija[2];
Â Â float alpha,beta;
Â Â double *C;
Â Â float *c;
Â Â float *gc;
Â Â
Â Â alpha = 1.0;
Â Â beta = 0.0; Â
Â
Â Â dimenzija[0]=vrsticaA;
Â Â dimenzija[1]=stolpecF0;
Â Â
Â Â plhs[0] = mxCreateNumericArray(2,dimenzija,mxDOUBLE_CLASS,mxREAL);
Â Â
Â Â C = mxGetPr(plhs[0]);
Â Â c Â = (float*) mxMalloc(sizeof(float)*vrsticaA*stolpecF0);
Â Â cublasInit(); //inicalizacija cublas knjiznice
Â Â cublasAlloc (stolpecF0*vrsticaA, sizeof(float), (void**)&gc);
Â Â cublasSetMatrix (vrsticaA, stolpecF0, sizeof(float),c, vrsticaA, (void*)gc, vrsticaA);
Â Â (void) cublasSgemm ('n','n',vrsticaA,stolpecF0,stolpecA,alpha,kazalecA,vrsticaA,kazalecF0,vrsticaF0,beta,gc,vrsticaA);
Â Â cublasGetMatrix (vrsticaA, stolpecF0, sizeof(float), Â gc, vrsticaA, c, vrsticaA);
Â Â cublasFree(gc);
Â Â cublasShutdown(); Â
Â Â }
}
```

Someone wrote me regarding a similar function:

As I only have one mexfunction can someone please clarify if I’m in the same memory address space in the second iteration?