I'm working on a MATLAB CUDA MEX function that initialises two matrices and keeps them in GPU memory on the first run (called from MATLAB as fCUDA(A,C)); on the second run (called from MATLAB as [D]=fCUDA(A,C)) it should perform a
matrix multiplication of those two matrices using cublasSgemm.
Everything works except that the final result of cublasSgemm comes back empty.
I just want to keep one or two matrices in GPU memory in the first run of the function (using static pointers) and in the second run I would like to make a matrix multiplication.
What am I doing wrong? Did I make a mistake in my use of pointers, so that I don't get a result in the output matrix D? Or is the problem that I'm keeping the right pointer values but am no longer in the same memory address space?
Code:
#include "mex.h"
#include "cuda.h"
#include "cublas.h"
#include <stdio.h>
#include <stdlib.h>
/* State that persists between calls to mexFunction (file-scope statics). */

/* Zero until the first call has uploaded the matrices; mexFunction sets it to 1
 * and uses it to choose between the upload path and the multiply path. */
static int initialized = 0;
/* Device buffer allocated with cudaMalloc in mexFunction, sized for the first
 * input matrix (prhs[0]); released by cleanup(). */
static float *kazalecA;
/* Device buffer allocated with cudaMalloc in mexFunction, sized for the second
 * input matrix (prhs[1]); released by cleanup(). */
static float *kazalecF0;
void cleanup(void) {
 cudaFree(kazalecA);
 cudaFree(kazalecF0);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
 float *kazalecAHost;
 float *kazalecF0Host;
 int vrsticaA = mxGetM(prhs[0]);
 int stolpecA = mxGetN(prhs[0]);
 int vrsticaF0 = mxGetM(prhs[1]);
 int stolpecF0 = mxGetN(prhs[1]);
 if (!initialized) {
   cudaMalloc ((void **)&kazalecA, sizeof(float)*vrsticaA*stolpecA);
  Â
   kazalecAHost = (float *)mxGetPr(prhs[0]);  Â
   cudaMemcpy(kazalecA, kazalecAHost, sizeof(float)*vrsticaA*stolpecA, cudaMemcpyHostToDevice);
   cudaMalloc ((void **)&kazalecF0, sizeof(float)*vrsticaF0*stolpecF0);
   kazalecF0Host = (float *)mxGetPr(prhs[0]);
   cudaMemcpy(kazalecF0, kazalecF0Host, sizeof(float)*vrsticaF0*stolpecF0, cudaMemcpyHostToDevice);
  Â
initialized = 1;
   mexAtExit(cleanup);
 } else {
 Â
  int dimenzija[2];
  float alpha,beta;
  double *C;
  float *c;
  float *gc;
 Â
  alpha = 1.0;
  beta = 0.0; Â
Â
  dimenzija[0]=vrsticaA;
  dimenzija[1]=stolpecF0;
 Â
  plhs[0] = mxCreateNumericArray(2,dimenzija,mxDOUBLE_CLASS,mxREAL);
 Â
  C = mxGetPr(plhs[0]);
  c  = (float*) mxMalloc(sizeof(float)*vrsticaA*stolpecF0);
  cublasInit(); //inicalizacija cublas knjiznice
  cublasAlloc (stolpecF0*vrsticaA, sizeof(float), (void**)&gc);
  cublasSetMatrix (vrsticaA, stolpecF0, sizeof(float),c, vrsticaA, (void*)gc, vrsticaA);
  (void) cublasSgemm ('n','n',vrsticaA,stolpecF0,stolpecA,alpha,kazalecA,vrsticaA,kazalecF0,vrsticaF0,beta,gc,vrsticaA);
  cublasGetMatrix (vrsticaA, stolpecF0, sizeof(float),  gc, vrsticaA, c, vrsticaA);
  cublasFree(gc);
  cublasShutdown(); Â
  }
}
Someone wrote me regarding a similar function:
As I only have one mexfunction can someone please clarify if I’m in the same memory address space in the second iteration?