I have been trying to write a MEX function that calls a CUDA kernel. I ran into several difficulties along the way and overcame them thanks to many posts on this forum, so I thought I would give it another go. I have managed to get the CUDA kernel compiled and linked into the MEX function, but it looks like the MEX function doesn't even call the kernel. I also tried using a .cpp file as the entry point to the kernel, and that doesn't seem to be called either.

```
/*
* This mexFunction is a test stub to call a Cuda kernel function from Matlab
*
* This mexfunction will call C code to add two vectors together.
*/
#include <stdio.h>
#include <math.h>
#include "mex.h"
#include <cuda.h>
#include <cuda_runtime.h>
/*
 * Prototype for VecAddCPP, whose definition is not in this file. The
 * extern "C" guard gives it C linkage when this file is compiled as C++,
 * so the name is not mangled at link time.
 *
 * NOTE(review): parameter meanings below are inferred from the call site in
 * mexFunction, not from the definition - confirm against the implementation.
 *   grid, block   - presumably the launch configuration (blocks, threads per block)
 *   vector1/2     - the two input buffers
 *   resultVector  - the output buffer
 * No element count is passed, so the callee presumably derives the problem
 * size from grid/block - TODO confirm.
 */
#ifdef __cplusplus
extern "C" {
#endif
void VecAddCPP(int grid, int block, float *vector1, float* vector2, float* resultVector);
#ifdef __cplusplus
}
#endif
void mexFunction (int nlhs,
mxArray *plhs[],
int nrhs,
const mxArray *prhs[])
{
float *vector1, *vector2;
float *resultVector;
int row, col;
/* Check for proper number of arguments */
if (nrhs != 2) {
mexErrMsgTxt("Two input arguments required.");
}
if (nlhs != 1) {
mexErrMsgTxt("One output arguments required.");
}
if (mxGetM(prhs[0]) != mxGetM(prhs[1]) &&
mxGetN(prhs[0]) != mxGetN(prhs[1])){
mexErrMsgTxt("Input vectors must be the same size.");
}
/* get the two vectors */
vector1 = (float *)mxGetPr(prhs[0]);
vector2 = (float *)mxGetPr(prhs[1]);
row = (int)mxGetM(prhs[0]);
col = (int)mxGetN(prhs[0]);
for (int i = 0; i < col; i++){
mexPrintf("1:: Vect1: %g, vect2: %g\n",vector1[i],vector2[i]);
}
/* put the input vectors on the GPU */
float *device_vect1, *device_vect2, *device_result;
cudaMemcpy(device_vect1, &vector1, col*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_vect2, &vector2, col*sizeof(float), cudaMemcpyHostToDevice);
mexPrintf("row: %d, col: %d\n",row,col);
/* assign the return vectror */
plhs[0] = mxCreateDoubleMatrix(row, col, mxREAL); /* result vector */
resultVector = (float *)mxGetPr(plhs[0]);
int grid = 1;
int block = col;
mexPrintf("calling vecAdd Cuda\n");
VecAddCPP(grid, block, device_vect1, device_vect2, device_result);
// VecAddKernelEmulation(grid, block, vector1, vector2, resultVector);
mexPrintf("returned from Cuda\n");
/* get the resulting vector off the GPU */
cudaMemcpy(&resultVector, device_result, col*sizeof(float), cudaMemcpyDeviceToHost);
mexPrintf("\nResults from GPU\n");
for (int i = 0; i < col; i++){
mexPrintf("5:: Vect1: %g, vect2: %g, result: %g\n",vector1[i],vector2[i], resultVector[i]);
}
/* free the device memory */
cudaFree(device_vect1);
cudaFree(device_vect2);
cudaFree(device_result);
}
```