I have been trying to write a mexFunction that calls a CUDA kernel. I ran into some difficulties, which I overcame thanks to many posts on this forum, so I thought I would give it another go. I have managed to get the CUDA kernel compiled and linked into the mexFunction, but it looks like the mexFunction doesn't even call the kernel. I also tried using a .cpp file as the entry point to the kernel, and that doesn't seem to be called either.
Here is the code I'm trying to compile.
VecAddCPP.cu
#include <stdio.h>
#include <cuda.h>
/*
 * Element-wise vector addition kernel.
 *
 * Each thread handles exactly one element, selected solely by threadIdx.x
 * (the block index is not used), and stores vector1[i] + vector2[i] into
 * resultVector[i]. There is no bounds check, so all three buffers must hold
 * at least blockDim.x elements.
 */
__global__ void VecAdd(float *vector1, float* vector2, float* resultVector)
{
    const int tid = threadIdx.x;
    const float lhs = vector1[tid];
    const float rhs = vector2[tid];
    resultVector[tid] = lhs + rhs;
}
/*
 * C-linkage host wrapper that launches the VecAdd kernel.
 *
 * grid, block : launch configuration passed straight to <<<grid, block>>>.
 * v1, v2      : device pointers to the two input vectors.
 * v3          : device pointer receiving the element-wise sum.
 *
 * A kernel launch does not return an error code, so the launch status is
 * queried explicitly and the wrapper waits for the kernel to finish so that
 * execution faults are reported here instead of being silently dropped.
 * The function returns void, so failures are written to stderr;
 * cudaPeekAtLastError (rather than cudaGetLastError) leaves the error state
 * intact so the caller can still retrieve it.
 */
extern "C" void VecAddCPP(int grid, int block, float *v1, float *v2, float *v3)
{
    printf("calling vecadd<grid,block>\n");
    VecAdd<<<grid,block>>>(v1, v2, v3);

    /* Launch-configuration problems (e.g. an invalid block size) show up here. */
    cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "VecAdd launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    /* Faults raised while the kernel runs are only reported once we synchronize. */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "VecAdd execution failed: %s\n", cudaGetErrorString(err));
    }
}
VectAddMexFunction.cpp
/*
* This mexFunction is a test stub to call a Cuda kernel function from Matlab
*
* This mexfunction will call C code to add two vectors together.
*/
#include <stdio.h>
#include <math.h>
#include "mex.h"
#include <cuda.h>
#include <cuda_runtime.h>
/*
 * Prototype of the host-side launcher defined in VecAddCPP.cu.
 * It is declared with C linkage there, so the declaration is wrapped in
 * extern "C" when this file is compiled as C++ to keep the symbol names
 * matching at link time.
 */
#ifdef __cplusplus
extern "C" {
#endif
void VecAddCPP(int grid, int block, float *vector1, float* vector2, float* resultVector);
#ifdef __cplusplus
}
#endif
void mexFunction (int nlhs,
mxArray *plhs[],
int nrhs,
const mxArray *prhs[])
{
float *vector1, *vector2;
float *resultVector;
int row, col;
/* Check for proper number of arguments */
if (nrhs != 2) {
mexErrMsgTxt("Two input arguments required.");
}
if (nlhs != 1) {
mexErrMsgTxt("One output arguments required.");
}
if (mxGetM(prhs[0]) != mxGetM(prhs[1]) &&
mxGetN(prhs[0]) != mxGetN(prhs[1])){
mexErrMsgTxt("Input vectors must be the same size.");
}
/* get the two vectors */
vector1 = (float *)mxGetPr(prhs[0]);
vector2 = (float *)mxGetPr(prhs[1]);
row = (int)mxGetM(prhs[0]);
col = (int)mxGetN(prhs[0]);
for (int i = 0; i < col; i++){
mexPrintf("1:: Vect1: %g, vect2: %g\n",vector1[i],vector2[i]);
}
/* put the input vectors on the GPU */
float *device_vect1, *device_vect2, *device_result;
cudaMemcpy(device_vect1, &vector1, col*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_vect2, &vector2, col*sizeof(float), cudaMemcpyHostToDevice);
mexPrintf("row: %d, col: %d\n",row,col);
/* assign the return vectror */
plhs[0] = mxCreateDoubleMatrix(row, col, mxREAL); /* result vector */
resultVector = (float *)mxGetPr(plhs[0]);
int grid = 1;
int block = col;
mexPrintf("calling vecAdd Cuda\n");
VecAddCPP(grid, block, device_vect1, device_vect2, device_result);
mexPrintf("returned from Cuda\n");
/* get the resulting vector off the GPU */
cudaMemcpy(&resultVector, device_result, col*sizeof(float), cudaMemcpyDeviceToHost);
mexPrintf("\nResults from GPU\n");
for (int i = 0; i < col; i++){
mexPrintf("5:: Vect1: %g, vect2: %g, result: %g\n",vector1[i],vector2[i], resultVector[i]);
}
/* free the device memory */
cudaFree(device_vect1);
cudaFree(device_vect2);
cudaFree(device_result);
}
I'm compiling from the MATLAB command line. I know this isn't ideal, but I haven't had much luck with Visual Studio, and this way I understand exactly what I'm telling the compiler.
compiling the .cu file into an object file
!nvcc -c -arch=sm_20 vecAddCPP.cu
compiling the mex and linking in the cuda object file
mex LINKFLAGS="$LINKFLAGS /NODEFAULTLIB:MSVCRT.lib" -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v4.1/include/" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.1\lib\x64" -lcudart VectAddMexFunction.cpp VecAddCPP.obj
This is the Matlab output
A = [1 2 3];
B = [4 5 6];
C = VectAddMexFunction(A, B)
1:: Vect1: 1, vect2: 4
1:: Vect1: 2, vect2: 5
1:: Vect1: 3, vect2: 6
row: 1, col: 3
calling vecAdd Cuda
returned from Cuda
Results from GPU
5:: Vect1: 1, vect2: 4, result: 0
5:: Vect1: 2, vect2: 5, result: 0
5:: Vect1: 3, vect2: 6, result: 0
C =
0 0 0