Hey guys,
I’ve been beating my head over this for the last few days and I can’t figure out why mex file doesn’t want to work for large Matlab matrices.
I call my code in MatLab by calling “cudaadd(a,b,c … and how many ever matrices you want)” if one matrix is given then a copy of that matrix is returned. The importance of this statement is because submitting a large matrix returns a copy of that large matrix no problem. So the problem must be when I am adding these large matrices. I get values returned to Matlab that don’t make sense, such as 850 (I am adding two matrices made with 100 * rand(1000), so they only have values between 1-100.) Any help would be appreciated
The code:
#include "cuda.h"
#include "mex.h"
#include <string.h>
#include <unistd.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
/* Kernel to add the array on the GPU */
__global__ void gpuadd(float* result, float* nextMatrix, int size)
{
/* calculate which element we should be adding */
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int idx = x*y + x;
/*make sure x < sizeX && ycoord < sizeY*/
if(idx < size)
result[idx] = result[idx] + nextMatrix[idx];
}
/*Gateway function */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
/* whatever data i need to allocate */
double *result, *nextMatrix; /*hold the results */
float *resultf, *nextMatrixf; /* arrays to hold float conversion of the data */
float *resultGPU, *nextMatrixGPU; /* pointers for data on GPU */
int size, rowLen, colLen; /*hold our matrix size, which should be the same for all */
int floatsize; /* size of float in bytes */
dim3 dimGrid, dimBlock; /* holds the CUDA thread block size and size of grid */
/*get all arrays to add*/
if(nlhs > 1)
mexErrMsgTxt("Need only 1 output.");
else if(nrhs == 0)
mexErrMsgTxt("Please specify what matrices to add");
else
{
/* get the dimension and check that all matrices have the same dimension */
rowLen = mxGetM(prhs[0]);
colLen = mxGetN(prhs[0]);
for(int i = 1; i < nrhs; i++)
if(rowLen != mxGetM(prhs[i]) || colLen != mxGetN(prhs[i]))
mexErrMsgTxt("Matrix dimensions must agree.");
size = rowLen * colLen;
floatsize = sizeof(float) * size;
dimBlock = dim3(10,10);
/*int nBlocks = size/dimBlock.x + (size%dimBlock.x == 0?0:1);*/
int yBlocks = rowLen/dimBlock.y + (rowLen%dimBlock.y == 0?0:1);
int xBlocks = colLen/dimBlock.x + (colLen%dimBlock.x == 0?0:1);
dimGrid = dim3(xBlocks, yBlocks);
}
/* prepare our output matrix */
plhs[0] = mxCreateDoubleMatrix(rowLen, colLen, mxREAL);
/* point to our output matrix's data */
result = mxGetPr(plhs[0]);
memcpy(result, mxGetPr(prhs[0]), sizeof(double)*size); /* copy the first matrix */
for(int i = 1; i < nrhs; i++)
{
if(i == 1) /* only need to do this once */
{
/*allocate data for previous result to add on to*/
cudaMalloc( (void **) &resultGPU, floatsize);
/*allocate data for prhs[i]*/
cudaMalloc( (void **) &nextMatrixGPU, floatsize);
/* convert result to float, do in here in case no addition needs to be
done, so you don't convert if you never enter the for loop */
resultf = (float *) mxMalloc(floatsize);
nextMatrixf = (float *) mxMalloc(floatsize);
for(int k = 0; k < size; k++)
resultf[k] = (float) result[k]; /* float <- double */
}
/* get next matrix */
nextMatrix = mxGetPr(prhs[i]);
/* check to see if prhs[i] is float, else make it float */
if( mxGetClassID(prhs[i]) == mxSINGLE_CLASS )
/* transfer "nextmatrix" data to the GPU*/
cudaMemcpy(nextMatrixGPU, nextMatrix, floatsize, cudaMemcpyHostToDevice);
else /* not single class, make it float */
{
for(int k = 0; k < size; k++)
nextMatrixf[k] = (float) nextMatrix[k];
/* transfer "nextmatrix" data to the GPU */
cudaMemcpy(nextMatrixGPU, nextMatrixf, floatsize, cudaMemcpyHostToDevice);
}
/*transfer the "result" data to the GPU*/
cudaMemcpy(resultGPU, resultf, floatsize, cudaMemcpyHostToDevice);
/*transfers to kernel*/
/* pass pointers to data we copied to device, to the device and whatever other data we need) */
gpuadd<<<dimGrid, dimBlock>>>(resultGPU, nextMatrixGPU, size);
//Check for any CUDA errors
checkCUDAError("kernel invocation");
/*get data from gpu */
cudaMemcpy(resultf, resultGPU, floatsize, cudaMemcpyDeviceToHost);
}
/* convert resultf back to double if matrices were added*/
if(nrhs > 1)
for(int i = 0; i < size; i++)
result[i] = (double) resultf[i];
/* clean up the memories */
if(nrhs > 1)
{
/* device memory */
cudaFree(resultGPU);
cudaFree(nextMatrixGPU);
/* host memory */
mxFree(resultf);
mxFree(nextMatrixf);
}
mexPrintf("Finished successfully?\n");
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
mexErrMsgTxt(cudaGetErrorString(err));
}
}