Adding Huge Matlab Matrices - Doesn't Work Sometimes

Hey guys,

I’ve been beating my head over this for the last few days and I can’t figure out why mex file doesn’t want to work for large Matlab matrices.

I call my code in MatLab by calling “cudaadd(a,b,c … and how many ever matrices you want)” if one matrix is given then a copy of that matrix is returned. The importance of this statement is because submitting a large matrix returns a copy of that large matrix no problem. So the problem must be when I am adding these large matrices. I get values returned to Matlab that don’t make sense, such as 850 (I am adding two matrices made with 100 * rand(1000), so they only have values between 1-100.) Any help would be appreciated

The code:

#include "cuda.h"

#include "mex.h"

#include <string.h>

#include <unistd.h>

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char* msg);

/* Kernel to add the array on the GPU */

__global__ void gpuadd(float* result, float* nextMatrix, int size)


   /* calculate which element we should be adding */

   int x = blockIdx.x*blockDim.x + threadIdx.x;

   int y = blockIdx.y*blockDim.y + threadIdx.y;

   int idx = x*y + x;

   /*make sure x < sizeX && ycoord < sizeY*/

   if(idx < size)

	  result[idx] = result[idx] + nextMatrix[idx];


/*Gateway function */

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])


   /* whatever data i need to allocate */

   double *result, *nextMatrix; /*hold the results */

   float *resultf, *nextMatrixf; /* arrays to hold float conversion of the data */

   float *resultGPU, *nextMatrixGPU; /* pointers for data on GPU */

   int size, rowLen, colLen; /*hold our matrix size, which should be the same for all */

   int floatsize; /* size of float in bytes */

   dim3 dimGrid, dimBlock; /* holds the CUDA thread block size and size of grid */

/*get all arrays to add*/

   if(nlhs > 1)

	  mexErrMsgTxt("Need only 1 output.");

   else if(nrhs == 0)

	  mexErrMsgTxt("Please specify what matrices to add");



	  /* get the dimension and check that all matrices have the same dimension */

	  rowLen = mxGetM(prhs[0]);

	  colLen = mxGetN(prhs[0]);

	  for(int i = 1; i < nrhs; i++)

	 if(rowLen != mxGetM(prhs[i]) || colLen != mxGetN(prhs[i]))

		mexErrMsgTxt("Matrix dimensions must agree.");

	  size = rowLen * colLen;

	  floatsize = sizeof(float) * size;

	  dimBlock = dim3(10,10);

	  /*int nBlocks = size/dimBlock.x + (size%dimBlock.x == 0?0:1);*/

	  int yBlocks = rowLen/dimBlock.y + (rowLen%dimBlock.y == 0?0:1);

	  int xBlocks = colLen/dimBlock.x + (colLen%dimBlock.x == 0?0:1);

	  dimGrid = dim3(xBlocks, yBlocks);


/* prepare our output matrix */

   plhs[0] = mxCreateDoubleMatrix(rowLen, colLen, mxREAL);

   /* point to our output matrix's data */

   result = mxGetPr(plhs[0]);

   memcpy(result, mxGetPr(prhs[0]), sizeof(double)*size); /* copy the first matrix */

for(int i = 1; i < nrhs; i++)


	  if(i == 1) /* only need to do this once */


	 /*allocate data for previous result to add on to*/

	 cudaMalloc( (void **) &resultGPU, floatsize);

	 /*allocate data for prhs[i]*/

	 cudaMalloc( (void **) &nextMatrixGPU, floatsize);

	 /* convert result to float, do in here in case no addition needs to be

		done, so you don't convert if you never enter the for loop */

	 resultf = (float *) mxMalloc(floatsize);

	 nextMatrixf = (float *) mxMalloc(floatsize);

	 for(int k = 0; k < size; k++)

		resultf[k] = (float) result[k]; /* float <- double */


	  /* get next matrix */

	  nextMatrix = mxGetPr(prhs[i]);

	  /* check to see if prhs[i] is float, else make it float */

	  if( mxGetClassID(prhs[i]) == mxSINGLE_CLASS )

	 /* transfer "nextmatrix" data to the GPU*/

	 cudaMemcpy(nextMatrixGPU, nextMatrix, floatsize, cudaMemcpyHostToDevice);

	  else /* not single class, make it float */


	 for(int k = 0; k < size; k++)

		nextMatrixf[k] = (float) nextMatrix[k];

	 /* transfer "nextmatrix" data to the GPU */

	 cudaMemcpy(nextMatrixGPU, nextMatrixf, floatsize, cudaMemcpyHostToDevice);


	 /*transfer the "result" data to the GPU*/

	  cudaMemcpy(resultGPU, resultf, floatsize, cudaMemcpyHostToDevice);

	  /*transfers to kernel*/

	  /* pass pointers to data we copied to device, to the device and whatever other data we need) */

	  gpuadd<<<dimGrid, dimBlock>>>(resultGPU, nextMatrixGPU, size);

	  //Check for any CUDA errors

	  checkCUDAError("kernel invocation");

	  /*get data from gpu */

	  cudaMemcpy(resultf, resultGPU, floatsize, cudaMemcpyDeviceToHost);


/* convert resultf back to double if matrices were added*/

   if(nrhs > 1)

	  for(int i = 0; i < size; i++)

	 result[i] = (double) resultf[i];

/* clean up the memories */

   if(nrhs > 1)


	  /* device memory */



	  /* host memory */




mexPrintf("Finished successfully?\n");


void checkCUDAError(const char *msg)


	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err) 





Fixed this a while ago, forgot to update. I made an error in calculating the index, so not all possible indeces were being calculated.