Adding Huge Matlab Matrices - Doesn't Work Sometimes

Hey guys,

I’ve been beating my head over this for the last few days and I can’t figure out why mex file doesn’t want to work for large Matlab matrices.

I call my code in MatLab by calling “cudaadd(a,b,c … and how many ever matrices you want)” if one matrix is given then a copy of that matrix is returned. The importance of this statement is because submitting a large matrix returns a copy of that large matrix no problem. So the problem must be when I am adding these large matrices. I get values returned to Matlab that don’t make sense, such as 850 (I am adding two matrices made with 100 * rand(1000), so they only have values between 1-100.) Any help would be appreciated

The code:

``````#include "cuda.h"

#include "mex.h"

#include <string.h>

#include <unistd.h>

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char* msg);

/* Kernel to add the array on the GPU */

__global__ void gpuadd(float* result, float* nextMatrix, int size)

{

/* calculate which element we should be adding */

int x = blockIdx.x*blockDim.x + threadIdx.x;

int y = blockIdx.y*blockDim.y + threadIdx.y;

int idx = x*y + x;

/*make sure x < sizeX && ycoord < sizeY*/

if(idx < size)

result[idx] = result[idx] + nextMatrix[idx];

}

/*Gateway function */

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])

{

/* whatever data i need to allocate */

double *result, *nextMatrix; /*hold the results */

float *resultf, *nextMatrixf; /* arrays to hold float conversion of the data */

float *resultGPU, *nextMatrixGPU; /* pointers for data on GPU */

int size, rowLen, colLen; /*hold our matrix size, which should be the same for all */

int floatsize; /* size of float in bytes */

dim3 dimGrid, dimBlock; /* holds the CUDA thread block size and size of grid */

if(nlhs > 1)

mexErrMsgTxt("Need only 1 output.");

else if(nrhs == 0)

else

{

/* get the dimension and check that all matrices have the same dimension */

rowLen = mxGetM(prhs[0]);

colLen = mxGetN(prhs[0]);

for(int i = 1; i < nrhs; i++)

if(rowLen != mxGetM(prhs[i]) || colLen != mxGetN(prhs[i]))

mexErrMsgTxt("Matrix dimensions must agree.");

size = rowLen * colLen;

floatsize = sizeof(float) * size;

dimBlock = dim3(10,10);

/*int nBlocks = size/dimBlock.x + (size%dimBlock.x == 0?0:1);*/

int yBlocks = rowLen/dimBlock.y + (rowLen%dimBlock.y == 0?0:1);

int xBlocks = colLen/dimBlock.x + (colLen%dimBlock.x == 0?0:1);

dimGrid = dim3(xBlocks, yBlocks);

}

/* prepare our output matrix */

plhs[0] = mxCreateDoubleMatrix(rowLen, colLen, mxREAL);

/* point to our output matrix's data */

result = mxGetPr(plhs[0]);

memcpy(result, mxGetPr(prhs[0]), sizeof(double)*size); /* copy the first matrix */

for(int i = 1; i < nrhs; i++)

{

if(i == 1) /* only need to do this once */

{

/*allocate data for previous result to add on to*/

cudaMalloc( (void **) &resultGPU, floatsize);

/*allocate data for prhs[i]*/

cudaMalloc( (void **) &nextMatrixGPU, floatsize);

/* convert result to float, do in here in case no addition needs to be

done, so you don't convert if you never enter the for loop */

resultf = (float *) mxMalloc(floatsize);

nextMatrixf = (float *) mxMalloc(floatsize);

for(int k = 0; k < size; k++)

resultf[k] = (float) result[k]; /* float <- double */

}

/* get next matrix */

nextMatrix = mxGetPr(prhs[i]);

/* check to see if prhs[i] is float, else make it float */

if( mxGetClassID(prhs[i]) == mxSINGLE_CLASS )

/* transfer "nextmatrix" data to the GPU*/

cudaMemcpy(nextMatrixGPU, nextMatrix, floatsize, cudaMemcpyHostToDevice);

else /* not single class, make it float */

{

for(int k = 0; k < size; k++)

nextMatrixf[k] = (float) nextMatrix[k];

/* transfer "nextmatrix" data to the GPU */

cudaMemcpy(nextMatrixGPU, nextMatrixf, floatsize, cudaMemcpyHostToDevice);

}

/*transfer the "result" data to the GPU*/

cudaMemcpy(resultGPU, resultf, floatsize, cudaMemcpyHostToDevice);

/*transfers to kernel*/

/* pass pointers to data we copied to device, to the device and whatever other data we need) */

//Check for any CUDA errors

checkCUDAError("kernel invocation");

/*get data from gpu */

cudaMemcpy(resultf, resultGPU, floatsize, cudaMemcpyDeviceToHost);

}

/* convert resultf back to double if matrices were added*/

if(nrhs > 1)

for(int i = 0; i < size; i++)

result[i] = (double) resultf[i];

/* clean up the memories */

if(nrhs > 1)

{

/* device memory */

cudaFree(resultGPU);

cudaFree(nextMatrixGPU);

/* host memory */

mxFree(resultf);

mxFree(nextMatrixf);

}

mexPrintf("Finished successfully?\n");

}

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err)

{

mexErrMsgTxt(cudaGetErrorString(err));

}

}
``````

Fixed this a while ago, forgot to update. I made an error in calculating the index, so not all possible indeces were being calculated.