Dear CUDA gurus,

I am a novice regarding CUDA & GPU parallel programming (sorry in advance for my possibly silly questions).

I want to parallelize several of my MEX files that compute different univariate statistics on large 2D arrays for each column in the array separately. Therefore, I wrote a little test program that should calculate the column means of a 2D array. However, it doesn’t work:

#include “mex.h”

#include “cuda.h”

/* ************* kernel ************* */

**global** void means(float* in, float* out, int n, int m)

{

int jdx;

int idx = blockIdx.x*blockDim.x+threadIdx.x;

out[idx] = 0;

if ( idx < n)

{

for (jdx=0; jdx < m; ++jdx)

out[idx] += in[idx*m+jdx];

out[idx] /= m;

}

}

void mexFunction( int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs[])
{
int dims0[2];
float *) mxGetData(prhs[0]);

*in,*) mxGetData(plhs[0]); in = (float

*out; float*sizeof(float); dims0[0]=n; dims0[1]=1; plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL); out = (float*gin,*sizeof(float); int sizeN = n*gout; int pitch; int m = mxGetM(prhs[0]); int n = mxGetN(prhs[0]); int sizeMN = m*n// Alocate space on the GPU

// Input array

cudaMalloc((void**)&gin,sizeMN);

// Output array

cudaMalloc((void**)&gout,sizeN);

// Copy array to GPU

cudaMemcpy(gin,in,sizeMN,cudaMemcpyHostToDevice);

dim3 dimBlock(256);

dim3 dimGrid((m*n)/dimBlock.x);
if ( (n*m) % 256 !=0 ) dimGrid.x+=1;

// Compute kernel

means<<<dimGrid,dimBlock>>>(gin,gout,n,m);

cudaMemcpy(out,gout,sizeN,cudaMemcpyDeviceToHost);

// Free allocated memory

cudaFree(gin);

cudaFree(gout);

}

…obviously, this doesn’t work, but I have no idea how to write the kernel so that the computation is done correctly. Do I have to distribute the N columns of the array to N block having M threads?

Any help would be greatly appreciated!!!

Best,

NikosK