Dear CUDA gurus,
I am a novice regarding CUDA & GPU parallel programming (sorry in advance for my possibly silly questions).
I want to parallelize several of my MEX files that compute different univariate statistics on large 2D arrays for each column in the array separately. Therefore, I wrote a little test program that should calculate the column means of a 2D array. However, it doesn’t work:
#include “mex.h”
#include “cuda.h”
/* ************* kernel ************* */
global void means(float* in, float* out, int n, int m)
{
int jdx;
int idx = blockIdx.x*blockDim.x+threadIdx.x;
out[idx] = 0;
if ( idx < n)
{
for (jdx=0; jdx < m; ++jdx)
out[idx] += in[idx*m+jdx];
out[idx] /= m;
}
}
void mexFunction( int nlhs, mxArray *plhs, int nrhs, const mxArray prhs[])
{
int dims0[2];
float in, out;
float gin, gout;
int pitch;
int m = mxGetM(prhs[0]);
int n = mxGetN(prhs[0]);
int sizeMN = mnsizeof(float);
int sizeN = nsizeof(float);
dims0[0]=n; dims0[1]=1;
plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
out = (float) mxGetData(plhs[0]);
in = (float) mxGetData(prhs[0]);
// Alocate space on the GPU
// Input array
cudaMalloc((void**)&gin,sizeMN);
// Output array
cudaMalloc((void**)&gout,sizeN);
// Copy array to GPU
cudaMemcpy(gin,in,sizeMN,cudaMemcpyHostToDevice);
dim3 dimBlock(256);
dim3 dimGrid((mn)/dimBlock.x);
if ( (nm) % 256 !=0 ) dimGrid.x+=1;
// Compute kernel
means<<<dimGrid,dimBlock>>>(gin,gout,n,m);
cudaMemcpy(out,gout,sizeN,cudaMemcpyDeviceToHost);
// Free allocated memory
cudaFree(gin);
cudaFree(gout);
}
…obviously, this doesn’t work, but I have no idea how to write the kernel so that the computation is done correctly. Do I have to distribute the N columns of the array to N block having M threads?
Any help would be greatly appreciated!!!
Best,
NikosK