Hi,

I am trying to speed up my Matlab codes using GPU.

For a beginnig I tried a simple vectorAdd function which simply adds two vectors.

The code is as below.

```
#include "cuda.h"
#include "mex.h"
#define BLOCK_SIZE 256
/* Kernel to add elements of two arrays on the GPU */
__global__ void vecAdd(float* A, float* B, float* C)
{
__shared__ float As[BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE];
int tx = threadIdx.x;
int bx = blockIdx.x;
int index = tx + BLOCK_SIZE*bx;
As[tx] = A[index];
Bs[tx] = B[index];
C[index] = As[tx] + Bs[tx];
}
/* Gateway function */
void mexFunction(int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
int M, N;
float *data1, *data2, *result;
float *data1_gpu, *data2_gpu, *result_gpu;
if (nrhs != 2 || nlhs != 1)
mexErrMsgTxt("The number of input and output arguments must 2 & 1 resp.");
if ( mxGetM(prhs[0]) != mxGetM(prhs[1]) || mxGetN(prhs[0]) != mxGetN(prhs[1]))
mexErrMsgTxt("Sizes of input vectors must be the same");
if ( mxGetM(prhs[0]) != 1 && mxGetN(prhs[0]) != 1)
mexErrMsgTxt("Inputs should be vectors (i.e. 1xN or Nx1 matrices)");
if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS || mxGetClassID(prhs[1]) != mxSINGLE_CLASS)
mexErrMsgTxt("Input vectors should be of type mxSINGLE_CLASS (i.e. single)");
/* Find the dimensions of the data */
M = mxGetM(prhs[0]);
N = mxGetN(prhs[0]);
if (M*N % BLOCK_SIZE != 0)
mexErrMsgTxt("Length of the input vectors should be an integer multiple of 256");
// Create an mxArray for the output data
plhs[0] = mxCreateNumericArray(mxGetNumberOfDimensions(prhs[0]), mxGetDimensions(prhs[0]),mxSINGLE_CLASS, mxREAL);
// Create an input and output data array on the GPU
cudaMalloc( (void **) &data1_gpu,sizeof(float)*M*N);
cudaMalloc( (void **) &data2_gpu,sizeof(float)*M*N);
cudaMalloc( (void **) &result_gpu,sizeof(float)*M*N);
/* Retrieve the input data */
data1 = (float*)mxGetPr(prhs[0]);
data2 = (float*)mxGetPr(prhs[1]);
result = (float*)mxGetPr(plhs[0]);
// The input array is single precision, it can be sent directly to the card
cudaMemcpy( data1_gpu, data1, sizeof(float)*M*N,cudaMemcpyHostToDevice);
cudaMemcpy( data2_gpu, data2, sizeof(float)*M*N,cudaMemcpyHostToDevice);
// Compute execution configuration using BLOCK_SIZE threads per block
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid((M*N)/dimBlock.x);
if ( (N*M) % BLOCK_SIZE !=0 )
dimGrid.x+=1;
// Call function on GPU
vecAdd<<<dimGrid,dimBlock>>>(data1_gpu, data2_gpu, result_gpu);
// Copy result back to host
cudaMemcpy( result, result_gpu, sizeof(float)*M*N, cudaMemcpyDeviceToHost);
// Clean-up memory on device
cudaFree(data1_gpu);
cudaFree(data2_gpu);
cudaFree(result_gpu);
}
```

Since they are faster I used shared memory. But when I compare the speed of Matlab and my code (using tic toc and a big vector which is 1024e+4 long) Matlab always beats me.

I would really appreciate your valuable comments on this code, where may I be wrong?

And the main question is: I would like to investigate the assembly code. However -ptx (or -keep) options are not supported by nvmex script

And I hope to find the reason for slowness in assembly code.

PS: I am using Matlab R2008a