Apologies. (I posted in wrong section before)

Hi, I am very fresh in learning CUDA and I need some help adding matrices. So far i have this as my adding function:

#define N 3

const dim3 threadsPerBlock(N, N);

const dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);

**global** void compute(int a[N][N], int b[N][N], int c[N][N])

{

int i = blockIdx.x * blockDim.x + threadIdx.x;

int j = blockIdx.y * blockDim.y + threadIdx.y;

if (i < N && j < N)

c[i][j] = a[i][j] + b[i][j];

}

It is very similar to the NVIDIA programming guide example.

From there i have:

int main(void)

{

int a[N][N], b[N][N], c[N][N];

int dev_a[N][N], dev_b[N][N], dev_c[N][N];

cudaMalloc( (void**)&dev_a, (N*N) sizeof(int) );*N)

cudaMalloc( (void*)&dev_b, (N

*sizeof(int) );*

cudaMalloc( (void*)&dev_c, (N*N)*sizeof(int) );

cudaMalloc( (void

//THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS

and finish off with this:

cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);

compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b,dev_c);

cudaMemcpy(c,dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);

The addition is not happening. Can anyone see where i went wrong? Thanks.