mutrix multiplication

I know it is a bit easy but I a newbie to cuda… I have write the code below, to multiply two matrices by cuda and a serial cuda. In my opinion, both codes should work but cuda only takes power of the first roe and nothing else happens. Can you help me about what might be wrong?

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <iostream>

#include <ctime>

#include <cstdlib>

using namespace std;

#define N 4

#define M 4

__global__ void multiply (int *a, int *b, int *c)

{

	int i = threadIdx.x*blockIdx.x;

	int j = threadIdx.y*blockIdx.y;

#pragma unroll

	for(int k=0; k<N; k++)

		c[i*N+j] += a[i*N+j] * b[k*N+j];

__syncthreads();

}

void tester (int *a, int *b, int *c)

{

	for(int i=0; i<N; i++)

	{

		for(int j=0; j<M; j++)

		{

			for(int k=0;k<N;k++)

			{

				c[i*N+j] += a[i*N+j] * b[k*N+j];

			}

		}

	}

}

int main()

{

	srand(time(0));

	int a[N*M];

	int b[N*M];

	int c[N*M];

	int cn[N*M];

	for(int i=0; i<M*N; i++)

	{

		a[i] = b[i] = rand()%3+1;

		cn[i] = 0;

		c[i] = 0;

	}

	int *dev_a;

	int *dev_b;

	int *dev_c;

	cudaMalloc(&dev_a, N*M);

	cudaMalloc(&dev_b, N*M);

	cudaMalloc(&dev_c, N*M);

	cudaMemcpy(dev_a, a, N*M, cudaMemcpyHostToDevice);

	cudaMemcpy(dev_b, b, N*M, cudaMemcpyHostToDevice);

	cudaMemcpy(dev_c, cn, N*M, cudaMemcpyHostToDevice);

	dim3 threads(N, M);

	multiply<<<1,threads>>>(dev_a, dev_b, dev_c);

	cudaMemcpy(cn, dev_c, N*M, cudaMemcpyDeviceToHost);

	cudaFree(dev_a);

	cudaFree(dev_b);

	cudaFree(dev_c);

	tester(a,b,c);

	for(int i=0; i<N; i++)

	{

		for(int j=0;j<M;j++)

		{

			cout<<a[i*N+j]<<" ";

		}

		cout<<endl;

	}

	cout<<endl<<endl<<endl;

	for(int i=0; i<N; i++)

	{

		for(int j=0;j<M;j++)

		{

			cout<<b[i*N+j]<<" ";

		}

		cout<<endl;

	}

	cout<<endl<<endl<<endl;

	for(int i=0; i<N; i++)

	{

		for(int j=0;j<M;j++)

		{

			cout<<c[i*N+j]<<" ";

		}

		cout<<endl;

	}

	cout<<endl<<endl<<endl;

	for(int i=0; i<N; i++)

	{

		for(int j=0;j<M;j++)

		{

			cout<<cn[i*N+j]<<" ";

		}

		cout<<endl;

	}

	return 0;

}
int i = blockDim.x * blockIdx.x + threadIdx.x;

	int j = blockDim.y * blockIdx.y + threadIdx.y;

made the fix but still same, here is sample output;

3 1 2 2

1 1 2 1

3 1 1 3

2 2 3 1

3 1 2 2

1 1 2 1

3 1 1 3

2 2 3 1

27 5 16 14

9 5 16 7

27 5 8 21

18 10 24 7

9 1 4 4

0 0 0 0

0 0 0 0

0 0 0 0
cudaMalloc(&dev_a, N*M*sizeof(int));

        cudaMalloc(&dev_b, N*M*sizeof(int));

        cudaMalloc(&dev_c, N*M*sizeof(int));

        cudaMemcpy(dev_a, a, N*M*sizeof(int), cudaMemcpyHostToDevice);

        cudaMemcpy(dev_b, b, N*M*sizeof(int), cudaMemcpyHostToDevice);

        cudaMemcpy(dev_c, cn, N*M*sizeof(int), cudaMemcpyHostToDevice);

        dim3 threads(N, M);

        multiply<<<1,threads>>>(dev_a, dev_b, dev_c);

        cudaMemcpy(cn, dev_c, N*M*sizeof(int), cudaMemcpyDeviceToHost);

wow, it was even simpler than I thought :) thanks a lot