# mutrix multiplication

I know it is a bit easy but I a newbie to cuda… I have write the code below, to multiply two matrices by cuda and a serial cuda. In my opinion, both codes should work but cuda only takes power of the first roe and nothing else happens. Can you help me about what might be wrong?

``````#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <iostream>

#include <ctime>

#include <cstdlib>

using namespace std;

#define N 4

#define M 4

__global__ void multiply (int *a, int *b, int *c)

{

#pragma unroll

for(int k=0; k<N; k++)

c[i*N+j] += a[i*N+j] * b[k*N+j];

}

void tester (int *a, int *b, int *c)

{

for(int i=0; i<N; i++)

{

for(int j=0; j<M; j++)

{

for(int k=0;k<N;k++)

{

c[i*N+j] += a[i*N+j] * b[k*N+j];

}

}

}

}

int main()

{

srand(time(0));

int a[N*M];

int b[N*M];

int c[N*M];

int cn[N*M];

for(int i=0; i<M*N; i++)

{

a[i] = b[i] = rand()%3+1;

cn[i] = 0;

c[i] = 0;

}

int *dev_a;

int *dev_b;

int *dev_c;

cudaMalloc(&dev_a, N*M);

cudaMalloc(&dev_b, N*M);

cudaMalloc(&dev_c, N*M);

cudaMemcpy(dev_a, a, N*M, cudaMemcpyHostToDevice);

cudaMemcpy(dev_b, b, N*M, cudaMemcpyHostToDevice);

cudaMemcpy(dev_c, cn, N*M, cudaMemcpyHostToDevice);

cudaMemcpy(cn, dev_c, N*M, cudaMemcpyDeviceToHost);

cudaFree(dev_a);

cudaFree(dev_b);

cudaFree(dev_c);

tester(a,b,c);

for(int i=0; i<N; i++)

{

for(int j=0;j<M;j++)

{

cout<<a[i*N+j]<<" ";

}

cout<<endl;

}

cout<<endl<<endl<<endl;

for(int i=0; i<N; i++)

{

for(int j=0;j<M;j++)

{

cout<<b[i*N+j]<<" ";

}

cout<<endl;

}

cout<<endl<<endl<<endl;

for(int i=0; i<N; i++)

{

for(int j=0;j<M;j++)

{

cout<<c[i*N+j]<<" ";

}

cout<<endl;

}

cout<<endl<<endl<<endl;

for(int i=0; i<N; i++)

{

for(int j=0;j<M;j++)

{

cout<<cn[i*N+j]<<" ";

}

cout<<endl;

}

return 0;

}
``````
``````int i = blockDim.x * blockIdx.x + threadIdx.x;

int j = blockDim.y * blockIdx.y + threadIdx.y;
``````

made the fix but still same, here is sample output;

``````3 1 2 2

1 1 2 1

3 1 1 3

2 2 3 1

3 1 2 2

1 1 2 1

3 1 1 3

2 2 3 1

27 5 16 14

9 5 16 7

27 5 8 21

18 10 24 7

9 1 4 4

0 0 0 0

0 0 0 0

0 0 0 0
``````
``````cudaMalloc(&dev_a, N*M*sizeof(int));

cudaMalloc(&dev_b, N*M*sizeof(int));

cudaMalloc(&dev_c, N*M*sizeof(int));

cudaMemcpy(dev_a, a, N*M*sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(dev_b, b, N*M*sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(dev_c, cn, N*M*sizeof(int), cudaMemcpyHostToDevice);