I know it is a bit easy but I a newbie to cuda… I have write the code below, to multiply two matrices by cuda and a serial cuda. In my opinion, both codes should work but cuda only takes power of the first roe and nothing else happens. Can you help me about what might be wrong?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <ctime>
#include <cstdlib>
using namespace std;
#define N 4
#define M 4
__global__ void multiply (int *a, int *b, int *c)
{
int i = threadIdx.x*blockIdx.x;
int j = threadIdx.y*blockIdx.y;
#pragma unroll
for(int k=0; k<N; k++)
c[i*N+j] += a[i*N+j] * b[k*N+j];
__syncthreads();
}
void tester (int *a, int *b, int *c)
{
for(int i=0; i<N; i++)
{
for(int j=0; j<M; j++)
{
for(int k=0;k<N;k++)
{
c[i*N+j] += a[i*N+j] * b[k*N+j];
}
}
}
}
int main()
{
srand(time(0));
int a[N*M];
int b[N*M];
int c[N*M];
int cn[N*M];
for(int i=0; i<M*N; i++)
{
a[i] = b[i] = rand()%3+1;
cn[i] = 0;
c[i] = 0;
}
int *dev_a;
int *dev_b;
int *dev_c;
cudaMalloc(&dev_a, N*M);
cudaMalloc(&dev_b, N*M);
cudaMalloc(&dev_c, N*M);
cudaMemcpy(dev_a, a, N*M, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*M, cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, cn, N*M, cudaMemcpyHostToDevice);
dim3 threads(N, M);
multiply<<<1,threads>>>(dev_a, dev_b, dev_c);
cudaMemcpy(cn, dev_c, N*M, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
tester(a,b,c);
for(int i=0; i<N; i++)
{
for(int j=0;j<M;j++)
{
cout<<a[i*N+j]<<" ";
}
cout<<endl;
}
cout<<endl<<endl<<endl;
for(int i=0; i<N; i++)
{
for(int j=0;j<M;j++)
{
cout<<b[i*N+j]<<" ";
}
cout<<endl;
}
cout<<endl<<endl<<endl;
for(int i=0; i<N; i++)
{
for(int j=0;j<M;j++)
{
cout<<c[i*N+j]<<" ";
}
cout<<endl;
}
cout<<endl<<endl<<endl;
for(int i=0; i<N; i++)
{
for(int j=0;j<M;j++)
{
cout<<cn[i*N+j]<<" ";
}
cout<<endl;
}
return 0;
}