Apologies, I posted this in the wrong section before.
Hi, I am very new to CUDA and I need some help adding matrices. So far I have this as my addition kernel:
// Matrix dimension: all matrices in this example are N x N.
#define N 3
// One thread per matrix element within a block (N x N threads).
const dim3 threadsPerBlock(N, N);
// Number of blocks per grid dimension, rounded UP so the grid still covers
// the whole matrix if the block dimensions do not divide N evenly.
// Threads that land outside the matrix are expected to be masked off by a
// bounds check in the kernel.
const dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                     (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
/**
 * Element-wise addition of two N x N integer matrices: c = a + b.
 *
 * Each thread handles at most one element. The thread's global x coordinate
 * selects the first index (i) and its global y coordinate selects the second
 * index (j).
 *
 * @param a  first input matrix (read only by this kernel)
 * @param b  second input matrix (read only by this kernel)
 * @param c  output matrix; c[i][j] is overwritten with a[i][j] + b[i][j]
 *
 * All three pointers are dereferenced in device code, so they must refer to
 * memory the device can access.
 */
__global__ void compute(int a[N][N], int b[N][N], int c[N][N])
{
    // Global 2D coordinates of this thread within the launch grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the N x N matrix do nothing.
    if (i < N && j < N) {
        c[i][j] = a[i][j] + b[i][j];
    }
}
It is very similar to the NVIDIA programming guide example.
From there I have:
int main(void)
{
int a[N][N], b[N][N], c[N][N];
int dev_a[N][N], dev_b[N][N], dev_c[N][N];
cudaMalloc( (void**)&dev_a, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_b, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_c, (N*N)*sizeof(int) );
//THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS
and finish off with this:
cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b,dev_c);
cudaMemcpy(c,dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);
The addition is not happening. Can anyone see where I went wrong? Thanks.