Matrix Addition

Hi, I am very new to CUDA and I need some help adding matrices. So far I have this as my adding function:

#define N 3
const dim3 threadsPerBlock(N, N);
const dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);

__global__ void compute(int a[N][N], int b[N][N], int c[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        c[i][j] = a[i][j] + b[i][j];
}

It is very similar to the NVIDIA programming guide example.
From there I have:

int main(void)
{
int a[N][N], b[N][N], c[N][N];
int dev_a[N][N], dev_b[N][N], dev_c[N][N];

cudaMalloc( (void**)&dev_a, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_b, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_c, (N*N)*sizeof(int) );

    //THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS

and finish off with this:

cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);

compute<<<numBlocks,threadsPerBlock>>>(dev_a,dev_b,dev_c);

cudaMemcpy(c,dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);

The addition is not happening. Can anyone see where I went wrong? Thanks.

Hi wolfshark,

Pointers to device memory cannot be declared in this style:
int dev_a[N][N], dev_b[N][N], dev_c[N][N];

That declares 2D arrays on the host, not device pointers. You should write them like this:
int *pdev_a, *pdev_b, *pdev_c;
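
Then the allocation, copies, and launch could look like this (a minimal, untested sketch; the casts keep your 2D kernel signature working):

cudaMalloc( (void**)&pdev_a, (N*N)*sizeof(int) );
cudaMalloc( (void**)&pdev_b, (N*N)*sizeof(int) );
cudaMalloc( (void**)&pdev_c, (N*N)*sizeof(int) );

cudaMemcpy(pdev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(pdev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);

// the kernel parameter int a[N][N] is really int (*a)[N],
// so cast the flat device pointers to that type at the launch
compute<<<numBlocks,threadsPerBlock>>>((int (*)[N])pdev_a,
                                       (int (*)[N])pdev_b,
                                       (int (*)[N])pdev_c);

cudaMemcpy(c, pdev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);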

Best regards!

Looks quite good to me; as sjiagc said, the device pointers need to be changed.

This is how I would do it (not tested). I prefer 1-D arrays and calculate the indices myself. And always check the return values of the cuda…() functions; it will save you a lot of debugging time.

#include <cstdlib> // for malloc

#define N 1024

const dim3 threadsPerBlock(16, 16);
const dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y); // think about rounding when N is not a multiple of threadsPerBlock

__global__ void compute(int *a, int *b, int *c)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    //if (i < N && j < N) // not (yet) needed, will always evaluate to true in this example
    c[j*N+i] = a[j*N+i] + b[j*N+i];
}

int main(void)
{
    cudaError_t err;
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    a = (int*)malloc(N*N*sizeof(int)); // malloc because N is 'large'
    b = (int*)malloc(N*N*sizeof(int));
    c = (int*)malloc(N*N*sizeof(int));

    err = cudaMalloc( (void**)&dev_a, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;
    err = cudaMalloc( (void**)&dev_b, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;
    err = cudaMalloc( (void**)&dev_c, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;

    //THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS
    //and finish off with this:

    err = cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
    if(err != cudaSuccess) return -1;
    err = cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
    if(err != cudaSuccess) return -1;

    compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b, dev_c);

    err = cudaMemcpy(c, dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);
    if(err != cudaSuccess) return -1;

    return 0;
}
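
If you later pick an N that is not a multiple of the block size, round the grid size up and put the bounds check back in; it is also worth checking the kernel launch itself, which the code above does not do. A rough, untested sketch of both changes:

// grid rounded up so it covers all of N in each dimension
const dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                     (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

__global__ void compute(int *a, int *b, int *c)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)           // needed now: the last row/column of blocks overhangs N
        c[j*N+i] = a[j*N+i] + b[j*N+i];
}

// in main(), after the launch:
compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b, dev_c);
err = cudaGetLastError();         // catches errors from the launch itself
if(err != cudaSuccess) return -1;
err = cudaDeviceSynchronize();    // catches errors from the kernel execution
if(err != cudaSuccess) return -1;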