Matrix Addition

Hi, I am very new to CUDA and I need some help adding matrices. So far I have this as my adding function:

#define N 3
const dim3 threadsPerBlock(N, N);
const dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);

__global__ void compute(int a[N][N], int b[N][N], int c[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        c[i][j] = a[i][j] + b[i][j];
}

It is very similar to the NVIDIA programming guide example.
From there I have:

int main(void)
{
int a[N][N], b[N][N], c[N][N];
int dev_a[N][N], dev_b[N][N], dev_c[N][N];

cudaMalloc( (void**)&dev_a, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_b, (N*N)*sizeof(int) );
cudaMalloc( (void**)&dev_c, (N*N)*sizeof(int) );

    //THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS

and finish off with this:

cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);

compute<<<numBlocks,threadsPerBlock>>>(dev_a,dev_b,dev_c);

cudaMemcpy(c,dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);

The addition is not happening. Can anyone see where I went wrong? Thanks.

Hi wolfshark,

Pointers to device memory cannot be declared in this style:
int dev_a[N][N], dev_b[N][N], dev_c[N][N];

That declares 2D arrays on the host, not device pointers. You should write them like this:
int *pdev_a, *pdev_b, *pdev_c;
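
Then the allocation, copies, and launch could look like this (a minimal, untested sketch; the casts keep your 2D kernel signature working):

cudaMalloc( (void**)&pdev_a, (N*N)*sizeof(int) );
cudaMalloc( (void**)&pdev_b, (N*N)*sizeof(int) );
cudaMalloc( (void**)&pdev_c, (N*N)*sizeof(int) );

cudaMemcpy(pdev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(pdev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);

// the kernel parameter int a[N][N] is really int (*a)[N],
// so cast the flat device pointers to that type at the launch
compute<<<numBlocks,threadsPerBlock>>>((int (*)[N])pdev_a,
                                       (int (*)[N])pdev_b,
                                       (int (*)[N])pdev_c);

cudaMemcpy(c, pdev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);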

Best regards!

Looks quite good to me; as sjiagc said, the device pointers need to be changed.

This is how I would do it (not tested). I prefer 1-D arrays and calculate the indices myself. And always check the return values of the cuda…() functions; it will save you a lot of debugging time.

#include <cstdlib> // for malloc

#define N 1024

const dim3 threadsPerBlock(16, 16);
const dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y); // think about rounding when N is not a multiple of threadsPerBlock

__global__ void compute(int *a, int *b, int *c)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    //if (i < N && j < N) // not (yet) needed, will always evaluate to true in this example
    c[j*N+i] = a[j*N+i] + b[j*N+i];
}

int main(void)
{
    cudaError_t err;
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    a = (int*)malloc(N*N*sizeof(int)); // malloc because N is 'large'
    b = (int*)malloc(N*N*sizeof(int));
    c = (int*)malloc(N*N*sizeof(int));

    err = cudaMalloc( (void**)&dev_a, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;
    err = cudaMalloc( (void**)&dev_b, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;
    err = cudaMalloc( (void**)&dev_c, (N*N)*sizeof(int) );
    if(err != cudaSuccess) return -1;

    //THEN I FILL THE MATRICES UP WITH RANDOM NUMBERS
    //and finish off with this:

    err = cudaMemcpy(dev_a, a, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
    if(err != cudaSuccess) return -1;
    err = cudaMemcpy(dev_b, b, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
    if(err != cudaSuccess) return -1;

    compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b, dev_c);

    err = cudaMemcpy(c, dev_c, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);
    if(err != cudaSuccess) return -1;

    return 0;
}
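
If you later pick an N that is not a multiple of the block size, round the grid size up and put the bounds check back in; it is also worth checking the kernel launch itself, which the code above does not do. A rough, untested sketch of both changes:

// grid rounded up so it covers all of N in each dimension
const dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                     (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

__global__ void compute(int *a, int *b, int *c)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)           // needed now: the last row/column of blocks overhangs N
        c[j*N+i] = a[j*N+i] + b[j*N+i];
}

// in main(), after the launch:
compute<<<numBlocks,threadsPerBlock>>>(dev_a, dev_b, dev_c);
err = cudaGetLastError();         // catches errors from the launch itself
if(err != cudaSuccess) return -1;
err = cudaDeviceSynchronize();    // catches errors from the kernel execution
if(err != cudaSuccess) return -1;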