I am struggling in handling Matrices using CUDA.

I am trying to go through the examples but still not getting hold of very simple operations. Need your help and guidance to carry out simple operations on Matrices as below:

Operations required to carry out:

- How do I access each element of a Matrix. In simple C, it would have been A[i][j]. What is the equivalent using CUDA? I know it should be of the nature of A[offset], but I am unable to clearly write a kernel for it. So let us assume that matrix(initialized with Random numbers) is of 10x10 and I have to access A[2][3], then what should be the formula for the notation of A[offset]? What should be the formula to access elements in a row or how should I loop inorder to segregate a matrix row-wise? I mean divide 10x10 matrix into 10 arrays of 1x10 ?
- If there are 2 matrices and I have to carry out a simple operation, say C[1][1] = A[2][3]+B[4][5]+13, then how would I go about doing it in CUDA, using threads?

Any sample would be of great help. I am able to write the below code (addition of elements of a matrix), but not able to move ahead in the direction of above. Also I am getting addition of 1st row only in the below program, not for each element of both the matrix. I guess the division of work is wrong. Any comment? Please look into this urgently. - Also, if I am doing addition in the kernel, say for 2 matrices, then should I be printing it, by first transferring to Host and then printing OR should I print it via Kernel, itself. I tried first option, but it is giving sum junk values (same) for each element. I guess it is some memory location. Please suggest.

#include <cuda_runtime.h>

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

//Number of Elements, Grid and Block Config

#define N 10

#define BLOCK_SIZE 1

#define GRID_SIZE 1

//Host & Kernel(Device) Functions

//Initialize Array with some Random Numbers

void initialData(int *ip, const int size)

{

int i;

```
for (i = 0; i < size; i++)
{
ip[i] = (int)(rand() & 5);
}
```

}

//Print Data initialized above

void printData(int *ip, int size)

{

```
for (int i = 0; i < size; i++)
{
printf("%d", ip[i]);
}
return;
```

}

//Print Matrix

void printMatrix(int *C, const int nx, const int ny) {

int *ic = C;

printf("\nMatrix: (%d.%d)\n", nx, ny);

for (int iy = 0; iy<ny; iy++) {

for (int ix = 0; ix<nx; ix++) {

printf("%d\t", ic[ix]);

}

ic += nx;

printf("\n");

}

printf("\n");

}

//Sum of Matrix

**global** void sumMatrix(int *MatA, int *MatB, int *MatC, int size) {

```
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < size) {
MatC[idx] = MatA[idx] + MatB[idx] ;
printf("%d\t", MatC[idx]);
}
/*
for (int z = 0; z < size; z++) {
printf("\n%d", MatC[z]);
}
*/
return;
```

}

//Main Function

int main(int argc, char **argv) {

```
//Data Size
int size = N*N;
int nbytes = size * sizeof(int);
//Allocate memory on Host
int *h_A, *h_B, *h_C;
h_A = (int *)malloc(nbytes);
h_B = (int *)malloc(nbytes);
h_C = (int *)malloc(nbytes);
//Initialize Data on Host side
initialData(h_A, size);
initialData(h_B, size);
//Allocate memory on device
int *d_A, *d_B, *d_MatC;
cudaMalloc((void **)&d_A, nbytes);
cudaMalloc((void **)&d_B, nbytes);
cudaMalloc((void **)&d_MatC, nbytes);
//Transfer Data from Host to Device
cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
//Initialize the Dimensions of Block and Grid
//dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
//dim3 dimGrid(GRID_SIZE, GRID_SIZE);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE,GRID_SIZE);
sumMatrix <<<1, 10>>> (d_A, d_B, d_MatC, size);
cudaDeviceSynchronize;
//Transfer Data from Host to Device
cudaMemcpy(h_C, d_MatC, nbytes, cudaMemcpyHostToDevice);
printMatrix(h_A, N, N);
printMatrix(h_B, N, N);
//printMatrix(d_MatC, N, N);
free(h_A);
free(h_B);
free(h_C);
// reset device
cudaDeviceReset();
return 0;
```

}

Output: