I am struggling in handling Matrices using CUDA.
I am trying to go through the examples but still not getting hold of very simple operations. Need your help and guidance to carry out simple operations on Matrices as below:
Operations required to carry out:
- How do I access each element of a Matrix. In simple C, it would have been A[i][j]. What is the equivalent using CUDA? I know it should be of the nature of A[offset], but I am unable to clearly write a kernel for it. So let us assume that matrix(initialized with Random numbers) is of 10x10 and I have to access A[2][3], then what should be the formula for the notation of A[offset]? What should be the formula to access elements in a row or how should I loop inorder to segregate a matrix row-wise? I mean divide 10x10 matrix into 10 arrays of 1x10 ?
- If there are 2 matrices and I have to carry out a simple operation, say C[1][1] = A[2][3]+B[4][5]+13, then how would I go about doing it in CUDA, using threads?
Any sample would be of great help. I am able to write the below code (addition of elements of a matrix), but not able to move ahead in the direction of above. Also I am getting addition of 1st row only in the below program, not for each element of both the matrix. I guess the division of work is wrong. Any comment? Please look into this urgently. - Also, if I am doing addition in the kernel, say for 2 matrices, then should I be printing it, by first transferring to Host and then printing OR should I print it via Kernel, itself. I tried first option, but it is giving sum junk values (same) for each element. I guess it is some memory location. Please suggest.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//Number of Elements, Grid and Block Config
#define N 10
#define BLOCK_SIZE 1
#define GRID_SIZE 1
//Host & Kernel(Device) Functions
//Initialize Array with some Random Numbers
void initialData(int *ip, const int size)
{
int i;
for (i = 0; i < size; i++)
{
ip[i] = (int)(rand() & 5);
}
}
//Print Data initialized above
void printData(int *ip, int size)
{
for (int i = 0; i < size; i++)
{
printf("%d", ip[i]);
}
return;
}
//Print Matrix
void printMatrix(int *C, const int nx, const int ny) {
int *ic = C;
printf(“\nMatrix: (%d.%d)\n”, nx, ny);
for (int iy = 0; iy<ny; iy++) {
for (int ix = 0; ix<nx; ix++) {
printf(“%d\t”, ic[ix]);
}
ic += nx;
printf(“\n”);
}
printf(“\n”);
}
//Sum of Matrix
global void sumMatrix(int *MatA, int *MatB, int *MatC, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < size) {
MatC[idx] = MatA[idx] + MatB[idx] ;
printf("%d\t", MatC[idx]);
}
/*
for (int z = 0; z < size; z++) {
printf("\n%d", MatC[z]);
}
*/
return;
}
//Main Function
int main(int argc, char **argv) {
//Data Size
int size = N*N;
int nbytes = size * sizeof(int);
//Allocate memory on Host
int *h_A, *h_B, *h_C;
h_A = (int *)malloc(nbytes);
h_B = (int *)malloc(nbytes);
h_C = (int *)malloc(nbytes);
//Initialize Data on Host side
initialData(h_A, size);
initialData(h_B, size);
//Allocate memory on device
int *d_A, *d_B, *d_MatC;
cudaMalloc((void **)&d_A, nbytes);
cudaMalloc((void **)&d_B, nbytes);
cudaMalloc((void **)&d_MatC, nbytes);
//Transfer Data from Host to Device
cudaMemcpy(d_A, h_A, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nbytes, cudaMemcpyHostToDevice);
//Initialize the Dimensions of Block and Grid
//dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
//dim3 dimGrid(GRID_SIZE, GRID_SIZE);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE,GRID_SIZE);
sumMatrix <<<1, 10>>> (d_A, d_B, d_MatC, size);
cudaDeviceSynchronize;
//Transfer Data from Host to Device
cudaMemcpy(h_C, d_MatC, nbytes, cudaMemcpyHostToDevice);
printMatrix(h_A, N, N);
printMatrix(h_B, N, N);
//printMatrix(d_MatC, N, N);
free(h_A);
free(h_B);
free(h_C);
// reset device
cudaDeviceReset();
return 0;
}
Output: