hi,

I’m very new to cuda. I just make my program to triple value of given matrix.

The program works fine after compilation.

I would like to know if there is a better way to do it. Your view will help me a lot

Here is the entire program:

//This is a sample program which triples the values of a given matrix

#include<stdio.h>

#include<cuda.h>

//matrix size

#define ROW 8

#define COLON 8

#define BLOCK_SIZE 4

**global** void triple_kernel(float *a, int M, int N)

{

//setup block index

int bIdy = blockIdx.y;

//set up threads index

int tIdx = threadIdx.x;

int tIdy = threadIdx.y;

```
//triple values for each block
//methode 1: This uses the shared memory based on sample code provide
//in matrix multiplication
//first submatrice to be compute by block
int aBegin = M * BLOCK_SIZE *bIdy;
//last submatrix to be computed
int aEnd = aBegin + N -1;
//step to go the next submatrix
int step = BLOCK_SIZE;
//start computation on all A's submatrix
for(int i = aBegin; i <= aEnd; i+= step)
{
//declare the submatrix in shared memory
__shared__ float AS[BLOCK_SIZE][BLOCK_SIZE];
//initialize the submatrix of A in shared memory
AS[tIdx][tIdy] = a[i + M * tIdy + tIdx];
//synchronize
__syncthreads();
//start operation in the submatrix
AS[tIdx][tIdy] = AS[tIdx][tIdy] * AS[tIdx][tIdy] * AS[tIdx][tIdy];
//synchronize
__syncthreads();
//update the original matrix
a[i + M * tIdy + tIdx] = AS[tIdx][tIdy];
}
```

}

int main(void)

{

//host matrix declaration

float *a_h, *b_h;

//device matrix declaration

float *a_d;

```
//memory allocation
size_t size = ROW * COLON * sizeof(float);
a_h = (float*)malloc(size);
b_h = (float*)malloc(size);
//cuda memory allocation
cudaMalloc((void**)&a_d, size);
//initialize the host matrix
printf("values before tripling\n\n");
for(int i = 0; i < ROW; i++){
for(int j = 0; j < COLON; j++){
a_h[ROW * i + j] = (float)(i + j);
printf("%3.0f ", a_h[ROW*i +j]);
}
printf("\n");
}
//copy to to the device
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
//method one: transfer block per block
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
int tmpx = ROW / dimBlock.x + (ROW % dimBlock.x == 0? 0:1);
int tmpy = COLON / dimBlock.y + (COLON % dimBlock.y == 0? 0:1);
printf("tmpx %d, tmpy %d\n", tmpx,tmpy);
dim3 dimGrid(tmpx, tmpy);
//launch the kernel to compute
triple_kernel <<< dimGrid, dimBlock >>>(a_d, ROW, COLON);
//retrieve the data from the device
cudaMemcpy(b_h,a_d, size, cudaMemcpyDeviceToHost);
printf("values after tripling\n\n");
for(int i = 0; i < ROW; i++){
for(int j = 0; j < COLON; j++){
printf("%4.0f ", b_h[ROW*i +j]);
}
printf("\n");
}
//free memory
cudaFree(a_d);
free(a_h);
free(b_h);
return 0;
```

}