# Better way to program Cuda

hi,

I’m very new to CUDA. I wrote a program that triples the values of a given matrix.
The program works fine after compilation.
I would like to know if there is a better way to do it. Your view will help me a lot

Here is the entire program:

//This is a sample program which triples the values of a given matrix
#include<stdio.h>
#include<cuda.h>

//matrix size
#define ROW 8
#define COLON 8
#define BLOCK_SIZE 4

// Kernel: applies an element-wise power-of-three to matrix `a` (M x N),
// staging tiles through shared memory (based on the SDK matrix-multiply sample).
// NOTE: despite the name, this computes the CUBE of each element (v*v*v),
// which is what the original code did — it does not multiply by 3.
// Expects a 2D launch with BLOCK_SIZE x BLOCK_SIZE threads per block.
__global__ void triple_kernel(float *a, int M, int N)
{
    // Block row, plus the thread coordinates the original code used
    // (tIdx/tIdy were referenced but never defined — a compile error).
    int bIdy = blockIdx.y;
    int tIdx = threadIdx.x;
    int tIdy = threadIdx.y;

    // Method 1: shared-memory tiling.

    // First sub-matrix processed by this block.
    int aBegin = M * BLOCK_SIZE * bIdy;
    // Last sub-matrix processed by this block.
    int aEnd = aBegin + N - 1;
    // Stride from one sub-matrix to the next.
    int step = BLOCK_SIZE;

    // Walk across all of this block's sub-matrices.
    for (int i = aBegin; i <= aEnd; i += step)
    {
        // Tile staged in on-chip shared memory.
        __shared__ float AS[BLOCK_SIZE][BLOCK_SIZE];
        // Load one element per thread from global memory.
        AS[tIdx][tIdy] = a[i + M * tIdy + tIdx];

        // Barrier: the original only had a "synchronize" comment with no call.
        // (Each thread touches only its own element here, so this is
        // conservative, but it matches the sample's pattern.)
        __syncthreads();

        // Cube the element in shared memory.
        AS[tIdx][tIdy] = AS[tIdx][tIdy] * AS[tIdx][tIdy] * AS[tIdx][tIdy];

        // Barrier before the write-back phase.
        __syncthreads();

        // Write the result back to global memory.
        a[i + M * tIdy + tIdx] = AS[tIdx][tIdy];
    }
}

// Host driver: builds an 8x8 matrix, runs triple_kernel on it, prints
// the matrix before and after. Returns 0 on success, 1 on any failure.
int main(void)
{
    // Host buffers: a_h holds the input, b_h receives the device result.
    float *a_h, *b_h;
    // Device buffer.
    float *a_d;

    // Memory allocation (check malloc instead of assuming success).
    size_t size = ROW * COLON * sizeof(float);
    a_h = (float*)malloc(size);
    b_h = (float*)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Device allocation — check the status instead of ignoring it.
    cudaError_t err = cudaMalloc((void**)&a_d, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Initialize the host matrix. Row i has COLON elements, so the linear
    // index is COLON * i + j (the original used ROW * i + j, which only
    // happens to work because ROW == COLON).
    printf("values before tripling\n\n");
    for (int i = 0; i < ROW; i++) {
        for (int j = 0; j < COLON; j++) {
            a_h[COLON * i + j] = (float)(i + j);
            printf("%3.0f ", a_h[COLON * i + j]);
        }
        printf("\n");
    }

    // Copy the input to the device.
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // One BLOCK_SIZE x BLOCK_SIZE block per tile; round the grid up so a
    // partial tile at the edge still gets a block.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    int tmpx = ROW / dimBlock.x + (ROW % dimBlock.x == 0 ? 0 : 1);
    int tmpy = COLON / dimBlock.y + (COLON % dimBlock.y == 0 ? 0 : 1);
    printf("tmpx %d, tmpy %d\n", tmpx, tmpy);
    dim3 dimGrid(tmpx, tmpy);

    // Launch the kernel; kernel launches return no status directly, so
    // surface any launch-configuration error via cudaGetLastError().
    triple_kernel <<< dimGrid, dimBlock >>>(a_d, ROW, COLON);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Retrieve the result (cudaMemcpy blocks until the kernel finishes).
    cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost);
    printf("values after tripling\n\n");
    for (int i = 0; i < ROW; i++) {
        for (int j = 0; j < COLON; j++) {
            printf("%4.0f ", b_h[COLON * i + j]);
        }
        printf("\n");
    }

    // Free device and host memory.
    cudaFree(a_d);
    free(a_h);
    free(b_h);
    return 0;
}

Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do is simply raise each cell of the matrix to the power of 3? If so, why use shared memory at all?

``````// Kernel: raises each element of the szX x szY matrix `mat` to the power 3.
// One thread per element; launch with any 2D grid that covers the matrix
// (the bounds check below handles the rounded-up edge blocks).
__global__ void matPow3(float *mat, unsigned int szX, unsigned int szY)
{
    // Global 2D coordinates of this thread's element.
    unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

    // Check that this thread falls within the matrix's boundaries.
    // (The original compared against SzX/SzY, which don't exist — the
    // parameters are szX/szY; C is case-sensitive, so it wouldn't compile.)
    if (tx < szX && ty < szY)
    {
        // The compiler would likely keep the value in a register even if
        // written as mat[...] = mat[...] * mat[...] * mat[...], but the
        // explicit local copy is clearer and avoids three global reads.
        float val = mat[ty * szX + tx];
        mat[ty * szX + tx] = val * val * val;
    }
}

.

.

.

// Kernel invocation
dim3 threads(16, 16);   // 16x16 = 256 threads per block is usually a good starting point.
// Round the grid up: one extra block along each axis covers any partial tile at the edge.
unsigned int gridX = szX / threads.x + (szX % threads.x > 0 ? 1 : 0);
unsigned int gridY = szY / threads.y + (szY % threads.y > 0 ? 1 : 0);
dim3 blocks(gridX, gridY);
matPow3<<< blocks, threads >>>(mat, szX, szY);
``````

Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do is simply raise each cell of the matrix to the power of 3? If so, why use shared memory at all?

``````// Kernel: raises each element of the szX x szY matrix `mat` to the power 3.
// One thread per element; launch with any 2D grid that covers the matrix
// (the bounds check below handles the rounded-up edge blocks).
__global__ void matPow3(float *mat, unsigned int szX, unsigned int szY)
{
    // Global 2D coordinates of this thread's element.
    unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

    // Check that this thread falls within the matrix's boundaries.
    // (The original compared against SzX/SzY, which don't exist — the
    // parameters are szX/szY; C is case-sensitive, so it wouldn't compile.)
    if (tx < szX && ty < szY)
    {
        // The compiler would likely keep the value in a register even if
        // written as mat[...] = mat[...] * mat[...] * mat[...], but the
        // explicit local copy is clearer and avoids three global reads.
        float val = mat[ty * szX + tx];
        mat[ty * szX + tx] = val * val * val;
    }
}

.

.

.

// Kernel invocation
dim3 threads(16, 16);   // 16x16 = 256 threads per block is usually a good starting point.
// Round the grid up: one extra block along each axis covers any partial tile at the edge.
unsigned int gridX = szX / threads.x + (szX % threads.x > 0 ? 1 : 0);
unsigned int gridY = szY / threads.y + (szY % threads.y > 0 ? 1 : 0);
dim3 blocks(gridX, gridY);
matPow3<<< blocks, threads >>>(mat, szX, szY);
``````