Better way to program Cuda

hi,

I’m very new to cuda. I just make my program to triple value of given matrix.
The program works fine after compilation.
I would like to know if there is a better way to do it. Your view will help me a lot

Here is the entire program:



//This is a sample program which triples the values of a given matrix
#include<stdio.h>
#include<cuda.h>

//matrix size
#define ROW 8
#define COLON 8
#define BLOCK_SIZE 4

global void triple_kernel(float *a, int M, int N)
{
//setup block index
int bIdy = blockIdx.y;
//set up threads index
int tIdx = threadIdx.x;
int tIdy = threadIdx.y;

//triple values for each block

//methode 1: This uses the shared memory based on sample code provide 
//in matrix multiplication 

//first submatrice to be compute by block 
int aBegin = M * BLOCK_SIZE *bIdy;
//last submatrix to be computed 
int aEnd = aBegin + N -1;
//step to go the next submatrix 
int step = BLOCK_SIZE;

//start computation on all A's submatrix
for(int i = aBegin; i <= aEnd; i+= step)
{
	//declare the submatrix in shared memory 
	__shared__ float AS[BLOCK_SIZE][BLOCK_SIZE];
	//initialize the submatrix of A in shared memory 
	AS[tIdx][tIdy] = a[i + M * tIdy + tIdx];
	
	//synchronize 
	__syncthreads();
	
	//start operation in the submatrix
	AS[tIdx][tIdy] = AS[tIdx][tIdy] * AS[tIdx][tIdy] * AS[tIdx][tIdy];
	
	//synchronize
	__syncthreads();
	
	//update the original matrix
	a[i + M * tIdy + tIdx] = AS[tIdx][tIdy];
}

}

int main(void)
{
//host matrix declaration
float *a_h, *b_h;
//device matrix declaration
float *a_d;

//memory allocation 
size_t size = ROW * COLON * sizeof(float);
a_h = (float*)malloc(size);
b_h = (float*)malloc(size);

//cuda memory allocation
cudaMalloc((void**)&a_d, size);

//initialize the host matrix
printf("values before tripling\n\n");
for(int i = 0; i < ROW; i++){
	for(int j = 0; j < COLON; j++){
		a_h[ROW * i + j] = (float)(i  + j);
		printf("%3.0f ", a_h[ROW*i +j]);
	}
	printf("\n");
}

//copy to to the device 
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

//method one: transfer block per block 
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
int tmpx = ROW / dimBlock.x + (ROW % dimBlock.x == 0? 0:1);
int tmpy = COLON / dimBlock.y + (COLON % dimBlock.y == 0? 0:1);
printf("tmpx %d, tmpy %d\n", tmpx,tmpy);
dim3 dimGrid(tmpx, tmpy);

//launch the kernel to compute 
triple_kernel <<< dimGrid, dimBlock >>>(a_d, ROW, COLON);

//retrieve the data from the device 
cudaMemcpy(b_h,a_d, size, cudaMemcpyDeviceToHost);
printf("values after tripling\n\n");
for(int i = 0; i < ROW; i++){
	for(int j = 0; j < COLON; j++){
		printf("%4.0f ", b_h[ROW*i +j]);
	}
	printf("\n");
}

//free memory
cudaFree(a_d);
free(a_h);
free(b_h);
return 0;

}



Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do, is simply have each cell in a matrix powered by 3? If so, why use shared memory at all?

__global__ void matPow3(float *mat, unsigned int szX, unsignedint szY)

{

	unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

	// Check that this thread is designated withing matrix's boundaries.

	if (tx < SzX && ty < SzY)

	{

		// Compiler should be able to implicitly use a register if you just do: mat[ty * SzX + tx] = mat[ty * SzX + tx] * mat[ty * SzX + tx] * mat[ty * SzX + tx];

		// but, we'll write it explicitly for brevity.

		float val = mat[ty * SzX + tx];

		mat[ty * SzX + tx] = val * val * val;

	}

}

.

.

.

// Kernel invocation

dim3 blockSz(16, 16);	  // This is usually a good starting point for the block size.

unsigned int gx = szX / blockSz.x + (szX % blockSz.x > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

unsigned int gy = szY / blockSz.y + (szY % blockSz.y > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

dim3 gridSz(gx, gy);

matPow3<<< gridSz, blockSz >>>(mat, szX, szY);

Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do, is simply have each cell in a matrix powered by 3? If so, why use shared memory at all?

__global__ void matPow3(float *mat, unsigned int szX, unsignedint szY)

{

	unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

	// Check that this thread is designated withing matrix's boundaries.

	if (tx < SzX && ty < SzY)

	{

		// Compiler should be able to implicitly use a register if you just do: mat[ty * SzX + tx] = mat[ty * SzX + tx] * mat[ty * SzX + tx] * mat[ty * SzX + tx];

		// but, we'll write it explicitly for brevity.

		float val = mat[ty * SzX + tx];

		mat[ty * SzX + tx] = val * val * val;

	}

}

.

.

.

// Kernel invocation

dim3 blockSz(16, 16);	  // This is usually a good starting point for the block size.

unsigned int gx = szX / blockSz.x + (szX % blockSz.x > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

unsigned int gy = szY / blockSz.y + (szY % blockSz.y > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

dim3 gridSz(gx, gy);

matPow3<<< gridSz, blockSz >>>(mat, szX, szY);

Hi weliad,

Thank you for your answer.
I’m using CUDA on fd 10. This is why i sent it here.
I used the shared memory because i thought i should be
faster to do so than loading directly from the global memory.
But carefully thinking, I Think you are totally right since
there is no reuse of the data.

Thank you very much. I’m on my way and hope to help other newbies
as soon as possible.