Better way to program Cuda

kelson · June 29, 2010, 5:17am

hi,

I’m very new to cuda. I just make my program to triple value of given matrix.
The program works fine after compilation.
I would like to know if there is a better way to do it. Your view will help me a lot

Here is the entire program:

//This is a sample program which triples the values of a given matrix
#include<stdio.h>
#include<cuda.h>

//matrix size
#define ROW 8
#define COLON 8
#define BLOCK_SIZE 4

global void triple_kernel(float *a, int M, int N)
{
//setup block index
int bIdy = blockIdx.y;
//set up threads index
int tIdx = threadIdx.x;
int tIdy = threadIdx.y;

//triple values for each block

//methode 1: This uses the shared memory based on sample code provide 
//in matrix multiplication 

//first submatrice to be compute by block 
int aBegin = M * BLOCK_SIZE *bIdy;
//last submatrix to be computed 
int aEnd = aBegin + N -1;
//step to go the next submatrix 
int step = BLOCK_SIZE;

//start computation on all A's submatrix
for(int i = aBegin; i <= aEnd; i+= step)
{
	//declare the submatrix in shared memory 
	__shared__ float AS[BLOCK_SIZE][BLOCK_SIZE];
	//initialize the submatrix of A in shared memory 
	AS[tIdx][tIdy] = a[i + M * tIdy + tIdx];
	
	//synchronize 
	__syncthreads();
	
	//start operation in the submatrix
	AS[tIdx][tIdy] = AS[tIdx][tIdy] * AS[tIdx][tIdy] * AS[tIdx][tIdy];
	
	//synchronize
	__syncthreads();
	
	//update the original matrix
	a[i + M * tIdy + tIdx] = AS[tIdx][tIdy];
}

}

int main(void)
{
//host matrix declaration
float *a_h, *b_h;
//device matrix declaration
float *a_d;

//memory allocation 
size_t size = ROW * COLON * sizeof(float);
a_h = (float*)malloc(size);
b_h = (float*)malloc(size);

//cuda memory allocation
cudaMalloc((void**)&a_d, size);

//initialize the host matrix
printf("values before tripling\n\n");
for(int i = 0; i < ROW; i++){
	for(int j = 0; j < COLON; j++){
		a_h[ROW * i + j] = (float)(i  + j);
		printf("%3.0f ", a_h[ROW*i +j]);
	}
	printf("\n");
}

//copy to to the device 
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

//method one: transfer block per block 
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
int tmpx = ROW / dimBlock.x + (ROW % dimBlock.x == 0? 0:1);
int tmpy = COLON / dimBlock.y + (COLON % dimBlock.y == 0? 0:1);
printf("tmpx %d, tmpy %d\n", tmpx,tmpy);
dim3 dimGrid(tmpx, tmpy);

//launch the kernel to compute 
triple_kernel <<< dimGrid, dimBlock >>>(a_d, ROW, COLON);

//retrieve the data from the device 
cudaMemcpy(b_h,a_d, size, cudaMemcpyDeviceToHost);
printf("values after tripling\n\n");
for(int i = 0; i < ROW; i++){
	for(int j = 0; j < COLON; j++){
		printf("%4.0f ", b_h[ROW*i +j]);
	}
	printf("\n");
}

//free memory
cudaFree(a_d);
free(a_h);
free(b_h);
return 0;

}

weliad · July 1, 2010, 6:22pm

Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do, is simply have each cell in a matrix powered by 3? If so, why use shared memory at all?

__global__ void matPow3(float *mat, unsigned int szX, unsignedint szY)

{

	unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

	// Check that this thread is designated withing matrix's boundaries.

	if (tx < SzX && ty < SzY)

	{

		// Compiler should be able to implicitly use a register if you just do: mat[ty * SzX + tx] = mat[ty * SzX + tx] * mat[ty * SzX + tx] * mat[ty * SzX + tx];

		// but, we'll write it explicitly for brevity.

		float val = mat[ty * SzX + tx];

		mat[ty * SzX + tx] = val * val * val;

	}

}

.

.

.

// Kernel invocation

dim3 blockSz(16, 16);	  // This is usually a good starting point for the block size.

unsigned int gx = szX / blockSz.x + (szX % blockSz.x > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

unsigned int gy = szY / blockSz.y + (szY % blockSz.y > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

dim3 gridSz(gx, gy);

matPow3<<< gridSz, blockSz >>>(mat, szX, szY);

weliad · July 1, 2010, 6:22pm

Hello,

First of all, why is this in the Linux support forum instead of the General CUDA programming forum?

Second, with regard to your query, if I understood right, what you’re trying to do, is simply have each cell in a matrix powered by 3? If so, why use shared memory at all?

__global__ void matPow3(float *mat, unsigned int szX, unsignedint szY)

{

	unsigned int tx = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int ty = blockIdx.y * blockDim.y + threadIdx.y;

	// Check that this thread is designated withing matrix's boundaries.

	if (tx < SzX && ty < SzY)

	{

		// Compiler should be able to implicitly use a register if you just do: mat[ty * SzX + tx] = mat[ty * SzX + tx] * mat[ty * SzX + tx] * mat[ty * SzX + tx];

		// but, we'll write it explicitly for brevity.

		float val = mat[ty * SzX + tx];

		mat[ty * SzX + tx] = val * val * val;

	}

}

.

.

.

// Kernel invocation

dim3 blockSz(16, 16);	  // This is usually a good starting point for the block size.

unsigned int gx = szX / blockSz.x + (szX % blockSz.x > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

unsigned int gy = szY / blockSz.y + (szY % blockSz.y > 0 ? 1 : 0);  // divide the matrix size by the block size, and add 1 if there's a remainder.

dim3 gridSz(gx, gy);

matPow3<<< gridSz, blockSz >>>(mat, szX, szY);

kelson · July 5, 2010, 3:09am

Hi weliad,

Thank you for your answer.
I’m using CUDA on fd 10. This is why i sent it here.
I used the shared memory because i thought i should be
faster to do so than loading directly from the global memory.
But carefully thinking, I Think you are totally right since
there is no reuse of the data.

Thank you very much. I’m on my way and hope to help other newbies
as soon as possible.

Topic		Replies	Views
Example of Matrix Multiplication(from cuda book) points that i dont anderstend ... CUDA Programming and Performance	6	34329	February 3, 2008
Shared memory matrix multiplication not working CUDA Programming and Performance	6	46	October 11, 2024
copy from 1D array to shared memory matrix in cuda CUDA Programming and Performance	7	2052	June 9, 2015
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13323	February 17, 2010
Cuda Latency problems Slow Cuda CUDA Programming and Performance	15	13927	September 5, 2008
Getting an Error Using CudaMalloc3d CUDA Programming and Performance	10	2459	December 10, 2015
Subtle problem with matrix-matrix multiplication CUDA Programming and Performance	10	1120	December 12, 2020
optimization shared memory fail major speed using shared memory in detriment of global memory CUDA Programming and Performance	3	3667	March 31, 2011
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9993	January 18, 2012
[SOLVED] Code his own shared memory with device memory! CUDA Programming and Performance	15	2552	October 7, 2015

Better way to program Cuda

Related topics