// CUDA Matrix Addition - 1D memory, threads and blocks in 1D. Matrix addition in CUDA C using global memory.

#include<limits.h>

#include<stdio.h>

#include<stdlib.h>

#include<cutil_inline.h>

#define BLOCK_SIZE 128

// Element-wise sum of two matrices stored as flat 1D arrays:
//     d_Result[i] = d_Buff1[i] + d_Buff2[i]   for 0 <= i < iMatSizeM * iMatSizeN.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Any grid/block size is valid; each thread walks the data in steps of the
// total number of launched threads, so the launch does not have to supply
// one thread per element.
//
// Parameters:
//   d_Buff1, d_Buff2 - input buffers read by the kernel; each must hold at
//                      least iMatSizeM * iMatSizeN floats.
//   d_Result         - output buffer of the same size. Every element is
//                      read before it is written by the same thread, so it
//                      may alias one of the inputs.
//   iMatSizeM, iMatSizeN - matrix dimensions; the kernel does nothing when
//                      either is not positive.
__global__ static void AddKernel(float *d_Buff1, float *d_Buff2,float *d_Result, int iMatSizeM, int iMatSizeN)
{
	// Reject empty/negative shapes up front so the size_t conversion below
	// can never turn a negative dimension into a huge element count.
	if(iMatSizeM <= 0 || iMatSizeN <= 0)
	{
		return;
	}

	// Widen before multiplying: the product of two ints can exceed INT_MAX.
	const size_t total = (size_t)iMatSizeM * (size_t)iMatSizeN;

	// Total number of threads in the launch; also the step of the loop.
	const size_t stride = (size_t)gridDim.x * blockDim.x;

	for(size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
	{
		d_Result[idx] = d_Buff1[idx] + d_Buff2[idx];
	}
}

// Writes a matrix to stdout, one row per line.
//
// pflMat is read as a flat array in which element (row, col) lives at
// index row * iMatSizeN + col. Each value is printed with "%f" followed by
// a tab, each row is terminated by a newline, and one extra blank line is
// emitted after the last row.
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
	int row = 0;

	while(row < iMatSizeM)
	{
		int col = 0;

		while(col < iMatSizeN)
		{
			// Flat offset of the current element within the buffer.
			const int flatIdx = (row * iMatSizeN) + col;

			printf("%f\t", pflMat[flatIdx]);
			++col;
		}

		printf("\n");
		++row;
	}

	printf("\n");
}

// Returns true when a CUDA runtime call succeeded; otherwise reports the
// failure (with the runtime's error string) on stderr and returns false.
static bool cudaCallOk(cudaError_t err, const char *what)
{
	if(err == cudaSuccess)
	{
		return true;
	}

	fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
	return false;
}

// Adds two host matrices on the GPU: allocates device buffers, uploads both
// inputs, runs AddKernel, downloads the result into h_sum and releases the
// device buffers again. Returns true only if every step succeeded.
// iMatSizeM and iMatSizeN must be positive and their product must have been
// range-checked by the caller.
static bool addMatricesOnDevice(const float *h_a, const float *h_b, float *h_sum, int iMatSizeM, int iMatSizeN)
{
	const size_t nElems = (size_t)iMatSizeM * (size_t)iMatSizeN;
	const size_t nBytes = nElems * sizeof(float);

	float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;

	// && short-circuits, so the first failing step stops the chain.
	// The output buffer is not uploaded: the kernel overwrites all of it.
	bool ok =
		cudaCallOk(cudaMalloc(&d_flMat1, nBytes), "cudaMalloc(d_flMat1)") &&
		cudaCallOk(cudaMalloc(&d_flMat2, nBytes), "cudaMalloc(d_flMat2)") &&
		cudaCallOk(cudaMalloc(&d_flMatSum, nBytes), "cudaMalloc(d_flMatSum)") &&
		cudaCallOk(cudaMemcpy(d_flMat1, h_a, nBytes, cudaMemcpyHostToDevice), "upload of matrix 1") &&
		cudaCallOk(cudaMemcpy(d_flMat2, h_b, nBytes, cudaMemcpyHostToDevice), "upload of matrix 2");

	if(ok)
	{
		// Ceiling division: one thread per element, last block may be partial.
		const unsigned int nBlocks = (unsigned int)((nElems + BLOCK_SIZE - 1) / BLOCK_SIZE);

		dim3 blocks(nBlocks,1,1);
		dim3 threads(BLOCK_SIZE,1,1);

		AddKernel<<<blocks,threads>>>(d_flMat1, d_flMat2, d_flMatSum, iMatSizeM, iMatSizeN);

		// A launch returns no status itself: cudaGetLastError() surfaces bad
		// launch configurations, the synchronize surfaces execution faults.
		ok = cudaCallOk(cudaGetLastError(), "AddKernel launch") &&
		     cudaCallOk(cudaDeviceSynchronize(), "AddKernel execution") &&
		     cudaCallOk(cudaMemcpy(h_sum, d_flMatSum, nBytes, cudaMemcpyDeviceToHost), "download of result");
	}

	// cudaFree accepts a null pointer, so this is safe after a partial setup.
	cudaFree(d_flMat1);
	cudaFree(d_flMat2);
	cudaFree(d_flMatSum);

	return ok;
}

// Reads the matrix dimensions from stdin, fills two M x N matrices with
// rand()-generated values in [0, 1], prints them, adds them on the GPU and
// prints the sum. Returns EXIT_SUCCESS on success and EXIT_FAILURE on
// invalid input, allocation failure or any CUDA error.
int main()
{
	int iMatSizeM=0,iMatSizeN=0;

	printf("Enter size of Matrix(M*N):");

	if(scanf("%d %d",&iMatSizeM,&iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
	{
		fprintf(stderr, "Matrix dimensions must be two positive integers.\n");
		return EXIT_FAILURE;
	}

	// Keep the element count (rounded up to a whole number of blocks) within
	// int range, because the kernel receives the dimensions as ints, and make
	// sure the byte count fits in size_t.
	const size_t maxElems = (size_t)INT_MAX - BLOCK_SIZE;

	if((size_t)iMatSizeM > maxElems / (size_t)iMatSizeN ||
	   (size_t)iMatSizeM * (size_t)iMatSizeN > ((size_t)-1) / sizeof(float))
	{
		fprintf(stderr, "Matrix of %d x %d elements is too large.\n", iMatSizeM, iMatSizeN);
		return EXIT_FAILURE;
	}

	const size_t nElems = (size_t)iMatSizeM * (size_t)iMatSizeN;
	const size_t nBytes = nElems * sizeof(float);

	float *h_flMat1 = (float*)malloc(nBytes);
	float *h_flMat2 = (float*)malloc(nBytes);
	float *h_flMatSum = (float*)malloc(nBytes);

	int iStatus = EXIT_FAILURE;

	if(h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
	{
		fprintf(stderr, "Failed to allocate host memory for the matrices.\n");
	}
	else
	{
		for(size_t j=0;j<nElems;j++)
		{
			h_flMat1[j]=(float)rand()/(float)RAND_MAX;
			h_flMat2[j]=(float)rand()/(float)RAND_MAX;
		}

		printf("Matrix 1\n");
		printMatrix(h_flMat1, iMatSizeM, iMatSizeN);

		printf("Matrix 2\n");
		printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

		if(addMatricesOnDevice(h_flMat1, h_flMat2, h_flMatSum, iMatSizeM, iMatSizeN))
		{
			printf("Matrix Sum\n");
			printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);

			iStatus = EXIT_SUCCESS;
		}
	}

	// free(NULL) is a no-op, so this also covers the failed-allocation path.
	free(h_flMat1);
	free(h_flMat2);
	free(h_flMatSum);

	return iStatus;
}