#include<limits.h>
#include<stdio.h>
#include<stdlib.h>
#include<cutil_inline.h>
/* Number of threads per block (x dimension) used when launching AddKernel. */
#define BLOCK_SIZE 128
/*
 * Element-wise sum of two matrices stored as flat arrays:
 *     d_Result[i] = d_Buff1[i] + d_Buff2[i]   for i in [0, iMatSizeM * iMatSizeN)
 *
 * Parameters:
 *   d_Buff1, d_Buff2 - input arrays holding iMatSizeM * iMatSizeN floats each.
 *   d_Result         - output array of the same length.
 *   iMatSizeM/N      - matrix dimensions; only their product is used here.
 *
 * Launch layout: 1D grid of 1D blocks. The grid-stride loop makes any grid
 * size valid; with at least one thread per element each thread handles
 * exactly one element. No shared memory is used.
 */
__global__ static void AddKernel(const float *d_Buff1, const float *d_Buff2, float *d_Result, int iMatSizeM, int iMatSizeN)
{
    // 64-bit arithmetic: both the product of two ints and the flat thread
    // index can exceed the 32-bit range for large matrices.
    const long long total  = (long long)iMatSizeM * (long long)iMatSizeN;
    const long long stride = (long long)gridDim.x * (long long)blockDim.x;

    for (long long idx = (long long)blockIdx.x * (long long)blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        d_Result[idx] = d_Buff1[idx] + d_Buff2[idx];
    }
}
/*
 * Writes a matrix to stdout.
 *
 * pflMat is read as a row-major array of iMatSizeM rows by iMatSizeN columns.
 * Every value is printed with "%f" followed by a tab, every row ends with a
 * newline, and one extra blank line follows the last row.
 */
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
    /* Elements are visited in storage order, so a running pointer suffices. */
    const float *cursor = pflMat;

    for (int rowsLeft = iMatSizeM; rowsLeft > 0; --rowsLeft)
    {
        for (int colsLeft = iMatSizeN; colsLeft > 0; --colsLeft)
        {
            printf("%f\t", *cursor);
            ++cursor;
        }
        printf("\n");
    }

    printf("\n");
}
/* Reports a failed CUDA runtime call on stderr; returns non-zero when `err` is an error. */
static int cudaCallFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Reads the matrix dimensions from stdin, fills two M x N matrices with
 * pseudo-random values in [0, 1], adds them on the GPU with AddKernel and
 * prints both inputs and the sum.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on invalid input, host
 * allocation failure or any CUDA error.
 */
int main()
{
    int iMatSizeM = 0, iMatSizeN = 0;
    printf("Enter size of Matrix(M*N):");
    if (scanf("%d %d", &iMatSizeM, &iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
    {
        fprintf(stderr, "Matrix dimensions must be two positive integers.\n");
        return EXIT_FAILURE;
    }

    /* AddKernel receives the dimensions as int, so keep the element count
       within int range to stay compatible with int-based indexing. */
    if (iMatSizeM > INT_MAX / iMatSizeN)
    {
        fprintf(stderr, "Matrix is too large: M*N must not exceed %d elements.\n", INT_MAX);
        return EXIT_FAILURE;
    }

    /* Do all size arithmetic in size_t so nothing is multiplied in int. */
    const size_t elemCount = (size_t)iMatSizeM * (size_t)iMatSizeN;
    if (elemCount > ((size_t)-1) / sizeof(float))
    {
        fprintf(stderr, "Matrix is too large for this platform.\n");
        return EXIT_FAILURE;
    }
    const size_t byteCount = elemCount * sizeof(float);

    float *h_flMat1 = NULL, *h_flMat2 = NULL, *h_flMatSum = NULL;
    float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;
    int exitCode = EXIT_FAILURE;

    /* Single-pass block: any failure breaks out to the shared cleanup below. */
    do
    {
        h_flMat1 = (float*)malloc(byteCount);
        h_flMat2 = (float*)malloc(byteCount);
        h_flMatSum = (float*)malloc(byteCount);
        if (h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
        {
            fprintf(stderr, "Host allocation of %lu bytes failed.\n", (unsigned long)byteCount);
            break;
        }

        for (size_t j = 0; j < elemCount; j++)
        {
            h_flMat1[j] = (float)rand() / (float)RAND_MAX;
            h_flMat2[j] = (float)rand() / (float)RAND_MAX;
        }

        printf("Matrix 1\n");
        printMatrix(h_flMat1, iMatSizeM, iMatSizeN);
        printf("Matrix 2\n");
        printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

        if (cudaCallFailed(cudaMalloc((void**)&d_flMat1, byteCount), "cudaMalloc(d_flMat1)")) break;
        if (cudaCallFailed(cudaMalloc((void**)&d_flMat2, byteCount), "cudaMalloc(d_flMat2)")) break;
        if (cudaCallFailed(cudaMalloc((void**)&d_flMatSum, byteCount), "cudaMalloc(d_flMatSum)")) break;

        /* Only the inputs are uploaded; the result buffer is fully written
           by the kernel, so there is nothing meaningful to copy into it. */
        if (cudaCallFailed(cudaMemcpy(d_flMat1, h_flMat1, byteCount, cudaMemcpyHostToDevice),
                           "copy of matrix 1 to device")) break;
        if (cudaCallFailed(cudaMemcpy(d_flMat2, h_flMat2, byteCount, cudaMemcpyHostToDevice),
                           "copy of matrix 2 to device")) break;

        /* One thread per element: ceil(elemCount / BLOCK_SIZE) blocks. */
        const dim3 threads(BLOCK_SIZE, 1, 1);
        const dim3 blocks((unsigned int)((elemCount + BLOCK_SIZE - 1) / BLOCK_SIZE), 1, 1);
        AddKernel<<<blocks, threads>>>(d_flMat1, d_flMat2, d_flMatSum, iMatSizeM, iMatSizeN);

        /* Launch-configuration errors show up here ... */
        if (cudaCallFailed(cudaGetLastError(), "AddKernel launch")) break;
        /* ... and faults raised while the kernel runs show up at the sync. */
        if (cudaCallFailed(cudaDeviceSynchronize(), "AddKernel execution")) break;

        if (cudaCallFailed(cudaMemcpy(h_flMatSum, d_flMatSum, byteCount, cudaMemcpyDeviceToHost),
                           "copy of result to host")) break;

        printf("Matrix Sum\n");
        printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);
        exitCode = EXIT_SUCCESS;
    } while (0);

    /* cudaFree and free both accept NULL, so cleanup is unconditional. */
    cudaFree(d_flMat1);
    cudaFree(d_flMat2);
    cudaFree(d_flMatSum);
    free(h_flMat1);
    free(h_flMat2);
    free(h_flMatSum);

    return exitCode;
}