#include<stdio.h>
#include<stdlib.h>
#include<cutil_inline.h>
// Threads per block along both x and y: main() launches BLOCK_SIZE x BLOCK_SIZE
// thread blocks and sizes the grid by dividing the matrix dimensions by it.
#define BLOCK_SIZE 16
// Element-wise sum of two row-major iMatSizeM x iMatSizeN matrices:
//   d_Result[r * iMatSizeN + c] = d_Buff1[r * iMatSizeN + c] + d_Buff2[r * iMatSizeN + c]
//
// Parameters:
//   d_Buff1, d_Buff2  device pointers to the operands (iMatSizeM * iMatSizeN floats each)
//   d_Result          device pointer receiving the sum (same size)
//   iMatSizeM         number of rows
//   iMatSizeN         number of columns
//
// Launch layout: any 2D grid of 2D blocks produces the full result. The x
// dimension walks columns and the y dimension walks rows, each thread striding
// by the total grid extent, so the kernel does not depend on the grid exactly
// covering the matrix. Threads that differ only in threadIdx.x touch adjacent
// addresses, which keeps the global loads and stores coalesced.
__global__ static void AddKernel(const float *d_Buff1, const float *d_Buff2, float *d_Result, int iMatSizeM, int iMatSizeN)
{
    // Nothing to do for empty or invalid shapes; also makes the unsigned
    // conversions below safe.
    if (iMatSizeM <= 0 || iMatSizeN <= 0)
    {
        return;
    }

    // 64-bit indices: row * cols can exceed the range of int for large matrices.
    const size_t cRows = (size_t)iMatSizeM;
    const size_t cCols = (size_t)iMatSizeN;
    const size_t rowStride = (size_t)gridDim.y * blockDim.y;
    const size_t colStride = (size_t)gridDim.x * blockDim.x;

    for (size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y; row < cRows; row += rowStride)
    {
        const size_t rowBase = row * cCols;
        for (size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x; col < cCols; col += colStride)
        {
            const size_t idx = rowBase + col;
            d_Result[idx] = d_Buff1[idx] + d_Buff2[idx];
        }
    }
}
// Writes a row-major iMatSizeM x iMatSizeN matrix to stdout: every value is
// followed by a tab, every row ends with a newline, and one extra blank line
// is printed after the last row.
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
    int flat = 0;   // running row-major position, advanced once per printed value
    for (int row = 0; row < iMatSizeM; ++row)
    {
        for (int col = 0; col < iMatSizeN; ++col, ++flat)
        {
            printf("%f\t", pflMat[flat]);
        }
        printf("\n");
    }
    printf("\n");
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads the matrix dimensions M and N from stdin, fills two M x N matrices
// with random values in [0, 1], adds them on the GPU with AddKernel and prints
// both operands and the sum.
// Returns 0 on success and EXIT_FAILURE on invalid input or host allocation
// failure; any CUDA failure terminates the program through checkCuda().
int main()
{
    int iMatSizeM = 0, iMatSizeN = 0;
    printf("Enter size of Matrix(M*N):");
    if (scanf("%d %d", &iMatSizeM, &iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
    {
        fprintf(stderr, "Matrix dimensions must be two positive integers.\n");
        return EXIT_FAILURE;
    }

    // All size arithmetic is done in size_t so that M * N * sizeof(float)
    // cannot overflow int; reject shapes that do not fit in size_t either.
    const size_t cElems = (size_t)iMatSizeM * (size_t)iMatSizeN;
    if (cElems / (size_t)iMatSizeN != (size_t)iMatSizeM || cElems > ((size_t)-1) / sizeof(float))
    {
        fprintf(stderr, "Matrix is too large.\n");
        return EXIT_FAILURE;
    }
    const size_t cbMatrix = cElems * sizeof(float);

    float *h_flMat1 = (float*)malloc(cbMatrix);
    float *h_flMat2 = (float*)malloc(cbMatrix);
    float *h_flMatSum = (float*)malloc(cbMatrix);
    if (h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
    {
        fprintf(stderr, "Out of host memory.\n");
        free(h_flMat1);
        free(h_flMat2);
        free(h_flMatSum);
        return EXIT_FAILURE;
    }

    for (size_t j = 0; j < cElems; j++)
    {
        h_flMat1[j] = (float)rand() / (float)RAND_MAX;
        h_flMat2[j] = (float)rand() / (float)RAND_MAX;
    }
    printf("Matrix 1\n");
    printMatrix(h_flMat1, iMatSizeM, iMatSizeN);
    printf("Matrix 2\n");
    printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

    float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;
    checkCuda(cudaMalloc((void**)&d_flMat1, cbMatrix), "cudaMalloc(d_flMat1)");
    checkCuda(cudaMalloc((void**)&d_flMat2, cbMatrix), "cudaMalloc(d_flMat2)");
    checkCuda(cudaMalloc((void**)&d_flMatSum, cbMatrix), "cudaMalloc(d_flMatSum)");

    // Only the operands are uploaded: the result buffer is fully overwritten
    // by the kernel, so copying the uninitialised host buffer is not needed.
    checkCuda(cudaMemcpy(d_flMat1, h_flMat1, cbMatrix, cudaMemcpyHostToDevice), "upload of matrix 1");
    checkCuda(cudaMemcpy(d_flMat2, h_flMat2, cbMatrix, cudaMemcpyHostToDevice), "upload of matrix 2");

    // Grid x is sized from M and grid y from N, rounding up so that partial
    // tiles at the edges are still covered.
    dim3 threadsperblock(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 blocks(1, 1, 1);
    blocks.x = (unsigned int)(iMatSizeM / BLOCK_SIZE + (iMatSizeM % BLOCK_SIZE != 0 ? 1 : 0));
    blocks.y = (unsigned int)(iMatSizeN / BLOCK_SIZE + (iMatSizeN % BLOCK_SIZE != 0 ? 1 : 0));

    AddKernel<<<blocks, threadsperblock>>>(d_flMat1, d_flMat2, d_flMatSum, iMatSizeM, iMatSizeN);
    checkCuda(cudaGetLastError(), "AddKernel launch");          // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "AddKernel execution");  // faults raised while the kernel ran

    checkCuda(cudaMemcpy(h_flMatSum, d_flMatSum, cbMatrix, cudaMemcpyDeviceToHost), "download of the sum");

    checkCuda(cudaFree(d_flMat1), "cudaFree(d_flMat1)");
    checkCuda(cudaFree(d_flMat2), "cudaFree(d_flMat2)");
    checkCuda(cudaFree(d_flMatSum), "cudaFree(d_flMatSum)");

    printf("Matrix Sum\n");
    printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);

    free(h_flMat1);
    free(h_flMat2);
    free(h_flMatSum);
    return 0;
}
CUDA Matrix Addition - 1D (linear) memory, threads and blocks in 2D
Using Texture and Constant Memory
#include<stdio.h>
#include<cutil_inline.h>
// Threads per block along both x and y: main() launches BLOCK_SIZE x BLOCK_SIZE
// thread blocks and sizes the grid by dividing the matrix dimensions by it.
#define BLOCK_SIZE 16
// 2D float texture references through which AddKernel reads its two operands.
// main() binds them to the device copies of the input matrices before the
// launch and unbinds them afterwards.
texture<float,2>texVecA;
texture<float,2>texVecB;
// Matrix dimensions (M rows, N columns) held in constant memory. main() sets
// them with cudaMemcpyToSymbol; AddKernel reads them for its bounds check and
// for the row-major index of the result.
__constant__ int ciMatSizeM;
__constant__ int ciMatSizeN;
// Element-wise sum of the two matrices bound to texVecA and texVecB, written
// row-major into d_Result:
//   d_Result[r * ciMatSizeN + c] = texVecA(c, r) + texVecB(c, r)
//
// Parameters:
//   d_Result  device pointer receiving ciMatSizeM * ciMatSizeN floats
//
// Preconditions: ciMatSizeM / ciMatSizeN hold the row / column counts, and the
// textures are bound with width = columns and height = rows (as main() does),
// so the texture x coordinate selects the column and y selects the row.
//
// Launch layout: any 2D grid of 2D blocks produces the full result. The x
// dimension walks columns and the y dimension walks rows, each thread striding
// by the total grid extent. Threads that differ only in threadIdx.x write
// adjacent addresses, which keeps the stores to d_Result coalesced.
__global__ static void AddKernel(float *d_Result)
{
    // Nothing to do for empty or invalid shapes; also makes the unsigned
    // conversions below safe.
    if (ciMatSizeM <= 0 || ciMatSizeN <= 0)
    {
        return;
    }

    // 64-bit indices: row * cols can exceed the range of int for large matrices.
    const size_t cRows = (size_t)ciMatSizeM;
    const size_t cCols = (size_t)ciMatSizeN;
    const size_t rowStride = (size_t)gridDim.y * blockDim.y;
    const size_t colStride = (size_t)gridDim.x * blockDim.x;

    for (size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y; row < cRows; row += rowStride)
    {
        // +0.5f addresses the centre of the texel in unnormalised coordinates.
        const float texY = (float)row + 0.5f;
        for (size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x; col < cCols; col += colStride)
        {
            const float texX = (float)col + 0.5f;
            // x = column, y = row; passing them the other way round would read
            // the transposed element.
            const float flValA = tex2D(texVecA, texX, texY);
            const float flValB = tex2D(texVecB, texX, texY);
            d_Result[row * cCols + col] = flValA + flValB;
        }
    }
}
// Dumps a row-major iMatSizeM x iMatSizeN matrix to stdout. Each value is
// printed with a trailing tab, each row is terminated by a newline, and a
// blank line follows the whole matrix.
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
    const float *pCursor = pflMat;   // next value to print, in row-major order
    int rowsLeft = iMatSizeM;
    while (rowsLeft-- > 0)
    {
        int colsLeft = iMatSizeN;
        while (colsLeft-- > 0)
        {
            printf("%f\t", *pCursor++);
        }
        printf("\n");
    }
    printf("\n");
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads the matrix dimensions M and N from stdin, fills two M x N matrices
// with random values in [0, 1], adds them on the GPU and prints both operands
// and the sum. The operands are read by the kernel through the 2D textures
// texVecA / texVecB and the dimensions are passed through constant memory.
// Returns 0 on success and EXIT_FAILURE on invalid input or host allocation
// failure; any CUDA failure terminates the program through checkCuda().
int main()
{
    int iMatSizeM = 0, iMatSizeN = 0;
    printf("Enter size of Matrix(M*N):");
    if (scanf("%d %d", &iMatSizeM, &iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
    {
        fprintf(stderr, "Matrix dimensions must be two positive integers.\n");
        return EXIT_FAILURE;
    }

    // All size arithmetic is done in size_t so that M * N * sizeof(float)
    // cannot overflow int; reject shapes that do not fit in size_t either.
    const size_t cElems = (size_t)iMatSizeM * (size_t)iMatSizeN;
    if (cElems / (size_t)iMatSizeN != (size_t)iMatSizeM || cElems > ((size_t)-1) / sizeof(float))
    {
        fprintf(stderr, "Matrix is too large.\n");
        return EXIT_FAILURE;
    }
    const size_t cbMatrix = cElems * sizeof(float);
    const size_t cbRow = (size_t)iMatSizeN * sizeof(float);   // bytes in one host row

    float *h_flMat1 = (float*)malloc(cbMatrix);
    float *h_flMat2 = (float*)malloc(cbMatrix);
    float *h_flMatSum = (float*)malloc(cbMatrix);
    if (h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
    {
        fprintf(stderr, "Out of host memory.\n");
        free(h_flMat1);
        free(h_flMat2);
        free(h_flMatSum);
        return EXIT_FAILURE;
    }

    for (size_t j = 0; j < cElems; j++)
    {
        h_flMat1[j] = (float)rand() / (float)RAND_MAX;
        h_flMat2[j] = (float)rand() / (float)RAND_MAX;
    }
    printf("Matrix 1\n");
    printMatrix(h_flMat1, iMatSizeM, iMatSizeN);
    printf("Matrix 2\n");
    printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

    // The operands are allocated with cudaMallocPitch so that every row starts
    // at an address that satisfies the texture pitch alignment; a plain linear
    // allocation only does so for particular column counts.
    float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;
    size_t cbPitch1 = 0, cbPitch2 = 0;
    checkCuda(cudaMallocPitch((void**)&d_flMat1, &cbPitch1, cbRow, (size_t)iMatSizeM), "cudaMallocPitch(d_flMat1)");
    checkCuda(cudaMallocPitch((void**)&d_flMat2, &cbPitch2, cbRow, (size_t)iMatSizeM), "cudaMallocPitch(d_flMat2)");
    checkCuda(cudaMalloc((void**)&d_flMatSum, cbMatrix), "cudaMalloc(d_flMatSum)");

    // Row-by-row upload: host rows are tightly packed (cbRow), device rows are
    // cbPitch bytes apart.
    checkCuda(cudaMemcpy2D(d_flMat1, cbPitch1, h_flMat1, cbRow, cbRow, (size_t)iMatSizeM, cudaMemcpyHostToDevice), "upload of matrix 1");
    checkCuda(cudaMemcpy2D(d_flMat2, cbPitch2, h_flMat2, cbRow, cbRow, (size_t)iMatSizeM, cudaMemcpyHostToDevice), "upload of matrix 2");

    // The symbols are ints, so copy sizeof(int) bytes.
    checkCuda(cudaMemcpyToSymbol(ciMatSizeM, &iMatSizeM, sizeof(iMatSizeM)), "cudaMemcpyToSymbol(ciMatSizeM)");
    checkCuda(cudaMemcpyToSymbol(ciMatSizeN, &iMatSizeN, sizeof(iMatSizeN)), "cudaMemcpyToSymbol(ciMatSizeN)");

    // Texture width is the number of columns, height the number of rows, and
    // the pitch is the real distance in bytes between consecutive device rows.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    checkCuda(cudaBindTexture2D(NULL, texVecA, d_flMat1, channelDesc, (size_t)iMatSizeN, (size_t)iMatSizeM, cbPitch1), "cudaBindTexture2D(texVecA)");
    checkCuda(cudaBindTexture2D(NULL, texVecB, d_flMat2, channelDesc, (size_t)iMatSizeN, (size_t)iMatSizeM, cbPitch2), "cudaBindTexture2D(texVecB)");

    // Grid x is sized from M and grid y from N, rounding up so that partial
    // tiles at the edges are still covered.
    dim3 threadsperblock(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 blocks(1, 1, 1);
    blocks.x = (unsigned int)(iMatSizeM / BLOCK_SIZE + (iMatSizeM % BLOCK_SIZE != 0 ? 1 : 0));
    blocks.y = (unsigned int)(iMatSizeN / BLOCK_SIZE + (iMatSizeN % BLOCK_SIZE != 0 ? 1 : 0));

    AddKernel<<<blocks, threadsperblock>>>(d_flMatSum);
    checkCuda(cudaGetLastError(), "AddKernel launch");          // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "AddKernel execution");  // faults raised while the kernel ran

    checkCuda(cudaMemcpy(h_flMatSum, d_flMatSum, cbMatrix, cudaMemcpyDeviceToHost), "download of the sum");

    checkCuda(cudaUnbindTexture(texVecA), "cudaUnbindTexture(texVecA)");
    checkCuda(cudaUnbindTexture(texVecB), "cudaUnbindTexture(texVecB)");
    checkCuda(cudaFree(d_flMat1), "cudaFree(d_flMat1)");
    checkCuda(cudaFree(d_flMat2), "cudaFree(d_flMat2)");
    checkCuda(cudaFree(d_flMatSum), "cudaFree(d_flMatSum)");

    printf("Matrix Sum\n");
    printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);

    free(h_flMat1);
    free(h_flMat2);
    free(h_flMatSum);
    return 0;
}