#include<stdio.h>
#include<stdlib.h>
#include<limits.h>
#include<cutil_inline.h>
// Number of threads per block used by main() when launching AddKernel.
#define BLOCK_SIZE 128
// 1D float texture references; main() binds them to the two input matrices
// and AddKernel reads the operands through them with tex1Dfetch.
texture<float,1>texVecA;
texture<float,1>texVecB;
// Matrix dimensions in constant memory; written by main() with
// cudaMemcpyToSymbol and multiplied together in AddKernel for its bounds check.
__constant__ int ciMatSizeM;
__constant__ int ciMatSizeN;
// Element-wise sum of the two matrices bound to texVecA and texVecB.
//
// Each thread handles the single element whose flat index equals its global
// 1D thread index (x dimension only). Threads whose index is at or beyond
// ciMatSizeM * ciMatSizeN do nothing, so the grid may be rounded up to a
// whole number of blocks.
//
// d_Result: device buffer receiving the sums; it is indexed up to
//           ciMatSizeM * ciMatSizeN - 1.
__global__ static void AddKernel(float *d_Result)
{
    const int iElem = blockDim.x * blockIdx.x + threadIdx.x;
    const int iElemCount = ciMatSizeM * ciMatSizeN;

    // Guard the tail of the last block.
    if (iElem >= iElemCount)
    {
        return;
    }

    d_Result[iElem] = tex1Dfetch(texVecA, iElem) + tex1Dfetch(texVecB, iElem);
}
// Writes a row-major matrix to stdout: every value is printed with "%f"
// followed by a tab, every row ends with a newline, and one extra newline
// is emitted after the last row.
//
// pflMat:    host pointer to iMatSizeM * iMatSizeN floats, stored row by row.
// iMatSizeM: number of rows.
// iMatSizeN: number of columns.
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
    int iRow = 0;
    while (iRow < iMatSizeM)
    {
        // Offset of the first element of the current row.
        const int iRowBase = iRow * iMatSizeN;
        int iCol = 0;
        while (iCol < iMatSizeN)
        {
            printf("%f\t", pflMat[iRowBase + iCol]);
            ++iCol;
        }
        printf("\n");
        ++iRow;
    }
    printf("\n");
}
// Prints a diagnostic and terminates the process if a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step in the message.
static void checkCudaOrExit(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads the matrix dimensions from stdin, fills two M x N matrices with
// random values in [0, 1], adds them on the GPU with AddKernel (inputs read
// through texVecA / texVecB, dimensions passed via constant memory) and
// prints the operands and the result.
//
// Returns 0 on success and EXIT_FAILURE on invalid input or a failed host
// allocation; a failed CUDA call terminates the process with EXIT_FAILURE.
int main()
{
    int iMatSizeM = 0, iMatSizeN = 0;
    printf("Enter size of Matrix(M*N):");
    if (scanf("%d %d", &iMatSizeM, &iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
    {
        fprintf(stderr, "Matrix dimensions must be two positive integers.\n");
        return EXIT_FAILURE;
    }

    // The kernel computes M * N in int and the byte count is derived from it,
    // so reject sizes whose element count or byte count would overflow int.
    const int iMaxElems = INT_MAX / (int)sizeof(float);
    if (iMatSizeM > iMaxElems / iMatSizeN)
    {
        fprintf(stderr, "Matrix is too large.\n");
        return EXIT_FAILURE;
    }
    const int iNumElems = iMatSizeM * iMatSizeN;
    const size_t cbMat = (size_t)iNumElems * sizeof(float);

    float *h_flMat1 = (float*)malloc(cbMat);
    float *h_flMat2 = (float*)malloc(cbMat);
    float *h_flMatSum = (float*)malloc(cbMat);
    if (h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
    {
        fprintf(stderr, "Out of host memory.\n");
        free(h_flMat1);   // free(NULL) is a no-op
        free(h_flMat2);
        free(h_flMatSum);
        return EXIT_FAILURE;
    }

    for (int j = 0; j < iNumElems; j++)
    {
        h_flMat1[j] = (float)rand() / (float)RAND_MAX;
        h_flMat2[j] = (float)rand() / (float)RAND_MAX;
    }
    printf("Matrix 1\n");
    printMatrix(h_flMat1, iMatSizeM, iMatSizeN);
    printf("Matrix 2\n");
    printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

    float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;
    checkCudaOrExit(cudaMalloc(&d_flMat1, cbMat), "cudaMalloc(d_flMat1)");
    checkCudaOrExit(cudaMalloc(&d_flMat2, cbMat), "cudaMalloc(d_flMat2)");
    checkCudaOrExit(cudaMalloc(&d_flMatSum, cbMat), "cudaMalloc(d_flMatSum)");

    // Only the operands are uploaded; the result buffer is fully overwritten
    // by the kernel, so there is nothing meaningful to copy into it.
    checkCudaOrExit(cudaMemcpy(d_flMat1, h_flMat1, cbMat, cudaMemcpyHostToDevice),
                    "upload of matrix 1");
    checkCudaOrExit(cudaMemcpy(d_flMat2, h_flMat2, cbMat, cudaMemcpyHostToDevice),
                    "upload of matrix 2");

    // The constant-memory symbols are ints, so copy sizeof(int) bytes.
    checkCudaOrExit(cudaMemcpyToSymbol(ciMatSizeM, &iMatSizeM, sizeof(int), 0),
                    "cudaMemcpyToSymbol(ciMatSizeM)");
    checkCudaOrExit(cudaMemcpyToSymbol(ciMatSizeN, &iMatSizeN, sizeof(int), 0),
                    "cudaMemcpyToSymbol(ciMatSizeN)");

    checkCudaOrExit(cudaBindTexture(0, texVecA, d_flMat1, cbMat), "cudaBindTexture(texVecA)");
    checkCudaOrExit(cudaBindTexture(0, texVecB, d_flMat2, cbMat), "cudaBindTexture(texVecB)");

    // One thread per element, rounded up to a whole number of blocks.
    dim3 threads(BLOCK_SIZE, 1, 1);
    dim3 blocks((iNumElems + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
    AddKernel<<<blocks, threads>>>(d_flMatSum);
    // A launch does not return a status: query it explicitly, then wait so
    // that faults raised while the kernel runs are reported here.
    checkCudaOrExit(cudaGetLastError(), "AddKernel launch");
    checkCudaOrExit(cudaDeviceSynchronize(), "AddKernel execution");

    checkCudaOrExit(cudaMemcpy(h_flMatSum, d_flMatSum, cbMat, cudaMemcpyDeviceToHost),
                    "download of result");

    checkCudaOrExit(cudaUnbindTexture(texVecA), "cudaUnbindTexture(texVecA)");
    checkCudaOrExit(cudaUnbindTexture(texVecB), "cudaUnbindTexture(texVecB)");
    checkCudaOrExit(cudaFree(d_flMat1), "cudaFree(d_flMat1)");
    checkCudaOrExit(cudaFree(d_flMat2), "cudaFree(d_flMat2)");
    checkCudaOrExit(cudaFree(d_flMatSum), "cudaFree(d_flMatSum)");

    printf("Matrix Sum\n");
    printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);

    free(h_flMat1);
    free(h_flMat2);
    free(h_flMatSum);
    return 0;
}
// CUDA Matrix Addition - 1D memory, threads and blocks in 1D.
// Matrix addition in CUDA C using texture and constant memory.
// What is your question?