CUDA Matrix Addition - 1D Memory, threads and blocks in 1D Matrix Addition in CUDA C using Texture a

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include <cutil_inline.h>

// Number of threads per block used by main when launching AddKernel.
#define BLOCK_SIZE 128

// 1D float texture references. main binds them to the two device input
// buffers, and AddKernel reads them element by element with tex1Dfetch.
texture<float,1>texVecA;

texture<float,1>texVecB;

// Matrix dimensions held in constant memory. main writes them with
// cudaMemcpyToSymbol; AddKernel multiplies them to get the element count
// used in its bounds check.
__constant__ int ciMatSizeM;

__constant__ int ciMatSizeN;

/*
 * Element-wise addition of the two inputs bound to texVecA and texVecB.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per matrix element.
 * Each thread derives a flat element index from its block and thread
 * coordinates; threads whose index is not below
 * ciMatSizeM * ciMatSizeN do nothing.
 *
 * d_Result: output buffer; element i receives texVecA[i] + texVecB[i].
 */
__global__ static void AddKernel(float *d_Result)
{
	const int elemIdx = threadIdx.x + blockIdx.x * blockDim.x;
	const int elemCount = ciMatSizeM * ciMatSizeN;

	// Surplus threads past the last element have no work.
	if (elemIdx >= elemCount)
	{
		return;
	}

	d_Result[elemIdx] = tex1Dfetch(texVecA, elemIdx) + tex1Dfetch(texVecB, elemIdx);
}

/*
 * Prints a matrix stored contiguously in row-major order to stdout.
 *
 * Every element is written with "%f" followed by a tab, each row is
 * terminated by a newline, and one extra newline follows the last row.
 *
 * pflMat:    pointer to iMatSizeM * iMatSizeN floats.
 * iMatSizeM: number of rows.
 * iMatSizeN: number of columns.
 */
void printMatrix(float *pflMat, int iMatSizeM, int iMatSizeN)
{
	// Rows are contiguous, so a single running index visits the same
	// elements as (row * iMatSizeN) + col.
	int flat = 0;

	for (int row = 0; row < iMatSizeM; ++row)
	{
		for (int col = 0; col < iMatSizeN; ++col)
		{
			printf("%f\t", pflMat[flat]);
			++flat;
		}

		printf("\n");
	}

	printf("\n");
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCudaCall(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Reads the matrix dimensions M and N from stdin, fills two M*N matrices
 * with random values in [0, 1], adds them on the GPU through AddKernel
 * (inputs read via the texVecA/texVecB textures, dimensions passed via
 * the ciMatSizeM/ciMatSizeN constants) and prints both inputs and the sum.
 *
 * Returns 0 on success and EXIT_FAILURE on invalid input or host
 * allocation failure; a failed CUDA call terminates the process with
 * EXIT_FAILURE.
 */
int main()
{
	int iMatSizeM = 0, iMatSizeN = 0;

	printf("Enter size of Matrix(M*N):");

	if (scanf("%d %d", &iMatSizeM, &iMatSizeN) != 2 || iMatSizeM <= 0 || iMatSizeN <= 0)
	{
		fprintf(stderr, "Invalid matrix size: expected two positive integers\n");
		return EXIT_FAILURE;
	}

	// AddKernel computes the element count and the element index in int,
	// so the total number of elements has to fit in an int.
	if (iMatSizeM > INT_MAX / iMatSizeN)
	{
		fprintf(stderr, "Matrix too large: M*N does not fit in an int\n");
		return EXIT_FAILURE;
	}

	const size_t numElems = (size_t)iMatSizeM * (size_t)iMatSizeN;

	if (numElems > ((size_t)-1) / sizeof(float))
	{
		fprintf(stderr, "Matrix too large: byte size overflows size_t\n");
		return EXIT_FAILURE;
	}

	const size_t numBytes = numElems * sizeof(float);

	float *h_flMat1 = (float*)malloc(numBytes);
	float *h_flMat2 = (float*)malloc(numBytes);
	float *h_flMatSum = (float*)malloc(numBytes);

	if (h_flMat1 == NULL || h_flMat2 == NULL || h_flMatSum == NULL)
	{
		fprintf(stderr, "Host memory allocation failed\n");
		free(h_flMat1);
		free(h_flMat2);
		free(h_flMatSum);
		return EXIT_FAILURE;
	}

	for (size_t j = 0; j < numElems; j++)
	{
		h_flMat1[j] = (float)rand() / (float)RAND_MAX;
		h_flMat2[j] = (float)rand() / (float)RAND_MAX;
	}

	printf("Matrix 1\n");
	printMatrix(h_flMat1, iMatSizeM, iMatSizeN);

	printf("Matrix 2\n");
	printMatrix(h_flMat2, iMatSizeM, iMatSizeN);

	float *d_flMat1 = NULL, *d_flMat2 = NULL, *d_flMatSum = NULL;

	checkCudaCall(cudaMalloc((void**)&d_flMat1, numBytes), "cudaMalloc(d_flMat1)");
	checkCudaCall(cudaMalloc((void**)&d_flMat2, numBytes), "cudaMalloc(d_flMat2)");
	checkCudaCall(cudaMalloc((void**)&d_flMatSum, numBytes), "cudaMalloc(d_flMatSum)");

	// Only the inputs are uploaded: the kernel writes every element of the
	// result, so the uninitialized host sum buffer is never sent to the device.
	checkCudaCall(cudaMemcpy(d_flMat1, h_flMat1, numBytes, cudaMemcpyHostToDevice), "upload of matrix 1");
	checkCudaCall(cudaMemcpy(d_flMat2, h_flMat2, numBytes, cudaMemcpyHostToDevice), "upload of matrix 2");

	// The constants are ints, so their size is taken from the variables themselves.
	checkCudaCall(cudaMemcpyToSymbol(ciMatSizeM, &iMatSizeM, sizeof(iMatSizeM), 0), "cudaMemcpyToSymbol(ciMatSizeM)");
	checkCudaCall(cudaMemcpyToSymbol(ciMatSizeN, &iMatSizeN, sizeof(iMatSizeN), 0), "cudaMemcpyToSymbol(ciMatSizeN)");

	checkCudaCall(cudaBindTexture(0, texVecA, d_flMat1, numBytes), "cudaBindTexture(texVecA)");
	checkCudaCall(cudaBindTexture(0, texVecB, d_flMat2, numBytes), "cudaBindTexture(texVecB)");

	// One thread per element; round the block count up so a partial last
	// block covers the tail (the kernel bounds-checks the surplus threads).
	dim3 threads(BLOCK_SIZE, 1, 1);
	dim3 blocks((unsigned int)((numElems + BLOCK_SIZE - 1) / BLOCK_SIZE), 1, 1);

	AddKernel<<<blocks, threads>>>(d_flMatSum);

	// A launch does not return a status: configuration errors are reported
	// by cudaGetLastError, execution errors by the following synchronization.
	checkCudaCall(cudaGetLastError(), "AddKernel launch");
	checkCudaCall(cudaDeviceSynchronize(), "AddKernel execution");

	checkCudaCall(cudaMemcpy(h_flMatSum, d_flMatSum, numBytes, cudaMemcpyDeviceToHost), "download of result");

	checkCudaCall(cudaUnbindTexture(texVecA), "cudaUnbindTexture(texVecA)");
	checkCudaCall(cudaUnbindTexture(texVecB), "cudaUnbindTexture(texVecB)");

	checkCudaCall(cudaFree(d_flMat1), "cudaFree(d_flMat1)");
	checkCudaCall(cudaFree(d_flMat2), "cudaFree(d_flMat2)");
	checkCudaCall(cudaFree(d_flMatSum), "cudaFree(d_flMatSum)");

	printf("Matrix Sum\n");
	printMatrix(h_flMatSum, iMatSizeM, iMatSizeN);

	free(h_flMat1);
	free(h_flMat2);
	free(h_flMatSum);

	return 0;
}

What is your question?