Matrix multiplication result Valildation Newbee question

kumaresh · January 21, 2009, 3:22pm

hi…

I am sorry if I sound so dumb. Here is my trouble: I understand that we cannot compare float in C/C++. But how can I verify my CUDA output against CPU computed result? My second CUDA program is matrix multiplication (duh?) I compared results (host and CUDA output) and got out unscathed. But when I crank the input matrices size to 2 ^ 7 my results break apart :( . Here is the listing. I know it is not a great code. Don’t flame me, I am learning XD. I have also attached the diff file. Thanks for your time.

__global__ void MatMul(float *a, float *b, float *c, const unsigned int N)

{

	unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;

	unsigned int j = threadIdx.y + blockIdx.y * blockDim.y;

	//if(i < N && j < N) really required ??

	{

		unsigned int k = 0;//blockIdx.y * blockDim.y, kEnd = blockIdx.y * (1 + blockDim.y);

		unsigned int temp = i * N;

		for(k = 0; k < N; ++k) c[temp + j] += a[temp + k] * b[k * N + j];

	}

	return;

}

void HostMatMul(float *a, float *b, float *c, const unsigned int N)

{

	for(unsigned int i = 0; i < N; ++i)

	{

		unsigned int temp = i * N;

		for(unsigned int j = 0; j < N; ++j)

		{

			unsigned int idx = temp + j;

			c[idx] = 0.0;

			for(unsigned int k = 0; k < N; ++k)

			{

				c[idx] += a[temp + k] * b[k * N + j];

			}

			//file >> c[idx];

		}

	}

	//file.close();

	return;

}

int HostVerifyResult(float *a, float *b, const unsigned int N)

{

	float iRet = 0.0f;

	unsigned int i, j, temp;

	for(i = 0; i < N; ++i)

		for(j = 0, temp = i * N; j < N; ++j)

		{

			iRet = a[temp + j] - b[temp +j];

			if(iRet != 0.0f) cout << i << " x " << j << " : " << b[i * N + j] << " vs " << a[i * N + j] << " = " << iRet << endl;

		}

	return 0;

}

int main()

{

	float *aD, *bD, *cD, *aH, *bH, *cH, *aVerify;

	const unsigned int shiftFactor  = 7; // breaks at 7. anything < 7 passes

	const unsigned int N			= 1 << shiftFactor;

	const unsigned int matN		= N * N;

	const unsigned int memSize	  = sizeof(float) * N * N;

	aH	  = (float*) malloc(memSize);

	bH	  = (float*) malloc(memSize);

	cH	  = (float*) malloc(memSize);

	aVerify = (float*) malloc(memSize);

	if(aH && bH && cH)

	{

		unsigned int i = N * N - 1;

		do

		{

			bH[i] = aH[i] = (float) i;

			cH[i] = 0.0f;

		}while(i--);

		const unsigned int n_threadsx   = warpSize / 2;

		const unsigned int n_threadsy   = warpSize / 2;

		const unsigned int n_threads	= n_threadsx * n_threadsy;

		const unsigned int n_blocks	 = matN / n_threads + ((matN % n_threads) ? 1 : 0);

		cout <<"blocks : " << n_blocks << "\n";

		dim3 dimBlocks (n_threadsx, n_threadsy);

		dim3 dimGrid (1, n_blocks);

		if(n_blocks > 1)

		{

			if(n_blocks > maxGridDim) cout << "WARNING thread per element policy cannot be satisfied!!\n";

			else

			{

				dimGrid.y = 2 * N / warpSize;

				dimGrid.x = 2 * N / WarpSize;

			}

		}

		cout << "Grid Dim : " << dimGrid.x << " " << dimGrid.y << " " << dimGrid.z << endl;

		cout << "Block Dim : " << dimBlocks.x << " " << dimBlocks.y << " " << dimBlocks.z << endl;

		cudaMalloc((void**) &aD, memSize);

		cudaMalloc((void**) &bD, memSize);

		cudaMalloc((void**) &cD, memSize);

		cudaMemcpy(aD, aH, memSize, cudaMemcpyHostToDevice);

		cudaMemcpy(bD, bH, memSize, cudaMemcpyHostToDevice);

		cudaMemcpy(cD, cH, memSize, cudaMemcpyHostToDevice);

		MatMul <<<dimGrid, dimBlocks>>> (aD, bD, cD, N);

		cudaThreadSynchronize(); // a necessary ?

		cudaMemcpy(aVerify, cD, memSize, cudaMemcpyDeviceToHost);

		HostMatMul(aH, bH, cH, N);

		HostVerifyResult(cH, aVerify, N);

		free(aH);

		free(bH);

		free(cH);

		cudaFree(aD);

		cudaFree(bD);

		cudaFree(cD);

	}

	return 0;

}

EDIT:

It’s just a single bit difference. May be I should use smaller numbers… Will try that soon :)
res.zip (37.2 KB)

tatou1234 · January 22, 2009, 8:28am

hi…

I am sorry if I sound so dumb. Here is my trouble: I understand that we cannot compare float in C/C++. But how can I verify my CUDA output against CPU computed result? My second CUDA program is matrix multiplication (duh?) I compared results (host and CUDA output) and got out unscathed. But when I crank the input matrices size to 2 ^ 7 my results break apart :( . Here is the listing. I know it is not a great code. Don’t flame me, I am learning XD. I have also attached the diff file. Thanks for your time.

__global__ void MatMul(float *a, float *b, float *c, const unsigned int N)

{

	unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;

	unsigned int j = threadIdx.y + blockIdx.y * blockDim.y;

	//if(i < N && j < N) really required ??

	{

		unsigned int k = 0;//blockIdx.y * blockDim.y, kEnd = blockIdx.y * (1 + blockDim.y);

		unsigned int temp = i * N;

		for(k = 0; k < N; ++k) c[temp + j] += a[temp + k] * b[k * N + j];

	}

	return;

}

void HostMatMul(float *a, float *b, float *c, const unsigned int N)

{

	for(unsigned int i = 0; i < N; ++i)

	{

		unsigned int temp = i * N;

		for(unsigned int j = 0; j < N; ++j)

		{

			unsigned int idx = temp + j;

			c[idx] = 0.0;

			for(unsigned int k = 0; k < N; ++k)

			{

				c[idx] += a[temp + k] * b[k * N + j];

			}

			//file >> c[idx];

		}

	}

	//file.close();

	return;

}

int HostVerifyResult(float *a, float *b, const unsigned int N)

{

	float iRet = 0.0f;

	unsigned int i, j, temp;

	for(i = 0; i < N; ++i)

		for(j = 0, temp = i * N; j < N; ++j)

		{

			iRet = a[temp + j] - b[temp +j];

			if(iRet != 0.0f) cout << i << " x " << j << " : " << b[i * N + j] << " vs " << a[i * N + j] << " = " << iRet << endl;

		}

	return 0;

}

int main()

{

	float *aD, *bD, *cD, *aH, *bH, *cH, *aVerify;

	const unsigned int shiftFactor  = 7; // breaks at 7. anything < 7 passes

	const unsigned int N			= 1 << shiftFactor;

	const unsigned int matN		= N * N;

	const unsigned int memSize	  = sizeof(float) * N * N;

	aH	  = (float*) malloc(memSize);

	bH	  = (float*) malloc(memSize);

	cH	  = (float*) malloc(memSize);

	aVerify = (float*) malloc(memSize);

	if(aH && bH && cH)

	{

		unsigned int i = N * N - 1;

		do

		{

			bH[i] = aH[i] = (float) i;

			cH[i] = 0.0f;

		}while(i--);

		const unsigned int n_threadsx   = warpSize / 2;

		const unsigned int n_threadsy   = warpSize / 2;

		const unsigned int n_threads	= n_threadsx * n_threadsy;

		const unsigned int n_blocks	 = matN / n_threads + ((matN % n_threads) ? 1 : 0);

		cout <<"blocks : " << n_blocks << "\n";

		dim3 dimBlocks (n_threadsx, n_threadsy);

		dim3 dimGrid (1, n_blocks);

		if(n_blocks > 1)

		{

			if(n_blocks > maxGridDim) cout << "WARNING thread per element policy cannot be satisfied!!\n";

			else

			{

				dimGrid.y = 2 * N / warpSize;

				dimGrid.x = 2 * N / WarpSize;

			}

		}

		cout << "Grid Dim : " << dimGrid.x << " " << dimGrid.y << " " << dimGrid.z << endl;

		cout << "Block Dim : " << dimBlocks.x << " " << dimBlocks.y << " " << dimBlocks.z << endl;

		cudaMalloc((void**) &aD, memSize);

		cudaMalloc((void**) &bD, memSize);

		cudaMalloc((void**) &cD, memSize);

		cudaMemcpy(aD, aH, memSize, cudaMemcpyHostToDevice);

		cudaMemcpy(bD, bH, memSize, cudaMemcpyHostToDevice);

		cudaMemcpy(cD, cH, memSize, cudaMemcpyHostToDevice);

		MatMul <<<dimGrid, dimBlocks>>> (aD, bD, cD, N);

		cudaThreadSynchronize(); // a necessary ?

		cudaMemcpy(aVerify, cD, memSize, cudaMemcpyDeviceToHost);

		HostMatMul(aH, bH, cH, N);

		HostVerifyResult(cH, aVerify, N);

		free(aH);

		free(bH);

		free(cH);

		cudaFree(aD);

		cudaFree(bD);

		cudaFree(cD);

	}

	return 0;

}

EDIT:

It’s just a single bit difference. May be I should use smaller numbers… Will try that soon :)

you have a nice example with matrix multiplication in NVIDIA_CUDA_SDK called matrixMul.

try to find, in your program results, some wrong values between host and device values. sure there are different but maybe the difference could be so small because the float isn’t always exactly.

jack · January 22, 2009, 4:05pm

Like tatou said, you can’t take an element in the host-computed result and the GPU-computed result, subtract them, and compare to zero. It will almost never be exact due to rounding errors in floating-point computations (not a GPU problem, just a fact of life when using finite-length / floating-point representations of numbers). Instead, you should subtract the elements and compare to an ‘epsilon’ value, where epsilon is the highest value of error you are willing to allow in your code.

Another, simpler, way you could check the error of your calculations is to write a bit of code to do matrix (elementwise) subtraction; subtract the two matrices, then compute a matrix norm on the difference matrix. The value of the norm will be a sort of ‘average’ error for each element in the computations, which you can record, then test various changes in your algorithm to see if the norm gets any lower.

Topic		Replies	Views
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9997	January 18, 2012
Matrix Multiplication In CUDA CUDA Programming and Performance	6	2533	May 11, 2015
matrix multiplication CUDA Programming and Performance	10	3780	March 7, 2010
Matrix Multiplication Garbage value :( CUDA Programming and Performance	10	3404	July 25, 2009
Matrix Multiplication Buggy CUDA Programming and Performance	13	5234	May 5, 2010
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1743	October 5, 2018
32 x 32 Matrix Multiplication CUDA Programming and Performance	2	2859	March 5, 2010
matrix multiplication benchmark CUDA Programming and Performance	8	4344	May 21, 2010
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13328	February 17, 2010
matrix multiply reduction CUDA Programming and Performance	41	35539	January 15, 2011

Matrix multiplication result Valildation Newbee question

Related topics