Take Garbage Value At the place of Result

Dbajaj · November 10, 2009, 5:41am

I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32.

Why it is not given a right result.

/************************************************************

********

*  SquareMatrixMultification

*  This is a example of the CUDA program.

************************************************************

*********/

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include<conio.h>

__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		int i=blockIdx.y*blockDim.y+threadIdx.y;

		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		for(k=0;k<width;k++)

		{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

		}

		

		z[i*width+j]=sum;

}

int main()

{

			float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;

			int width=3;

			int i;

			size_t size=sizeof(float)*(width*width);

			a_h=(float *)malloc(size);

			b_h=(float *)malloc(size);

			c_h=(float *)malloc(size);

			cudaMalloc((void **)&a_d,size);

			cudaMalloc((void **)&b_d,size);

			cudaMalloc((void **)&c_d,size);

			printf("\nEnter the Elements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&a_h[i]);

			}

			

			printf("\nElements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",a_h[i]);

			}

			cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);

			printf("\nEnter the Elements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&b_h[i]);

			}

			printf("\nElements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",b_h[i]);

			}

			cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);

			int blocksize=4;

			width=width*width;

			int nblock=width/blocksize+(width%blocksize==0?0:1);

			width=3;

			SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

			cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);

			printf("\nMultification of SquareMatrics");

			for(i=0;i<(width*width);i++)

			{

				 printf("\n%f",c_h[i]);

			}

			free(a_h);

			free(b_h);

			free(c_h);

			cudaFree(a_d);

			cudaFree(b_d);

			cudaFree(c_d);

			getch();

			return 0;

}

LSChien · November 10, 2009, 5:50am

you need to impose boundary condition

__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		int i=blockIdx.y*blockDim.y+threadIdx.y;

		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		if ( (i < width) && ( j < width ) ){

			for(k=0;k<width;k++)

			{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

			}

		

			z[i*width+j]=sum;

		}// for valid (i,j)

}

Dbajaj · November 10, 2009, 6:34am

you need to impose boundary condition

__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		int i=blockIdx.y*blockDim.y+threadIdx.y;

		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		if ( (i < width) && ( j < width ) ){

			for(k=0;k<width;k++)

			{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

			}

		

			z[i*width+j]=sum;

		}// for valid (i,j)

}

Respected Sir,

                               Thanks to quick reply.I change my code according your suggestion.But this it  is given Wrong Result(snapshot of output).Why it is not given right result .

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include<conio.h>

__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		int i=blockIdx.y*blockDim.y+threadIdx.y;

		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		if((i<width)&&(j<width))

		{

		for(k=0;k<width;k++)

		{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

		}

		

		z[i*width+j]=sum;

		}

}

int main()

{

			float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;

			int width=3;

			int i;

			size_t size=sizeof(float)*(width*width);

			a_h=(float *)malloc(size);

			b_h=(float *)malloc(size);

			c_h=(float *)malloc(size);

			cudaMalloc((void **)&a_d,size);

			cudaMalloc((void **)&b_d,size);

			cudaMalloc((void **)&c_d,size);

			printf("\nEnter the Elements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&a_h[i]);

			}

			

			printf("\nElements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",a_h[i]);

			}

			cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);

			printf("\nEnter the Elements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&b_h[i]);

			}

			printf("\nElements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",b_h[i]);

			}

			cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);

			int blocksize=4;

			width=width*width;

			int nblock=width/blocksize+(width%blocksize==0?0:1);

			width=3;

			SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

			cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);

			printf("\nMultification of SquareMatrics");

			for(i=0;i<(width*width);i++)

			{

				 printf("\n%f",c_h[i]);

			}

			free(a_h);

			free(b_h);

			free(c_h);

			cudaFree(a_d);

			cudaFree(b_d);

			cudaFree(c_d);

			getch();

			return 0;

}

Thanking you

Deepak Bajaj
MultiplicatonofSquareMatrixwithimposeboundry_Conditions.bmp (2.64 MB)

Sarnath · November 10, 2009, 6:44am

Bajaj,

Attach JPG files. They have far lesser size than BMP
You can address people by “First Name”. “Respected Sir” is not normal in intenet forums.
The best debugger lies in between your ears.

LSChien · November 10, 2009, 9:24am

you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel.

modify your code

int blocksize=4;

width=width*width;

int nblock=width/blocksize+(width%blocksize==0?0:1);

width=3;

SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

to 2-D threads block and 2-D grid block as

dim3 blocksize( 2, 2 );

	dim3 nblock( (width+1)/2, (width+1)/2 );

	SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

Dbajaj · November 10, 2009, 10:42am

I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32.

Why it is not given a right result.

/************************************************************

********

*  SquareMatrixMultification

*  This is a example of the CUDA program.

************************************************************

*********/

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include<conio.h>

__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		int i=blockIdx.y*blockDim.y+threadIdx.y;

		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		for(k=0;k<width;k++)

		{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

		}

		

		z[i*width+j]=sum;

}

int main()

{

			float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;

			int width=3;

			int i;

			size_t size=sizeof(float)*(width*width);

			a_h=(float *)malloc(size);

			b_h=(float *)malloc(size);

			c_h=(float *)malloc(size);

			cudaMalloc((void **)&a_d,size);

			cudaMalloc((void **)&b_d,size);

			cudaMalloc((void **)&c_d,size);

			printf("\nEnter the Elements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&a_h[i]);

			}

			

			printf("\nElements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",a_h[i]);

			}

			cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);

			printf("\nEnter the Elements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&b_h[i]);

			}

			printf("\nElements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",b_h[i]);

			}

			cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);

			int blocksize=4;

			width=width*width;

			int nblock=width/blocksize+(width%blocksize==0?0:1);

			width=3;

			SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

			cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);

			printf("\nMultification of SquareMatrics");

			for(i=0;i<(width*width);i++)

			{

				 printf("\n%f",c_h[i]);

			}

			free(a_h);

			free(b_h);

			free(c_h);

			cudaFree(a_d);

			cudaFree(b_d);

			cudaFree(c_d);

			getch();

			return 0;

}

Dbajaj · November 10, 2009, 10:46am

you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel.

modify your code
int blocksize=4;

width=width*width;

int nblock=width/blocksize+(width%blocksize==0?0:1);

width=3;

SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
to 2-D threads block and 2-D grid block as
dim3 blocksize( 2, 2 );

	dim3 nblock( (width+1)/2, (width+1)/2 );

	SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

Thanks LSChien .