Garbage values appear in place of the result

I wrote code to multiply two square (3×3) matrices. It gives the right result for the first 3 elements, but the remaining elements are garbage values (see the attached snapshot of the output). I use Visual Studio 2005 with an NVIDIA GPU on Win32.

Why does it not give the right result?

/************************************************************

********

*  SquareMatrixMultification

*  This is an example of a CUDA program.

************************************************************

*********/

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include<conio.h>

// Multiplies two width x width matrices stored as flat arrays.
// Each thread computes one output element:
//   z[i*width+j] = sum over k of x[i*width+k] * y[k*width+j]
// where i is built from the y components of the block/thread indices and
// j from the x components.
// NOTE(review): there is no check that i and j are below width, so any thread
// launched with i >= width or j >= width indexes outside the width*width
// elements of x, y and z.
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		// i selects the row of x and z (y launch dimension).
		int i=blockIdx.y*blockDim.y+threadIdx.y;

		// j selects the column of y and z (x launch dimension).
		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		// Accumulate the products of row i of x with column j of y.
		for(k=0;k<width;k++)

		{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

		}

		

		z[i*width+j]=sum;

}

// Reads two 3x3 matrices from standard input, echoes them, copies them to the
// device, launches SquareMatrixMul, copies the product back and prints it.
// Waits for a key press (getch) before returning 0.
// NOTE(review): none of the cudaMalloc/cudaMemcpy return values are checked,
// and no error query follows the kernel launch.
int main()

{

			// *_h are host buffers, *_d are the matching device buffers.
			float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;

			int width=3;

			int i;

			// Byte size of one width x width matrix of floats.
			size_t size=sizeof(float)*(width*width);

			a_h=(float *)malloc(size);

			b_h=(float *)malloc(size);

			c_h=(float *)malloc(size);

			cudaMalloc((void **)&a_d,size);

			cudaMalloc((void **)&b_d,size);

			cudaMalloc((void **)&c_d,size);

			printf("\nEnter the Elements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&a_h[i]);

			}

			

			printf("\nElements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",a_h[i]);

			}

			cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);

			printf("\nEnter the Elements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&b_h[i]);

			}

			printf("\nElements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",b_h[i]);

			}

			cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);

			// Launch configuration: width is temporarily overwritten with the
			// element count (9) to compute nblock = ceil(9/4) = 3, then reset to 3.
			// NOTE(review): nblock and blocksize are plain ints, so this is a
			// one-dimensional launch of 3 blocks x 4 threads, while the kernel
			// derives its row index i from blockIdx.y and threadIdx.y.
			int blocksize=4;

			width=width*width;

			int nblock=width/blocksize+(width%blocksize==0?0:1);

			width=3;

			SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

			// Copy the product back to the host before printing it.
			cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);

			printf("\nMultification of SquareMatrics");

			for(i=0;i<(width*width);i++)

			{

				 printf("\n%f",c_h[i]);

			}

			free(a_h);

			free(b_h);

			free(c_h);

			cudaFree(a_d);

			cudaFree(b_d);

			cudaFree(c_d);

			// Keep the console window open until a key is pressed.
			getch();

			return 0;

}

SquareMatrixMultification.JPG

You need to impose a boundary condition, so that threads whose (i,j) falls outside the matrix do nothing:

// Computes one element of the width x width product z = x * y per thread.
// All three matrices are flat arrays indexed as [row*width + col].
// The row index comes from the y components of the block/thread indices,
// the column index from the x components.
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	const int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Boundary condition: threads that fall outside the matrix do nothing.
	if (row >= width || col >= width)
		return;

	// Sum of products of row `row` of x with column `col` of y.
	float acc = 0;
	for (int step = 0; step < width; ++step)
		acc += x[row * width + step] * y[step * width + col];

	z[row * width + col] = acc;
}

Respected Sir,

Thanks for the quick reply. I changed my code according to your suggestion, but it still gives the wrong result (see the attached snapshot of the output). Why does it not give the right result?
#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include<conio.h>

// Multiplies two width x width matrices stored as flat arrays.
// Each thread whose (i,j) lies inside the matrix computes one output element:
//   z[i*width+j] = sum over k of x[i*width+k] * y[k*width+j]
// Threads with i >= width or j >= width do nothing.
// NOTE(review): i is built only from blockIdx.y/blockDim.y/threadIdx.y, so it
// takes values other than 0 only when the launch has more than one block or
// thread in the y dimension.
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)

{

		// i selects the row of x and z (y launch dimension).
		int i=blockIdx.y*blockDim.y+threadIdx.y;

		// j selects the column of y and z (x launch dimension).
		int j=blockIdx.x*blockDim.x+threadIdx.x;

		int k;

		float a,b;

		float sum=0;

		// Boundary condition: only threads inside the matrix compute and store.
		if((i<width)&&(j<width))

		{

		for(k=0;k<width;k++)

		{

			 a=x[i*width+k];

			

			 b=y[k*width+j];

			 

			sum+=a*b;

		}

		

		z[i*width+j]=sum;

		}

}

// Reads two 3x3 matrices from standard input, echoes them, copies them to the
// device, launches SquareMatrixMul, copies the product back and prints it.
// Waits for a key press (getch) before returning 0.
// NOTE(review): none of the cudaMalloc/cudaMemcpy return values are checked,
// and no error query follows the kernel launch.
int main()

{

			// *_h are host buffers, *_d are the matching device buffers.
			float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;

			int width=3;

			int i;

			// Byte size of one width x width matrix of floats.
			size_t size=sizeof(float)*(width*width);

			a_h=(float *)malloc(size);

			b_h=(float *)malloc(size);

			c_h=(float *)malloc(size);

			cudaMalloc((void **)&a_d,size);

			cudaMalloc((void **)&b_d,size);

			cudaMalloc((void **)&c_d,size);

			printf("\nEnter the Elements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&a_h[i]);

			}

			

			printf("\nElements of First Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",a_h[i]);

			}

			cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);

			printf("\nEnter the Elements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				 scanf("%f",&b_h[i]);

			}

			printf("\nElements of Second Matrix");

			for(i=0;i<(width*width);i++)

			{

				printf("\n%f",b_h[i]);

			}

			cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);

			// Launch configuration: width is temporarily overwritten with the
			// element count (9) to compute nblock = ceil(9/4) = 3, then reset to 3.
			// NOTE(review): nblock and blocksize are plain ints, so this is a
			// one-dimensional launch of 3 blocks x 4 threads. The kernel derives
			// its row index i from blockIdx.y and threadIdx.y, so with the
			// (i<width)&&(j<width) guard only elements in row 0 of c_d are written
			// and the rest of c_d is never initialised.
			int blocksize=4;

			width=width*width;

			int nblock=width/blocksize+(width%blocksize==0?0:1);

			width=3;

			SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

			// Copy the product back to the host before printing it.
			cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);

			printf("\nMultification of SquareMatrics");

			for(i=0;i<(width*width);i++)

			{

				 printf("\n%f",c_h[i]);

			}

			free(a_h);

			free(b_h);

			free(c_h);

			cudaFree(a_d);

			cudaFree(b_d);

			cudaFree(c_d);

			// Keep the console window open until a key is pressed.
			getch();

			return 0;

}

Thanking you

Deepak Bajaj
MultiplicatonofSquareMatrixwithimposeboundry_Conditions.bmp (2.64 MB)

Bajaj,

  1. Attach JPG files. They are far smaller than BMP files.
  2. You can address people by “First Name”. “Respected Sir” is not normal in internet forums.
  3. The best debugger lies in between your ears.

You use a 1-D thread block and a 1-D grid, so blockIdx.y and threadIdx.y are always 0 and the row index i in your kernel is always 0. This cannot cover all (i,j) in your kernel.

modify your code

// Original 1-D launch: nblock and blocksize are plain ints, so every thread
// sees blockIdx.y == 0 and threadIdx.y == 0.
int blocksize=4;

width=width*width;

int nblock=width/blocksize+(width%blocksize==0?0:1);

width=3;

SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

to a 2-D thread block and a 2-D grid, as follows:

// Each block is a tile x tile square of threads: the x component feeds the
// kernel's column index j, the y component feeds its row index i.
const int tile = 2;
dim3 blocksize( tile, tile );

// Round up so that tiles * tile >= width in both dimensions; the kernel's
// (i < width) && (j < width) test discards the surplus threads.
const int tiles = (width + tile - 1) / tile;
dim3 nblock( tiles, tiles );

SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);

Thanks LSChien .