I wrote a code to Find the Multiplication of Square Matrix(3*3).it is given right result of First 3 Elements,but afterward take garbage values (Show the snapshot of output attach with it).I use Visual Stdio 2005 with nvidia GPU and Win32.
Why it is not given a right result.
/************************************************************
********
* SquareMatrixMultification
* This is a example of the CUDA program.
************************************************************
*********/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include<conio.h>
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
int i=blockIdx.y*blockDim.y+threadIdx.y;
int j=blockIdx.x*blockDim.x+threadIdx.x;
int k;
float a,b;
float sum=0;
for(k=0;k<width;k++)
{
a=x[i*width+k];
b=y[k*width+j];
sum+=a*b;
}
z[i*width+j]=sum;
}
int main()
{
float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
int width=3;
int i;
size_t size=sizeof(float)*(width*width);
a_h=(float *)malloc(size);
b_h=(float *)malloc(size);
c_h=(float *)malloc(size);
cudaMalloc((void **)&a_d,size);
cudaMalloc((void **)&b_d,size);
cudaMalloc((void **)&c_d,size);
printf("\nEnter the Elements of First Matrix");
for(i=0;i<(width*width);i++)
{
scanf("%f",&a_h[i]);
}
printf("\nElements of First Matrix");
for(i=0;i<(width*width);i++)
{
printf("\n%f",a_h[i]);
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
printf("\nEnter the Elements of Second Matrix");
for(i=0;i<(width*width);i++)
{
scanf("%f",&b_h[i]);
}
printf("\nElements of Second Matrix");
for(i=0;i<(width*width);i++)
{
printf("\n%f",b_h[i]);
}
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
int blocksize=4;
width=width*width;
int nblock=width/blocksize+(width%blocksize==0?0:1);
width=3;
SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
printf("\nMultification of SquareMatrics");
for(i=0;i<(width*width);i++)
{
printf("\n%f",c_h[i]);
}
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
getch();
return 0;
}
Thanks to quick reply.I change my code according your suggestion.But this it is given Wrong Result(snapshot of output).Why it is not given right result .
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include<conio.h>
__global__ void SquareMatrixMul(float *x,float *y,float *z,int width)
{
int i=blockIdx.y*blockDim.y+threadIdx.y;
int j=blockIdx.x*blockDim.x+threadIdx.x;
int k;
float a,b;
float sum=0;
if((i<width)&&(j<width))
{
for(k=0;k<width;k++)
{
a=x[i*width+k];
b=y[k*width+j];
sum+=a*b;
}
z[i*width+j]=sum;
}
}
int main()
{
float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
int width=3;
int i;
size_t size=sizeof(float)*(width*width);
a_h=(float *)malloc(size);
b_h=(float *)malloc(size);
c_h=(float *)malloc(size);
cudaMalloc((void **)&a_d,size);
cudaMalloc((void **)&b_d,size);
cudaMalloc((void **)&c_d,size);
printf("\nEnter the Elements of First Matrix");
for(i=0;i<(width*width);i++)
{
scanf("%f",&a_h[i]);
}
printf("\nElements of First Matrix");
for(i=0;i<(width*width);i++)
{
printf("\n%f",a_h[i]);
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
printf("\nEnter the Elements of Second Matrix");
for(i=0;i<(width*width);i++)
{
scanf("%f",&b_h[i]);
}
printf("\nElements of Second Matrix");
for(i=0;i<(width*width);i++)
{
printf("\n%f",b_h[i]);
}
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
int blocksize=4;
width=width*width;
int nblock=width/blocksize+(width%blocksize==0?0:1);
width=3;
SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
printf("\nMultification of SquareMatrics");
for(i=0;i<(width*width);i++)
{
printf("\n%f",c_h[i]);
}
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
getch();
return 0;
}
you use 1D threads block and 1D grid block, this cannot cover all (i,j) in your kernel.
modify your code
int blocksize=4;
width=width*width;
int nblock=width/blocksize+(width%blocksize==0?0:1);
width=3;
SquareMatrixMul<<<nblock,blocksize>>>(a_d,b_d,c_d,width);