Hi all,
I read a program to find the Multiplication of SquareMatrix Using shared Memory.But It gives the wrong result(outputsnapshot)and my expected result is other(shapshot expect).
Tell me?
How Can I get a exacted output?
Thanks in Advance
#include<stdio.h>
#include<cuda.h>
__global__ void Shar(float *a,float *b,float *c,int n)
{
__shared__ float aTile[4][4],bTile[4][4];
int row=blockIdx.y*blockDim.y+threadIdx.y;
int col=blockIdx.x*blockDim.x+threadIdx.x;
float sum=0.0;
aTile[threadIdx.y][threadIdx.x]=a[row*n+threadIdx.x];
bTile[threadIdx.y][threadIdx.x]=b[threadIdx.y*n+col];
__syncthreads();
for(int i=0;i<n;i++)
{
sum+=aTile[threadIdx.y][i]*bTile[i][threadIdx.x];
}
c[row*n+col]=sum;
}
int main()
{
float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
int i,n;
n=4;
size_t size=sizeof(float)*(n*n);
a_h=(float*)malloc(size);
b_h=(float*)malloc(size);
c_h=(float*)malloc(size);
cudaMalloc((void**)&a_d,size);
cudaMalloc((void**)&b_d,size);
cudaMalloc((void**)&c_d,size);
for(i=0;i<(n*n);i++)
{
a_h[i]=1;
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
for(i=0;i<(n*n);i++)
{
b_h[i]=1;
}
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
int blocksize=4;
int nblock=n/blocksize+(n%blocksize==0?0:1);
int TILE_DIM=4;
Shar<<<nblock,blocksize,TILE_DIM>>>(a_d,b_d,c_d,n);
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
printf("\nMultiplication Of Matrix");
for(i=0;i<(n*n);i++)
{
printf("\n%f",c_h[i]);
}
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
return 0;
}