Hi,
I have a kernel that calculates matrix multiplication (on two 4 x 4 matrices). The result comes correct when I chose the execution configuration as 1, 16 i.e 1 block and 16 threads per block. However, when I change the execution configuration to 4, 4, it prints wrong results. I would appreciate your help here.
Thanks,
The kernel and part of main() is given here:
__global__ void Shar(float *a,float *b,float *c,int n)
{
__shared__ float aTile[4][4],bTile[4][4];
int row=blockIdx.y*blockDim.y+threadIdx.y;
int col=blockIdx.x*blockDim.x+threadIdx.x;
float sum=0.0;
aTile[threadIdx.y][threadIdx.x]=a[row*n+threadIdx.x];
bTile[threadIdx.y][threadIdx.x]=b[threadIdx.y*n+col];
__syncthreads();
for(int i=0;i<n;i++)
{
sum+=aTile[threadIdx.y][i]*bTile[i][threadIdx.x];
//printf("\n%f",sum);
}
c[row*n+col]=sum;
}
int main()
{
float *a_h,*b_h,*c_h,*a_d,*b_d,*c_d;
int i,n;
n=4;
size_t size=sizeof(float)*(n*n);
a_h=(float*)malloc(size);
b_h=(float*)malloc(size);
c_h=(float*)malloc(size);
cudaMalloc((void**)&a_d,size);
cudaMalloc((void**)&b_d,size);
cudaMalloc((void**)&c_d,size);
for(i=0;i<(n*n);i++)
{
a_h[i]=2;
}
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
for(i=0;i<(n*n);i++)
{
b_h[i]=2;
}
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
Shar<<<1,16>>>(a_d,b_d,c_d,n);
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
printf("\nMultiplication Of Matrix");
for(i=0;i<(n*n);i++)
{
printf("\n%f",c_h[i]);
}
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
return 0;
}