I am implementing matrix multiplication using shared memory. The algorithm is based on David Kirk's lectures, and the program builds without any errors, but the output shows garbage values. Despite my best efforts, I am unable to find the error in the algorithm.
Please help me.
The kernel function is as follows:
/*
 * Tiled integer matrix multiplication using shared memory: Pd = Md * Nd.
 *
 * All three matrices are indexed as row-major Width x Width arrays
 * (element (r, c) lives at index r * Width + c).
 *
 * Launch requirements:
 *   - blockDim must be (TILE_WIDTH, TILE_WIDTH) = (2, 2); the shared tiles are
 *     indexed directly by threadIdx, so a larger block would overrun them.
 *   - gridDim must cover the output, i.e. at least
 *     ceil(Width / 2) blocks in both x and y. Extra blocks/threads are safe.
 *   - Static shared memory: 2 tiles of TILE_WIDTH * TILE_WIDTH ints.
 *
 * Parameters:
 *   Md, Nd  input matrices (device pointers, read only here)
 *   Pd      output matrix (device pointer)
 *   Width   side length of the matrices; does not need to be a multiple of
 *           the tile width. Width <= 0 results in no writes.
 */
__global__ void MatrixMulKernel(int *Md, int *Nd, int *Pd, int Width)
{
    const int TILE_WIDTH = 2;

    __shared__ int Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ int Nds[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Row and column of the Pd element this thread is responsible for.
    const int Row = blockIdx.y * TILE_WIDTH + ty;
    const int Col = blockIdx.x * TILE_WIDTH + tx;

    int Pvalue = 0;

    // Ceil-division so the last, partially filled tile is processed too
    // when Width is not a multiple of TILE_WIDTH.
    const int numTiles = (Width + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int m = 0; m < numTiles; ++m)
    {
        const int mCol = m * TILE_WIDTH + tx;  // column of Md loaded by this thread
        const int nRow = m * TILE_WIDTH + ty;  // row of Nd loaded by this thread

        // Collaborative loading of the Md and Nd tiles into shared memory.
        // Out-of-range positions are filled with 0 so they add nothing to
        // the dot product and no out-of-bounds global read happens.
        Mds[ty][tx] = (Row < Width && mCol < Width) ? Md[Row * Width + mCol] : 0;
        Nds[ty][tx] = (nRow < Width && Col < Width) ? Nd[nRow * Width + Col] : 0;

        // Every thread of the block reaches this barrier (no early exits),
        // so the tiles are complete before anyone reads them.
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            Pvalue += Mds[ty][k] * Nds[k][tx];
        }

        // Do not overwrite the tiles until all threads finished using them.
        __syncthreads();
    }

    // Single store after all tiles have been accumulated; threads that fall
    // outside the matrix only helped with loading and write nothing.
    if (Row < Width && Col < Width)
    {
        Pd[Row * Width + Col] = Pvalue;
    }
}