Hello,

I am writing a matrix–matrix multiplication program for the GPU (for square matrices). I have two kernels — one reads the elements of the matrices directly from global memory, and the other uses shared memory to coalesce the reads of both input matrices.

```
// Naive matrix-matrix multiply C = A * B for square n x n matrices,
// reading directly from global memory.
//
// Launch contract: a single block with blockDim.x >= n and blockDim.y >= n
// (in the original usage blockDim.x == blockDim.y == n, n <= 32).
// d_C must not alias d_A or d_B (required by __restrict__ and by the math).
//
// Note on access patterns (this is why shared memory gives no speed-up here):
//   d_A[i*n+k] — all threads of a warp share i, so the read is a broadcast
//                serviced by the cache in one transaction;
//   d_B[k*n+j] — consecutive j across a warp, so the read is fully coalesced.
__global__ void Kernel_global(float* d_C, const float* __restrict__ d_A,
                              const float* __restrict__ d_B, int n){
    const int j = threadIdx.x;  // column of C
    const int i = threadIdx.y;  // row of C
    // Guard: makes the kernel safe if the block is launched larger than n x n.
    if (i >= n || j >= n) return;
    float temp = 0.f;
    #pragma unroll 2
    for (int k = 0; k < n; ++k) {
        temp += d_A[i * n + k] * d_B[k * n + j];
    }
    d_C[i * n + j] = temp;
}
// Shared-memory variant of the square matrix multiply C = A * B.
//
// Launch contract: a single block with blockDim.x >= n and blockDim.y >= n
// (original usage: blockDim == n x n, n <= 32), and dynamic shared memory of
// 2 * n * n * sizeof(float) passed as the third launch parameter.
// d_C must not alias d_A or d_B.
//
// Fix vs. the original: the staging stores used blockDim.x as the row stride
// while the compute loop used n. That is only correct when blockDim.x == n
// exactly; both phases now index with n consistently.
__global__ void Kernel_shared(float* d_C, const float* __restrict__ d_A,
                              const float* __restrict__ d_B, int n){
    extern __shared__ float in_mats[];
    float* s_A = in_mats;          // first n*n floats: copy of A
    float* s_B = in_mats + n * n;  // next  n*n floats: copy of B
    const int j = threadIdx.x;  // column of C
    const int i = threadIdx.y;  // row of C
    // Threads outside the n x n tile do no work, but they must still reach
    // the barrier below — never place __syncthreads() in divergent flow.
    const bool active = (i < n) && (j < n);
    if (active) {
        s_A[i * n + j] = d_A[i * n + j];
        s_B[i * n + j] = d_B[i * n + j];
    }
    __syncthreads();  // all staging stores visible before any thread reads
    if (!active) return;  // safe: no further barriers in this kernel
    float temp = 0.f;
    #pragma unroll 2
    for (int k = 0; k < n; ++k) {
        // s_A[i*n+k]: same address across a warp row -> broadcast, no bank
        // conflict; s_B[k*n+j]: consecutive j -> consecutive banks.
        temp += s_A[i * n + k] * s_B[k * n + j];
    }
    d_C[i * n + j] = temp;
}
```

The kernel is run on only one SM, with all threads in a single block; since a block holds at most 1024 threads, the maximum matrix dimension is limited to 32×32 (= 1024 threads).

I am really surprised that both kernels have equal execution times on a GTX 460. I expected a speed-up due to coalesced reading of matrix d_A. Can anyone clarify this for me, please?