I wrote a simple SGEMM kernel that uses shared memory:
// Tiled SGEMM: C (M x N) = A (M x K) * B (K x N), all matrices row-major,
// one output element per thread, square tiles staged through shared memory.
//
// Preconditions:
//   - BM == BN == BK (enforced at compile time below).
//   - Expected launch: blockDim = (BN, BM), gridDim = (ceil(N/BN), ceil(M/BM)).
//     The launch site is not part of this snippet; the tile indexing below is
//     only valid when threadIdx.x < BN and threadIdx.y < BM.
//   - Static shared memory: (BM*BK + BK*BN) * sizeof(float) bytes per block.
//
// M, N and K do not need to be multiples of the tile size: out-of-range tile
// elements are zero-filled and only in-range threads write to C.
template <const int BM, const int BN, const int BK>
__global__ void matmul_shared(float *A, float *B, float *C, int M, int N, int K) {
    static_assert(BM == BK && BN == BK,
                  "matmul_shared indexes both tiles with (ty, tx); it requires BM == BN == BK");

    __shared__ float s_a[BM][BK], s_b[BK][BN];

    const int tx = threadIdx.x, ty = threadIdx.y;
    const int x = blockIdx.x * BN + tx;  // column of C handled by this thread
    const int y = blockIdx.y * BM + ty;  // row of C handled by this thread
    const bool row_ok = y < M;
    const bool col_ok = x < N;

    float reg = 0.0f;

    // Every thread of the block runs this loop, even when its own (y, x) lies
    // outside C: it still has to load its tile slot (other threads read it)
    // and it must reach both barriers, otherwise __syncthreads() is undefined.
    for (int offs = 0; offs < K; offs += BK) {
        const int a_col = offs + tx;  // column of A staged by this thread
        const int b_row = offs + ty;  // row of B staged by this thread

        // Zero-fill instead of skipping so shared memory is never read
        // uninitialized; size_t avoids int overflow in the flat index.
        s_a[ty][tx] = (row_ok && a_col < K) ? A[(size_t)y * K + a_col] : 0.0f;
        s_b[ty][tx] = (col_ok && b_row < K) ? B[(size_t)b_row * N + x] : 0.0f;
        __syncthreads();

        // Number of valid k-steps in this tile (the last tile may be partial).
        const int k_end = (K - offs < BK) ? (K - offs) : BK;
        for (int k = 0; k < k_end; ++k)
            reg += s_a[ty][k] * s_b[k][tx];

        // Keep the next iteration's loads from overwriting a tile that is
        // still being read by slower threads.
        __syncthreads();
    }

    if (row_ok && col_ok)
        C[(size_t)y * N + x] = reg;
}
I ran two tests:
test 1, BM = BN = BK = 32;
test 2, BM = BN = BK = 16.
In my test, M = N = K = 1024.
In both cases there are no bank conflicts on shared-memory loads or stores, but Nsight Compute (NCU) also reports an “other bank conflicts” metric, and its value is nonzero in test 1. What does this value mean? Why is it 0 in test 2 but not in test 1?