The 32*32 tiled matrix multiplication kernel has store bank conflicts which couldn’t be explained:
constexpr int DSIZE = 8192;
constexpr int block_size = 32;
// matrix multiply kernel: C = A * B
__global__ void mmul(const float *A, const float *B, float *C, int ds) {
// declare cache in shared memory
__shared__ float As[block_size][block_size];
__shared__ float Bs[block_size][block_size];
int idx = threadIdx.x + blockDim.x * blockIdx.x; // create thread x index
int idy = threadIdx.y + blockDim.y * blockIdx.y; // create thread y index
if ((idx < ds) && (idy < ds)) {
float temp = 0;
for (int i = 0; i < ds / block_size; i++) {
// Load data into shared memory. HAS store bank conflicts!
As[threadIdx.y][threadIdx.x] =
A[idy * ds + (i * block_size + threadIdx.x)];
Bs[threadIdx.y][threadIdx.x] =
B[(i * block_size + threadIdx.y) * ds + idx];
__syncthreads();
for (int k = 0; k < block_size; k++)
// Keep track of the running sum. NO load bank conflicts!!!
temp += As[threadIdx.y][k] *
Bs[k][threadIdx.x]; // dot product of row and column
__syncthreads();
}
// Write to global memory
C[idy * ds + idx] = temp;
}
}
The value of threadIdx.x was in [0, 32) so the shared memory store ops seemed to be conflict free. But ncu said there were a lot of bank conflicts:
mmul(const float *, const float *, float *, int) (256, 256, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
Metric Name Metric Unit Metric Value
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 252249457
It’s also strange that the load ops had no conflicts with the same addressing pattern.
Furthermore, the same kernel with block_size = 16
has no store/load bank conflicts:
mmul(const float *, const float *, float *, int) (512, 512, 1)x(16, 16, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
Metric Name Metric Unit Metric Value
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
Can anyone help explain what’s the root cause of bank conflicts for 32*32 tiling? Thanks!