Why does the default GEMM configuration in CUTLASS use a ThreadblockShape of [128, 128, 8]? I understand that BlockM (128) and BlockN (128) may be chosen based on arithmetic intensity, but why is BlockK set to 8?
// include/cutlass/gemm/device/default_gemm_configuration.h
/// Default GEMM configuration selected when the operator class is
/// arch::OpClassSimt.
///
/// This partial specialization fixes only the operator class; it applies to
/// every ArchTag and every combination of A/B/C/accumulator element types.
template <typename ArchTag,
          typename ElementA,
          typename ElementB,
          typename ElementC,
          typename ElementAccumulator>
struct DefaultGemmConfiguration<arch::OpClassSimt,
                                ArchTag,
                                ElementA,
                                ElementB,
                                ElementC,
                                ElementAccumulator> {
  // Alignment constants for operands A and B.
  // NOTE(review): presumably expressed in elements (1 == no vectorized
  // access requirement) -- confirm against the kernels that consume them.
  static const int kAlignmentA = 1;
  static const int kAlignmentB = 1;

  // Tile shapes at the threadblock, warp, and instruction levels.
  // NOTE(review): GemmShape arguments are presumably <M, N, K> -- confirm
  // against the GemmShape definition.
  using ThreadblockShape = GemmShape<128, 128, 8>;
  using WarpShape = GemmShape<32, 64, 8>;
  using InstructionShape = GemmShape<1, 1, 1>;

  // NOTE(review): presumably the number of software pipeline stages in the
  // mainloop -- confirm against the kernel that reads kStages.
  static const int kStages = 2;

  // Epilogue functor: a LinearCombination producing ElementC outputs.
  // NOTE(review): the literal 1 is presumably the per-access element count,
  // and the two trailing ElementAccumulator arguments the accumulator and
  // compute types -- confirm against LinearCombination's parameter list.
  using EpilogueOutputOp =
      epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>;

  // Math operation tag used by this configuration.
  using Operator = arch::OpMultiplyAdd;
};