CUTLASS: Division by Zero when using smaller threadtile sizes

I was trying CUTLASS out and was evaluating possible tuning parameters (to achieve the best results to compare them to different approaches).

When configuring the GemmTraits per

typedef cutlass::gemm::SgemmTraits<
        cutlass::MatrixLayout::kColumnMajor,           // Layout of A matrix
        cutlass::MatrixLayout::kColumnMajor,           // Layout of B matrix
        cutlass::Shape<BLOCK_K, BLOCK_N, BLOCK_M>,     // Threadblock tile size
                                                       // The shape of the matrix multiply operation performed by each iteration of the mainloop.
        cutlass::gemm::LinearScaling<float>,           // Output Functor
        cutlass::Shape<THREAD_K, THREAD_N, THREAD_M>   // Tile size for thread-level GEMM 
                                                       // (K-by-N-by-M)

    > GemmTraits;

I get division by zero compiler errors when supplying smaller values than the (in the simplified SGemmTraits somewhat hardcoded) <8, 8, 8> shape as the last parameter (e.g. THREAD_K, THREAD_N and THREAD_M are all 4).

The error message is quite lengthy, but would indicate a problem in the epilogue. This is the first block of several division by zero errors:

nvcc -o cutlass_gemm cutlass_gemm.cu -gencode=arch=compute_75,code=sm_75 -I.
./cutlass/fragment_multiply_add.h(55): warning: division by zero
          detected during:
            instantiation of "void cutlass::gemm::FragmentMultiplyAdd<ScalarAlphaBeta_, ScalarAccum_, fragMul2>::multiply(cutlass::gemm::FragmentMultiplyAdd<ScalarAlphaBeta_, ScalarAccum_, fragMul2>::ScalarAlphaBeta, const FragmentB_ &, FragmentCd_ &) [with ScalarAlphaBeta_=float, ScalarAccum_=float, fragMul2=true, FragmentB_=cutlass::Fragment<float, 0, 16UL>, FragmentCd_=cutlass::Fragment<float, 0, 16UL>]" 
./cutlass/gemm/linear_scaling.h(141): here
            instantiation of "void cutlass::gemm::LinearScaling<Scalar_, FragmentMultiplyAdd_>::evaluate(const FragmentA_ &, const FragmentB_ &, FragmentB_ &) [with Scalar_=float, FragmentMultiplyAdd_=cutlass::gemm::FragmentMultiplyAdd<float, float, true>, FragmentA_=cutlass::Fragment<float, 0, 16UL>, FragmentB_=cutlass::Fragment<float, 0, 16UL>]" 
./cutlass/gemm/gemm_epilogue.h(189): here
            instantiation of "void cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::epilogue_with_or_without_beta<kSourceRequired>(cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::Accumulators &, const cutlass::Coord<3, int> &, int) [with GemmEpilogueTraits_=cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>, kSourceRequired=true]" 
./cutlass/gemm/gemm_epilogue.h(98): here
            instantiation of "void cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::epilogue(cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::Accumulators &, const cutlass::Coord<3, int> &, int) [with GemmEpilogueTraits_=cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>]" 
./cutlass/gemm/gemm_mainloop.h(267): here
            instantiation of "void cutlass::gemm::GemmMainloop<Traits_>::multiply_add() [with Traits_=cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>]" 
cutlass/gemm/gemm.h(81): here
            instantiation of "void cutlass::gemm::gemm_kernel_nolb<Gemm_>(Gemm_::Params) [with Gemm_=cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>>]" 
cutlass/gemm/gemm.h(132): here
            instantiation of "cutlass::gemm::Launch<Gemm, false>::Launch(Gemm::Params, dim3, dim3, cudaStream_t) [with Gemm=cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>>]" 
cutlass/gemm/gemm.h(226): here
            instantiation of "cudaError_t cutlass::gemm::Gemm<Traits_>::launch(const cutlass::gemm::Gemm<Traits_>::Params &, cudaStream_t) [with Traits_=cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kColumnMajor, cutlass::Shape<8, 128, 128, 1>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, cutlass::Shape<4, 4, 4, 1>, 1, 1, int, cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>]" 
cutlass_gemm.cu(116): here

The thread block tile size is constrained as follows (after searching through all the templates and dealing with different, but easier to spot division by zero errors):

BLOCK_K >= THREAD_K * 1
BLOCK_N >= THREAD_N * 4
BLOCK_M >= THREAD_M * 8

but I couldn’t figure out any constraints for the thread tile size. Are there any?

Thank you for any help. :)