I was trying CUTLASS out and was evaluating possible tuning parameters (to achieve the best results to compare them to different approaches).
When configuring the GemmTraits per
typedef cutlass::gemm::SgemmTraits<
cutlass::MatrixLayout::kColumnMajor, // Layout of A matrix
cutlass::MatrixLayout::kColumnMajor, // Layout of B matrix
cutlass::Shape<BLOCK_K, BLOCK_N, BLOCK_M>, // Threadblock tile size
// The shape of the matrix multiply operation performed by each iteration of the mainloop.
cutlass::gemm::LinearScaling<float>, // Output Functor
cutlass::Shape<THREAD_K, THREAD_N, THREAD_M> // Tile size for thread-level GEMM
// (K-by-N-by-M)
> GemmTraits;
I get division by zero compiler errors when supplying smaller values than the (in the simplified SGemmTraits somewhat hardcoded) <8, 8, 8> shape as the last parameter (e.g. THREAD_K, THREAD_N and THREAD_M are all 4).
The error message is quite lengthy, but would indicate a problem in the epilogue. This is the first block of several division by zero errors:
nvcc -o cutlass_gemm cutlass_gemm.cu -gencode=arch=compute_75,code=sm_75 -I.
./cutlass/fragment_multiply_add.h(55): warning: division by zero
detected during:
instantiation of "void cutlass::gemm::FragmentMultiplyAdd<ScalarAlphaBeta_, ScalarAccum_, fragMul2>::multiply(cutlass::gemm::FragmentMultiplyAdd<ScalarAlphaBeta_, ScalarAccum_, fragMul2>::ScalarAlphaBeta, const FragmentB_ &, FragmentCd_ &) [with ScalarAlphaBeta_=float, ScalarAccum_=float, fragMul2=true, FragmentB_=cutlass::Fragment<float, 0, 16UL>, FragmentCd_=cutlass::Fragment<float, 0, 16UL>]"
./cutlass/gemm/linear_scaling.h(141): here
instantiation of "void cutlass::gemm::LinearScaling<Scalar_, FragmentMultiplyAdd_>::evaluate(const FragmentA_ &, const FragmentB_ &, FragmentB_ &) [with Scalar_=float, FragmentMultiplyAdd_=cutlass::gemm::FragmentMultiplyAdd<float, float, true>, FragmentA_=cutlass::Fragment<float, 0, 16UL>, FragmentB_=cutlass::Fragment<float, 0, 16UL>]"
./cutlass/gemm/gemm_epilogue.h(189): here
instantiation of "void cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::epilogue_with_or_without_beta<kSourceRequired>(cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::Accumulators &, const cutlass::Coord<3, int> &, int) [with GemmEpilogueTraits_=cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>, kSourceRequired=true]"
./cutlass/gemm/gemm_epilogue.h(98): here
instantiation of "void cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::epilogue(cutlass::gemm::GemmEpilogue<GemmEpilogueTraits_>::Accumulators &, const cutlass::Coord<3, int> &, int) [with GemmEpilogueTraits_=cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>]"
./cutlass/gemm/gemm_mainloop.h(267): here
instantiation of "void cutlass::gemm::GemmMainloop<Traits_>::multiply_add() [with Traits_=cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>]"
cutlass/gemm/gemm.h(81): here
instantiation of "void cutlass::gemm::gemm_kernel_nolb<Gemm_>(Gemm_::Params) [with Gemm_=cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>>]"
cutlass/gemm/gemm.h(132): here
instantiation of "cutlass::gemm::Launch<Gemm, false>::Launch(Gemm::Params, dim3, dim3, cudaStream_t) [with Gemm=cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kA, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kA, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 64, 32, 1>, 1>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::GlobalLoadStream<cutlass::GemmOperand::kB, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<cutlass::GemmOperand::kB, cutlass::MatrixLayout::kColumnMajor, const float, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 256, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 256, 8, 1>, 1, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 0, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<const float, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<2, 8, 4, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, cutlass::IteratorAdvance::kH, cutlass::MemorySpace::kShared, int, float, cutlass::FragmentElementType::kScalar, cutlass::Shape<0, 0, 0, 0>>, cutlass::Copy<cutlass::Fragment<float, 4, 16UL>>>, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1>>>]"
cutlass/gemm/gemm.h(226): here
instantiation of "cudaError_t cutlass::gemm::Gemm<Traits_>::launch(const cutlass::gemm::Gemm<Traits_>::Params &, cudaStream_t) [with Traits_=cutlass::gemm::SgemmTraits<cutlass::MatrixLayout::kColumnMajor, cutlass::MatrixLayout::kColumnMajor, cutlass::Shape<8, 128, 128, 1>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, cutlass::Shape<4, 4, 4, 1>, 1, 1, int, cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<4, 4, 4, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true>>, int>>>]"
cutlass_gemm.cu(116): here
The thread block tile size is constrained as follows (after searching through all the templates and dealing with different, but easier to spot division by zero errors):
BLOCK_K >= THREAD_K * 1
BLOCK_N >= THREAD_N * 4
BLOCK_M >= THREAD_M * 8
but I couldn’t figure out any constraints for the thread tile size. Are there any?
Thank you for any help. :)