Hi There,
I am curious how to setup the input A/B/C and Scaling Factors in register space for the following tensor core insturction.
Thanks in advance.
// MMA.SF 16x8x32 TN E5M2 x E5M2 with SF UE8M0
template <int VS>
struct SM120_16x8x32_TN_VS<float_e5m2_t, float_e5m2_t, float, float_ue8m0_t, VS>
{
using DRegisters = float[4];
using ARegisters = uint32_t[4];
using BRegisters = uint32_t[2];
using CRegisters = float[4];
using SFARegisters = uint8_t[1];
using SFBRegisters = uint8_t[1];
CUTE_HOST_DEVICE static void
fma(float & d0, float & d1, float & d2, float & d3,
uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
uint32_t const& b0, uint32_t const& b1,
float const & c0, float const & c1, float const & c2, float const & c3,
uint8_t const& sfa0,
uint8_t const& sfb0)
{
#if defined(CUTE_ARCH_MXF8F6F4_MMA_ENABLED)
static constexpr uint16_t tidA = 0;
static constexpr uint16_t bidA = 0;
static constexpr uint16_t tidB = 0;
static constexpr uint16_t bidB = 0;
CUTE_STATIC_ASSERT(VS == 32, "Scaling factor vector size has to be 32 for MXF8F6F4 MMA.");
asm volatile(
"mma.sync.aligned.kind::mxf8f6f4.block_scale.scale_vec::1X.m16n8k32.row.col.f32.e5m2.e5m2.f32.ue8m0 "
"{%0, %1, %2, %3},"
"{%4, %5, %6, %7},"
"{%8, %9},"
"{%10, %11, %12, %13},"
"{%14},"
"{%15, %16},"
"{%17},"
"{%18, %19};\n"
: "=f"(d0), "=f"(d1), "=f"(d2), "=f"(d3)
: "r"(a0), "r"(a1), "r"(a2), "r"(a3),
"r"(b0), "r"(b1),
"f"(c0), "f"(c1), "f"(c2), "f"(c3),
"r"(uint32_t(sfa0)) , "h"(bidA), "h"(tidA),
"r"(uint32_t(sfb0)) , "h"(bidB), "h"(tidB));
#else
CUTE_INVALID_CONTROL_PATH("Attempting to use SM120::BLOCKSCALED::SM120_16x8x32_TN_VS without CUTE_ARCH_MXF8F6F4_MMA_ENABLED");
#endif
}
};