Run ptx (mma.sync.aligned.kind::mxf8f6f4.block_scale.scale_vec::1X.m16n8k32) on sm_120a

Hi There,

I am curious how to setup the input A/B/C and Scaling Factors in register space for the following tensor core insturction.
Thanks in advance.

// MMA.SF 16x8x32 TN E5M2 x E5M2 with SF UE8M0
template <int VS>
struct SM120_16x8x32_TN_VS<float_e5m2_t, float_e5m2_t, float, float_ue8m0_t, VS>
{
  using DRegisters = float[4];
  using ARegisters = uint32_t[4];
  using BRegisters = uint32_t[2];
  using CRegisters = float[4];

  using SFARegisters = uint8_t[1];
  using SFBRegisters = uint8_t[1];

  CUTE_HOST_DEVICE static void
  fma(float         & d0, float         & d1, float         & d2, float         & d3,
      uint32_t const& a0, uint32_t const& a1, uint32_t const& a2, uint32_t const& a3,
      uint32_t const& b0, uint32_t const& b1,
      float const   & c0, float const   & c1, float const   & c2, float const   & c3,
      uint8_t const& sfa0,
      uint8_t const& sfb0)
  {
#if defined(CUTE_ARCH_MXF8F6F4_MMA_ENABLED)
    static constexpr uint16_t tidA = 0;
    static constexpr uint16_t bidA = 0;
    static constexpr uint16_t tidB = 0;
    static constexpr uint16_t bidB = 0;

    CUTE_STATIC_ASSERT(VS == 32, "Scaling factor vector size has to be 32 for MXF8F6F4 MMA.");

    asm volatile(
    "mma.sync.aligned.kind::mxf8f6f4.block_scale.scale_vec::1X.m16n8k32.row.col.f32.e5m2.e5m2.f32.ue8m0 "
    "{%0,  %1,  %2,  %3},"
    "{%4,  %5,  %6,  %7},"
    "{%8,  %9},"
    "{%10, %11, %12, %13},"
    "{%14},"
    "{%15, %16},"
    "{%17},"
    "{%18, %19};\n"
    :  "=f"(d0),  "=f"(d1),  "=f"(d2),  "=f"(d3)
    :   "r"(a0),   "r"(a1),   "r"(a2),   "r"(a3),
        "r"(b0),   "r"(b1),
        "f"(c0),   "f"(c1),   "f"(c2),   "f"(c3),
        "r"(uint32_t(sfa0)) , "h"(bidA), "h"(tidA),
        "r"(uint32_t(sfb0)) , "h"(bidB), "h"(tidB));

#else
    CUTE_INVALID_CONTROL_PATH("Attempting to use SM120::BLOCKSCALED::SM120_16x8x32_TN_VS without CUTE_ARCH_MXF8F6F4_MMA_ENABLED");
#endif
  }
};

Block scaling is explained in the ptx documentation. 1. Introduction — PTX ISA 8.7 documentation