I have a kernel which is using 97 registers with a headroom of 71 registers (obtained from nsight compute, as shown below)
What bothers me is that I have a cute::Tensor that I’d like to be placed in register file, but the compiler decides to place them on local memory, as indicated by both the compilation output and nsight compute profiling output
352 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 126 registers, 25344 bytes smem, 528 bytes cmem[0]
I know that dynamically indexing array is not supported in register files so I have some template programming to statically determine the index.
...
#define BINARY_DIM_SWITCH_2(VALUE, CONST_NAME, LAMBDA) \
if (VALUE == 0) \
{ \
constexpr static int CONST_NAME = 0; \
LAMBDA(); \
} \
else \
{ \
constexpr static int CONST_NAME = 1; \
LAMBDA(); \
}
#define BINARY_DIM_SWITCH_4(VALUE, CONST_NAME, LAMBDA) \
if (VALUE < 2) \
{ \
BINARY_DIM_SWITCH_2(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
BINARY_DIM_SWITCH_2(VALUE - 2, CONST_NAME##_offset, [&]() { \
constexpr static int CONST_NAME = CONST_NAME##_offset + 2; \
LAMBDA(); }); \
}
#define BINARY_DIM_SWITCH_8(VALUE, CONST_NAME, LAMBDA) \
if (VALUE < 4) \
{ \
BINARY_DIM_SWITCH_4(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
BINARY_DIM_SWITCH_4(VALUE - 4, CONST_NAME##_offset, [&]() { \
constexpr static int CONST_NAME = CONST_NAME##_offset + 4; \
LAMBDA(); }); \
}
#define BINARY_DIM_SWITCH_16(VALUE, CONST_NAME, LAMBDA) \
if (VALUE < 8) \
{ \
BINARY_DIM_SWITCH_8(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
BINARY_DIM_SWITCH_8(VALUE - 8, CONST_NAME##_offset, [&]() { \
constexpr static int CONST_NAME = CONST_NAME##_offset + 8; \
LAMBDA(); }); \
}
#define BINARY_DIM_SWITCH_32(VALUE, CONST_NAME, LAMBDA) \
if (VALUE < 16) \
{ \
BINARY_DIM_SWITCH_16(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
BINARY_DIM_SWITCH_16(VALUE - 16, CONST_NAME##_offset, [&]() { \
constexpr static int CONST_NAME = CONST_NAME##_offset + 16; \
LAMBDA(); }); \
}
#define BINARY_DIM_SWITCH_64(VALUE, CONST_NAME, LAMBDA) \
if (VALUE < 32) \
{ \
BINARY_DIM_SWITCH_32(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
BINARY_DIM_SWITCH_32(VALUE - 32, CONST_NAME##_offset, [&]() { \
constexpr static int CONST_NAME = CONST_NAME##_offset + 32; \
LAMBDA(); }); \
}
#define BINARY_DIM_SWITCH(VALUE, CONST_NAME, HEADDIM, LAMBDA) \
if (HEADDIM == 1 && VALUE == 0) \
{ \
constexpr static int CONST_NAME = 0; \
LAMBDA(); \
} \
else if (HEADDIM == 2) \
{ \
BINARY_DIM_SWITCH_2(VALUE, CONST_NAME, LAMBDA); \
} \
else if (HEADDIM == 4) \
{ \
BINARY_DIM_SWITCH_4(VALUE, CONST_NAME, LAMBDA); \
} \
else if (HEADDIM == 8) \
{ \
BINARY_DIM_SWITCH_8(VALUE, CONST_NAME, LAMBDA); \
} \
else if (HEADDIM == 16) \
{ \
BINARY_DIM_SWITCH_16(VALUE, CONST_NAME, LAMBDA); \
} \
else if (HEADDIM == 32) \
{ \
BINARY_DIM_SWITCH_32(VALUE, CONST_NAME, LAMBDA); \
} \
else if (HEADDIM == 64) \
{ \
BINARY_DIM_SWITCH_64(VALUE, CONST_NAME, LAMBDA); \
} \
else \
{ \
static_assert(HEADDIM == 1 || HEADDIM == 2 || HEADDIM == 4 || HEADDIM == 8 || HEADDIM == 16 || HEADDIM == 32 || HEADDIM == 64, "Unsupported HEADDIM value"); \
}
...
template <int DIM, typename Tensor, typename T>
__forceinline__ __device__ void static_add(Tensor &arr, T val)
{
arr[DIM] += val;
}
...
auto rdQK_acc = make_tensor<ElementAccum>(Shape<Int<Headdim>>{}));
...
BINARY_DIM_SWITCH(rD(i_mod_stage_D, d), DIM, Headdim, [&]()
{ static_add<DIM>(rdQK_acc, rdQK_acc_buffer[d]); });
...
where the first bits of code defines a macro for doing switch case on the index, so I think compiler should be able to resolve the index statically.
Despite the effort, the tensor is still placed in local memory. What am I missing? Is this possibly cute related?