Hi! I am learning CUTLASS, and I came across the following example (from the official post):
/// CUTLASS SGEMM example
///
/// Configures a `block_task` specialization for single-precision operands and
/// runs it from inside a CUDA kernel, using statically-allocated shared
/// storage as scratch space.
///
/// @param C  Mutable float buffer passed to the block task as the third matrix
///           operand (presumably the M x N output matrix - TODO confirm).
/// @param A  Read-only float buffer (presumably the M x K left operand -
///           TODO confirm).
/// @param B  Read-only float buffer (presumably the K x N right operand -
///           TODO confirm).
/// @param M  First problem dimension forwarded to the block task.
/// @param N  Second problem dimension forwarded to the block task.
/// @param K  Third problem dimension forwarded to the block task.
__global__ void gemm_kernel(
    float *C,
    float const *A,
    float const *B,
    int M,
    int N,
    int K) {

    // Define the GEMM tile sizes - discussed in next section
    typedef block_task_policy <
        128,                        // BlockItemsY: Height in rows of a tile
        32,                         // BlockItemsX - Width in columns of a tile
        8,                          // ThreadItemsY - Height in rows of a thread-tile
        4,                          // ThreadItemsX - Width in columns of a thread-tile
        8,                          // BlockItemsK - Depth of a tile
        true,                       // UseDoubleScratchTiles - whether to double-buffer SMEM
        block_raster_enum::Default  // Block rasterization strategy
    > block_task_policy_t;

    // Define the epilogue functor
    typedef gemm::blas_scaled_epilogue<float, float, float> epilogue_op_t ;

    // Define the block_task type.
    typedef block_task <
        block_task_policy_t,
        float,
        float,
        matrix_transform_t::NonTranspose,
        4,
        matrix_transform_t::NonTranspose,
        4,
        epilogue_op_t,
        4,
        true
    > block_task_t;

    // Declare statically-allocated shared storage
    __shared__ block_task_t::scratch_storage_t smem;

    // Construct and run the task
    // NOTE(review): the template argument of the reinterpret_cast below
    // (`<...>`) appears to have been stripped when the post was rendered, and
    // it is unclear whether that line and the following `&smem` are meant to
    // be one argument or two - confirm against the CUTLASS sources.
    // NOTE(review): epilogue_op_t(1, 0) presumably means alpha = 1, beta = 0
    // (C = A * B) - confirm against blas_scaled_epilogue.
    block_task_t(
        reinterpret_cast(&smem),
        &smem,
        A,
        B,
        C,
        epilogue_op_t(1, 0),
        M,
        N,
        K).run();
}
This example is meant to guide usage, but from it I cannot see the base-level implementation of the GEMM. I guess it must exist somewhere! But the GitHub page of CUTLASS is kind of… messy… I tried hard to find it myself, but I really cannot…
Could anyone kindly provide me a link? Thank you!!!
================
By the way, I did see a “naive gemm” in the CUTLASS GitHub repository. Sorry, that is not what I am looking for! Haha!