I’m trying to implement custom behaviors on top of the flash-attn 3 (Hopper) codebase.
Code can be found here.
Building the library with the default options is no problem, but compilation takes far too long once I add the nvcc -G flag (or --ptxas-options=-g) to debug the device code.
nvcc_flags = [
"-std=c++17",
# "-U__CUDA_NO_HALF_OPERATORS__",
# "-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
# "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", # printing out number of registers
# "-lineinfo",
"-DCUTLASS_DEBUG_TRACE_LEVEL=0",
"-DNDEBUG",
"-gencode",
"arch=compute_90a,code=sm_90a",
"--threads",
"16",
"-g", # Host code debug
"-O0", # Host code opt level
"-G", # Device code debug
"-DFLASH_DEBUG",
]
Above is the full list of nvcc flags I’m using.
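For context, these flags feed into a standard torch CUDAExtension build. Below is a simplified, partly hypothetical sketch of my setup.py wiring (the extension name and the debug source list match the build log further down; everything else is trimmed):

# Simplified sketch of the setup.py wiring; not the full script.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

debug_mode = True  # enables the slow -G build described below

nvcc_flags = [
    "-std=c++17",
    "--expt-relaxed-constexpr",
    "--expt-extended-lambda",
    "--use_fast_math",
    "-gencode", "arch=compute_90a,code=sm_90a",
    "--threads", "16",
]
if debug_mode:
    # Host debug (-g -O0) is cheap; device debug (-G) is what explodes.
    nvcc_flags += ["-g", "-O0", "-G", "-DFLASH_DEBUG"]

setup(
    name="flashattn_2d_hopper_cuda",
    ext_modules=[
        CUDAExtension(
            name="flashattn_2d_hopper_cuda",
            sources=[
                "src/flash_api.cpp",
                "src/flash_fwd_hdim64_bf16_gqa4_sm90.cu",
                "src/flash_bwd_hdim64_bf16_sm90.cu",
            ],
            extra_compile_args={"cxx": ["-std=c++17", "-g", "-O0"],
                                "nvcc": nvcc_flags},
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)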
- Current situation
  - Compilation without the debug flags takes about 64 seconds.
  - Adding -g -O0 to cxx_flags compiles with no problem.
  - Adding -g -O0 to nvcc_flags compiles with no problem (36 seconds).
  - Adding -G (device code debug) to nvcc_flags never finishes (>2 h, then fails).
- What I’ve tried
  - Mounted /tmp as a tmpfs ramdisk (see the TMPDIR sketch after the code snippets below).
  - Compiled only the minimum set of .cu files to reduce template instantiation:
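# build script: compile only the minimal sources in debug mode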
if debug_mode:
sources = [
"flash_api.cpp",
"flash_fwd_hdim64_bf16_gqa4_sm90.cu",
"flash_bwd_hdim64_bf16_sm90.cu",
]
  - Narrowed down some of the switches via explicit instantiation:
// flash_api.cpp
void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream,
                 bool force_split_kernel = false) {
  int dtype = 1;
  if (params.is_bf16) {
    dtype = 2;
  } else if (params.is_e4m3) {
    dtype = 3;
  }
#ifdef FLASH_DEBUG
  // Debug build: bypass the switch macros and instantiate a single config.
  run_mha_fwd_gqa_<cutlass::bfloat16_t, 64, 4>(params, stream);
#else
  PREC_SWITCH(dtype, Element, [&] {
    HEADDIM_SWITCH(params.d, kHeadSize, [&] {
      if (!params.use_gqa_packing) {
        run_mha_fwd_<Element, kHeadSize>(params, stream);
      } else {
        QUERYHEAD_SWITCH(params.h_h_k_ratio, kBlockH, [&] {
          run_mha_fwd_gqa_<Element, kHeadSize, kBlockH>(params, stream);
        });
      }
    });
  });
#endif
}
// flash_fwd_launch_template.h
template<typename T, int kBlockH>
void run_mha_fwd_hdim64_gqa(Flash_fwd_params &params, cudaStream_t stream) {
constexpr static int Headdim = 64;
constexpr static bool UseCluster = false;
using Seqlen_traits = flash::FixedSeqLenTraits;
using Seqlen_traits_Q = flash::FixedGQASeqLenTraits;
#ifdef FLASH_DEBUG
constexpr static int kNumMmaWGs = 2;
run_flash_fwd<
Flash_fwd_kernel_traits<Headdim,
/*kBlockM_=*/kNumMmaWGs * 64,
/*kBlockN_=*/128,
/*kNWarps_=*/4 + kNumMmaWGs * 4,
/*kStages_=*/2,
/*Is_Q_in_regs_=*/false,
/*kClusterM_=*/UseCluster ? 2 : 1,
/*elem_type=*/T,
/*Is_split_=*/false,
/*kBlockH_=*/kBlockH>,
/*Is_causal=*/false,
/*Is_local=*/true,
Seqlen_traits,
Seqlen_traits_Q
>(params, stream);
#else
  MMA_3WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] {
    // ... (non-debug dispatch path continues; snipped)
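A note on the ramdisk item above: judging by the tmpXXXX-style name of the build-temp directory in the log below, it seems to be created through Python’s tempfile module, which honors the TMPDIR environment variable. So instead of remounting /tmp itself, pointing TMPDIR at a dedicated tmpfs mount should also work (minimal sketch; the /mnt/ramdisk path is hypothetical):

import os

# tempfile (which produces names like /tmp/tmpXXXX.build-temp) honors TMPDIR,
# so this routes all build intermediates onto the assumed pre-mounted tmpfs.
os.environ["TMPDIR"] = "/mnt/ramdisk"

# then run the usual build, e.g. python setup.py build_ext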
My dev machine has 224 CPU cores, but increasing the ninja or nvcc thread count doesn’t help, because cicc and ptxas are not parallelizable within a single translation unit (and, as far as I understand, nvcc’s --threads only parallelizes across multiple -gencode targets, of which I have one).
The ptxas pass for a single .ptx file takes almost forever (more than 2 hours).
I’m fairly new to CUDA, so I might be missing some important options.
Is there any other way to reduce the compilation time when the device debug flag is enabled?
(edit) Here is the full build log:
[1/3] c++ -MMD -MF /tmp/tmpo2nq5vjf.build-temp/src/flash_api.o.d -pthread -B /home/ubuntu/anaconda3/envs/flash-dev/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/ubuntu/anaconda3/envs/flash-dev/include -fPIC -O2 -isystem /home/ubuntu/anaconda3/envs/flash-dev/include -fPIC -I/home/ubuntu/workspace/flash-attention/csrc/cutlass/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/TH -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/ubuntu/anaconda3/envs/flash-dev/include/python3.11 -c -c /home/ubuntu/workspace/flash-attention/2d/src/flash_api.cpp -o /tmp/tmpo2nq5vjf.build-temp/src/flash_api.o -std=c++17 -DFLASHATTENTION_ENABLE_2D -g -O0 -DFLASH_DEBUG -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=flashattn_2d_hopper_cuda -D_GLIBCXX_USE_CXX11_ABI=0
/home/ubuntu/workspace/flash-attention/2d/src/flash_api.cpp: In function ‘void run_mha_fwd(Flash_fwd_params&, cudaStream_t, bool)’:
/home/ubuntu/workspace/flash-attention/2d/src/flash_api.cpp:457:9: warning: variable ‘dtype’ set but not used [-Wunused-but-set-variable]
457 | int dtype = 1;
| ^~~~~
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output /tmp/tmpo2nq5vjf.build-temp/src/flash_fwd_hdim64_bf16_gqa4_sm90.o.d -I/home/ubuntu/workspace/flash-attention/csrc/cutlass/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/TH -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/ubuntu/anaconda3/envs/flash-dev/include/python3.11 -c -c /home/ubuntu/workspace/flash-attention/2d/src/flash_fwd_hdim64_bf16_gqa4_sm90.cu -o /tmp/tmpo2nq5vjf.build-temp/src/flash_fwd_hdim64_bf16_gqa4_sm90.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -std=c++17 -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DNDEBUG -DFLASHATTENTION_ENABLE_2D -gencode arch=compute_90a,code=sm_90a --threads 16 -g -O0 -G -DFLASH_DEBUG -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=flashattn_2d_hopper_cuda -D_GLIBCXX_USE_CXX11_ABI=0
Warning: Function too large, generated debug information may not be accurate.
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN5flash15compute_attn_wsI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi12ELi2ELb0ELi1EN7cutlass10bfloat16_tELb0ELi4EELb0ELb1ENS_30DynamicPersistentTileSchedulerILi256ELi32ELb0EEENS_12SeqLenTraitsILb0ELb0ELb0EEENS7_ILb0ELb0ELb1EEEEEvNS_21CollectiveMainloopFwdIT_XT0_EXT1_ELb1ET3_T4_E6ParamsENS_21CollectiveEpilogueFwdISB_SD_E6ParamsENT2_6ParamsESD_SC_'
ptxas info : (C7511) Potential Performance Loss: wgmma.mma_async instructions are serialized due to insufficient register resources for the wgmma pipeline in the function '_ZN4cute28SM90_64x64x16_F32BF16BF16_RSILNS_4GMMA5MajorE0ELS2_1ELNS1_7ScaleInE1ELS3_1EE3fmaERKjS6_S6_S6_RKmRfS9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_NS1_8ScaleOutE'
ptxas info : (C7511) Potential Performance Loss: wgmma.mma_async instructions are serialized due to insufficient register resources for the wgmma pipeline in the function '_ZN4cute29SM90_64x128x16_F32BF16BF16_SSILNS_4GMMA5MajorE0ELS2_0ELNS1_7ScaleInE1ELS3_1EE3fmaERKmS6_RfS7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_NS1_8ScaleOutE'
[3/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output /tmp/tmpo2nq5vjf.build-temp/src/flash_bwd_hdim64_bf16_sm90.o.d -I/home/ubuntu/workspace/flash-attention/csrc/cutlass/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/TH -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/ubuntu/anaconda3/envs/flash-dev/include/python3.11 -c -c /home/ubuntu/workspace/flash-attention/2d/src/flash_bwd_hdim64_bf16_sm90.cu -o /tmp/tmpo2nq5vjf.build-temp/src/flash_bwd_hdim64_bf16_sm90.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -std=c++17 -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DNDEBUG -DFLASHATTENTION_ENABLE_2D -gencode arch=compute_90a,code=sm_90a --threads 16 -g -O0 -G -DFLASH_DEBUG -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=flashattn_2d_hopper_cuda -D_GLIBCXX_USE_CXX11_ABI=0
FAILED: /tmp/tmpo2nq5vjf.build-temp/src/flash_bwd_hdim64_bf16_sm90.o
/usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output /tmp/tmpo2nq5vjf.build-temp/src/flash_bwd_hdim64_bf16_sm90.o.d -I/home/ubuntu/workspace/flash-attention/csrc/cutlass/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/TH -I/home/ubuntu/anaconda3/envs/flash-dev/lib/python3.11/site-packages/torch/include/THC -I/usr/local/cuda/include -I/home/ubuntu/anaconda3/envs/flash-dev/include/python3.11 -c -c /home/ubuntu/workspace/flash-attention/2d/src/flash_bwd_hdim64_bf16_sm90.cu -o /tmp/tmpo2nq5vjf.build-temp/src/flash_bwd_hdim64_bf16_sm90.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -std=c++17 -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DNDEBUG -DFLASHATTENTION_ENABLE_2D -gencode arch=compute_90a,code=sm_90a --threads 16 -g -O0 -G -DFLASH_DEBUG -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=flashattn_2d_hopper_cuda -D_GLIBCXX_USE_CXX11_ABI=0
Warning: Function too large, generated debug information may not be accurate.
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb1ELb1ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb1ELb0ELb1ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb0ELb1ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb1ELb0ELb1ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb1ELb0ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb0ELb0ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb1ELb0ELb0ELb1ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb1ELb0ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb0ELb0ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb1ELb0ELb0ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb1ELb1ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb0ELb1ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb1ELb0ELb1ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb1ELb1ELb0ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb0ELb1ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb1EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb0ELb1ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb1ELb0ELb1ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb1ELb1ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb1EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb1ELb0ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb1ELb0ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb0ELb0ELb0ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb0ELb0ELb0ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZZN5flash21CollectiveMainloopBwdILi2EN4cute5tupleIJNS1_1CILi1EEES4_S4_EEENS2_IJNS3_ILi128EEES6_NS3_ILi64EEEEEEN7cutlass10bfloat16_tEfNS9_4arch4Sm90ELb1ELb0ELb0ELb1ELb0ELb0ELi1ELi2ELi2EE3mmaINS_12FlashAttnBwdISD_NS_21CollectiveEpilogueBwdIS8_SA_Li256ELb0EEENS_22SingleTileSchedulerBwdEE13SharedStorageENS1_6TensorINS1_11ArrayEngineIfLi32EEENS1_6LayoutINS2_IJNS2_IJNS3_ILi2EEESP_NS3_ILi8EEEEEES4_S4_EEENS2_IJNS2_IJS4_SP_NS3_ILi4EEEEEENS3_ILi0EEESV_EEEEEEEEEvRKNSD_6ParamsENS9_16PipelineTmaAsyncILi2EEES13_RNS9_13PipelineStateILj2EEERT0_S18_iiNS2_IJiiiEEERT_ENKUlvE_clEv'
ptxas info : (C7505) Potential Performance Loss: 'setmaxnreg' ignored to allow debugging.
ptxas info : (C7509) Potential Performance Loss: wgmma.mma_async instructions are serialized due to the presence of Extern calls in the function '_ZN7cutlass13device_kernelIN5flash12FlashAttnBwdINS1_21CollectiveMainloopBwdILi2EN4cute5tupleIJNS4_1CILi1EEES7_S7_EEENS5_IJNS6_ILi128EEES9_NS6_ILi64EEEEEENS_10bfloat16_tEfNS_4arch4Sm90ELb1ELb0ELb0ELb0ELb0ELb0ELi1ELi2ELi2EEENS1_21CollectiveEpilogueBwdISB_SC_Li256ELb0EEENS1_22SingleTileSchedulerBwdEEEEEvNT_6ParamsE'
ptxas info : (C7511) Potential Performance Loss: wgmma.mma_async instructions are serialized due to insufficient register resources for the wgmma pipeline in the function '_ZN4cute28SM90_64x64x16_F32BF16BF16_RSILNS_4GMMA5MajorE0ELS2_1ELNS1_7ScaleInE1ELS3_1EE3fmaERKjS6_S6_S6_RKmRfS9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_S9_NS1_8ScaleOutE'
ptxas info : (C7511) Potential Performance Loss: wgmma.mma_async instructions are serialized due to insufficient register resources for the wgmma pipeline in the function '_ZN4cute29SM90_64x128x16_F32BF16BF16_SSILNS_4GMMA5MajorE0ELS2_0ELNS1_7ScaleInE1ELS3_1EE3fmaERKmS6_RfS7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_NS1_8ScaleOutE'
ptxas info : (C7511) Potential Performance Loss: wgmma.mma_async instructions are serialized due to insufficient register resources for the wgmma pipeline in the function '_ZN4cute28SM90_64x64x16_F32BF16BF16_SSILNS_4GMMA5MajorE0ELS2_1ELNS1_7ScaleInE1ELS3_1EE3fmaERKmS6_RfS7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_S7_NS1_8ScaleOutE'
double free or corruption (out)
Aborted (core dumped)
Above is the log printed when compilation fails after a few hours (I don’t know exactly how long, since I was AFK).
Before pruning the templates, compilation consumed all of the RAM (2 TiB); after the workarounds mentioned above it stays under 100 GiB, so I don’t think this is an OOM issue.
Below is my development environment.
OS: Ubuntu 22.04
Driver & CUDA: 12.4
CPU: Intel(R) Xeon(R) Platinum 8480+ x2
RAM: 2TiB
GPU: NVIDIA H100 80GB SXM5 x8