Hello, the GPC Clock Frequency and SYS Clock Frequency vary in my profiling results, and the kernel duration changes with them.
I want to know why these two frequencies affect the kernel duration, and in what way.
I use the code below to profile a kernel 1000 times. The kernel duration is ~0.22 ms at the beginning and ~0.18 ms at the end, and I found this is because the GPC Clock Frequency and SYS Clock Frequency changed. (A small clock-sampling sketch for cross-checking follows the two scripts.)
run_nsys.sh
#!/bin/sh
export OMP_NUM_THREADS=1
export CUDA_VISIBLE_DEVICES=4,5
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
MASTER_ADDR=localhost
MASTER_PORT=29500
nsys profile --gpu-metrics-devices=cuda-visible -o ./report/attn_model_kernel/nsys/tmp1 \
torchrun --nproc_per_node=2 --master_addr $MASTER_ADDR --master_port $MASTER_PORT prof_kernel.py
prof_kernel.py
import os
import numpy as np
import torch
import torch.distributed as dist
import torch.utils.benchmark as benchmark
from torch.backends.cuda import sdp_kernel, SDPBackend
import torch.multiprocessing as mp
backend_map = {
    SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False},
    SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False},
    SDPBackend.EFFICIENT_ATTENTION: {
        "enable_math": False, "enable_flash": False, "enable_mem_efficient": True},
}


def compute_kernel(max_num=10):
    rank_id = torch.cuda.current_device()
    device = f"cuda:{rank_id}"
    batch_size = 2
    max_sequence_len = 512
    num_heads = 96
    embed_dimension = 128
    dtype = torch.bfloat16
    query = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, dtype=dtype, device=device)
    key = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, dtype=dtype, device=device)
    value = torch.rand(batch_size, num_heads, max_sequence_len, embed_dimension, dtype=dtype, device=device)
    attn_output = None
    # Launch the same SDPA kernel max_num times so its duration shows up repeatedly in the trace.
    for _ in range(max_num):
        # prof.step()
        with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
            try:
                # is_causal = True if attention_mask is None and q_len > 1 else False
                attn_output = torch.nn.functional.scaled_dot_product_attention(
                    query=query,
                    key=key,
                    value=value,
                    attn_mask=None,
                    dropout_p=0.0,
                    is_causal=True,
                )
            except RuntimeError:
                print("EfficientAttention is not supported. See warnings for reasons.")
    return attn_output


def main():
    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    my_device = f"cuda:{rank}"
    torch.cuda.set_device(my_device)
    attn_out = compute_kernel(max_num=1000)


if __name__ == "__main__":
    main()
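To cross-check the correlation outside of Nsight Systems, below is a minimal sketch, assuming the pynvml package is installed and a single visible GPU; the script name (clock_check.py), the device index, and the tensor shapes are illustrative, not part of the runs above. It times the same SDPA call while sampling the SM clock through NVML.

clock_check.py
# Hypothetical helper, not part of the scripts above: times the same SDPA
# kernel while sampling the SM clock via NVML, to see whether the duration
# change tracks the clock frequency.
import time

import pynvml
import torch


def main():
    device = "cuda:0"
    torch.cuda.set_device(device)

    pynvml.nvmlInit()
    # NVML enumerates devices in its own order, which can differ from the
    # order implied by CUDA_VISIBLE_DEVICES; adjust the index if needed.
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)

    # Same shapes as in prof_kernel.py: (batch, heads, seq_len, head_dim).
    dtype = torch.bfloat16
    q = torch.rand(2, 96, 512, 128, dtype=dtype, device=device)
    k = torch.rand(2, 96, 512, 128, dtype=dtype, device=device)
    v = torch.rand(2, 96, 512, 128, dtype=dtype, device=device)

    for i in range(1000):
        torch.cuda.synchronize()
        start = time.perf_counter()
        torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
        torch.cuda.synchronize()
        elapsed_ms = (time.perf_counter() - start) * 1e3
        # SM clock in MHz as reported by NVML at this moment.
        sm_mhz = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
        if i % 100 == 0:
            print(f"iter {i}: {elapsed_ms:.3f} ms, SM clock {sm_mhz} MHz")

    pynvml.nvmlShutdown()


if __name__ == "__main__":
    main()

If the printed SM clock rises across the iterations while the measured time drops by a similar ratio, the timing change tracks the GPU clock behavior rather than anything in the kernel itself.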