I need to profile and trace unified memory records for python application, especially on pytorch codes for profiling unified memory actions on deep learning models.
So I decided to do it with cupti-python, but it’s not working correctly.
First, I used common.py at /opt/conda/lib/python3.11/site-packages/cupti-python-samples
I just added cupti.ActivityKind.UNIFIED_MEMORY_COUNTER at default_activity_list as shown below:
default_activity_list: list[cupti.ActivityKind] = [
cupti.ActivityKind.CONCURRENT_KERNEL,
cupti.ActivityKind.MEMCPY,
cupti.ActivityKind.DRIVER,
cupti.ActivityKind.MEMORY2,
cupti.ActivityKind.CONTEXT,
cupti.ActivityKind.GRAPH_TRACE,
cupti.ActivityKind.EXTERNAL_CORRELATION,
cupti.ActivityKind.NAME,
cupti.ActivityKind.MARKER,
cupti.ActivityKind.MARKER_DATA,
cupti.ActivityKind.STREAM,
cupti.ActivityKind.SYNCHRONIZATION,
cupti.ActivityKind.JIT,
cupti.ActivityKind.OVERHEAD,
cupti.ActivityKind.MEMORY_POOL,
cupti.ActivityKind.MEMSET,
cupti.ActivityKind.DEVICE,
cupti.ActivityKind.MEMCPY2,
cupti.ActivityKind.UNIFIED_MEMORY_COUNTER,
]
And my code for profiling unified memory with CUPTI, helper_cupti_um.py:
from cupti import cupti
from cuda import cuda
from common import checkCudaErrors, default_activity_list, ProfOutput
import common
import atexit
import sys
um_kind_list = [
cupti.ActivityUnifiedMemoryCounterKind.UNKNOWN,
cupti.ActivityUnifiedMemoryCounterKind.BYTES_TRANSFER_HTOD,
cupti.ActivityUnifiedMemoryCounterKind.BYTES_TRANSFER_DTOH,
cupti.ActivityUnifiedMemoryCounterKind.CPU_PAGE_FAULT_COUNT,
cupti.ActivityUnifiedMemoryCounterKind.GPU_PAGE_FAULT,
cupti.ActivityUnifiedMemoryCounterKind.THRASHING,
cupti.ActivityUnifiedMemoryCounterKind.THROTTLING,
cupti.ActivityUnifiedMemoryCounterKind.REMOTE_MAP,
cupti.ActivityUnifiedMemoryCounterKind.BYTES_TRANSFER_DTOD,
]
def at_exit_handler():
cupti.activity_flush_all(1)
def setup_cupti_um():
"""
Initialize CUPTI for Unified Memory profiling.
"""
um_config = cupti.ActivityUnifiedMemoryCounterConfig(len(um_kind_list))
# initialize CUDA Driver API
checkCudaErrors(cuda.cuInit(0))
device_count = checkCudaErrors(cuda.cuDeviceGetCount())
if device_count < 1:
print("No CUDA devices found.")
sys.exit(-1)
# for device in range(2):
for um_kind in um_kind_list:
um_config.scope = 1
um_config.device_id = 0
um_config.kind = um_kind
um_config.enable = 1
# Initialize CUPTI with Unified Memory configuration
cupti_result = cupti.activity_configure_unified_memory_counter(um_config.ptr, len(um_kind_list))
# error handling
if cupti_result == cupti.Result.ERROR_NOT_INITIALIZED:
print("CUPTI is not initialized. Please initialize CUPTI before configuring Unified Memory profiling.")
sys.exit(-1)
elif cupti_result == cupti.Result.ERROR_INVALID_PARAMETER:
print("Invalid parameter provided for Unified Memory profiling configuration.")
sys.exit(-1)
elif cupti_result == cupti.Result.ERROR_UM_PROFILING_NOT_SUPPORTED:
print("Unified Memory profiling is not supported.")
sys.exit(-1)
elif cupti_result == cupti.Result.ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE:
print("Unified Memory profiling is not supported on this device.")
sys.exit(-1)
elif cupti_result == cupti.Result.ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES:
print("Unified Memory profiling is not supported on non-P2P devices.")
sys.exit(-1)
# elif cupti_result == cupti.Result.ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS:
# print("Unified Memory profiling is not supported with MPS.")
# sys.exit(-1)
elif cupti_result == cupti.Result.ERROR_UNKNOWN:
print(f"Error configuring Unified Memory profiling: {cupti_result}")
sys.exit(-1)
# set exit handler to flush CUPTI activities
atexit.register(at_exit_handler)
# initialize CUPTI
common.cupti_initialize(
activity_list=common.default_activity_list,
prof_output=ProfOutput.DETAILED,
validation=False,
)
def free_cupti_um():
"""
Free CUPTI resources for Unified Memory profiling.
"""
# disable CUPTI activity
common.cupti_activity_disable(common.default_activity_list)
# flush CUPTI activity buffer
common.cupti_activity_flush()
Finally, matrix multiplication code for the test:
import torch
import sys
from helper_cupti_um import setup_cupti_um, free_cupti_um
if __name__ == "__main__":
print(torch.cuda.is_available())
# Initialize CUPTI
setup_cupti_um()
# Load the allocator
managed_alloc = torch.cuda.memory.CUDAPluggableAllocator('../alloc.so', 'managed_alloc', 'managed_free')
# Swap the current allocator
torch.cuda.memory.change_current_allocator(managed_alloc)
a = torch.randn(10000, 4096, device='cuda')
b = torch.randn(4096, 10000, device='cuda')
c = torch.matmul(a, b)
# Free CUPTI
free_cupti_um()
CUDA version 12.6.3
cupti-python 12.6.0
nvidia-cuda-cupti-cu12 12.6.37
Other metrics on default_activity_list are working but cupti.ActivityKind.UNIFIED_MEMORY_COUNTER shows nothing.
What is the problem? Please help…
