Hi,
I am using MIG (Multi-Instance GPU) and monitor the individual GPU partitions with the `nvidia-smi` command. However, I cannot obtain the GI ID parameter from my Python code; that is the value that would let me identify which partition is running at any given time. Could someone tell me how this can be done?
Below is an excerpt of what the terminal shows me, including the parameter I want to obtain:
I attach the code I am using:
import subprocess
import pandas as pd
import time
import signal
# Global state shared between the signal handler and the monitoring loop.
# `running` is the loop flag: signal_handler() sets it to False so that
# monitor_gpu() leaves its `while running` loop.
running: bool = True
# Accumulated samples (one dict per record); monitor_gpu() appends to these
# lists and save_data() turns them into CSV files.
gpu_records: list = []
process_records: list = []
def signal_handler(signum, frame):
    """Stop the monitoring loop and flush the collected data to disk.

    Intended to be registered with ``signal.signal``; the ``signum`` and
    ``frame`` arguments are accepted to match that callback signature and are
    not used.

    The handler announces the shutdown, clears the module-level ``running``
    flag and immediately writes the data collected so far under the
    ``mig_usage`` file prefix.
    """
    global running
    print("Termination signal received. Saving data and stopping monitoring...")
    running = False
    # Persist what has been collected so far before the process winds down.
    prefix = 'mig_usage'
    save_data(prefix)
def _run_nvidia_smi_query(query):
    """Run one ``nvidia-smi`` CSV query and return its non-blank output lines.

    A non-zero exit status is reported on stdout and yields an empty list, so
    that diagnostic text printed by nvidia-smi is never mistaken for data.
    """
    result = subprocess.run(
        ['nvidia-smi', query, '--format=csv,noheader,nounits'],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        details = (result.stderr or result.stdout).strip()
        print(f"nvidia-smi {query} exited with code {result.returncode}: {details}")
        return []
    return [line.strip() for line in result.stdout.splitlines() if line.strip()]


def get_gpu_info():
    """Query nvidia-smi for GPU statistics and running compute processes.

    Two ``nvidia-smi`` invocations are made, both with
    ``--format=csv,noheader,nounits``:

    * ``--query-gpu`` with the fields timestamp, name, index, utilization.gpu,
      utilization.memory, memory.total, memory.free, memory.used;
    * ``--query-compute-apps`` with the fields pid, process_name,
      used_gpu_memory.

    Returns:
        A ``(gpu_data, process_data)`` tuple. Each element is a list of CSV
        lines (strings) with blank lines removed. A query that exits with a
        non-zero status contributes an empty list; if nvidia-smi cannot be
        run at all, both lists are empty.
    """
    try:
        gpu_data = _run_nvidia_smi_query(
            '--query-gpu=timestamp,name,index,utilization.gpu,utilization.memory,'
            'memory.total,memory.free,memory.used'
        )
        # NOTE(review): this field list does not request a GPU-instance (GI) ID,
        # so no GI ID column is returned. As far as I can tell
        # --query-compute-apps offers no such field (confirm with
        # `nvidia-smi --help-query-compute-apps`); the GI ID presumably has to
        # come from another source such as NVML's per-process info.
        process_data = _run_nvidia_smi_query(
            '--query-compute-apps=pid,process_name,used_gpu_memory'
        )
        return gpu_data, process_data
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError: nvidia-smi missing or not executable; ValueError covers
        # undecodable output. Monitoring is best-effort, so report and go on.
        print(f"Error running nvidia-smi: {e}")
        return [], []
def get_timestamp(fmt="%H:%M:%S"):
    """Return the current time formatted with ``time.strftime``.

    Args:
        fmt: A ``time.strftime`` format string. The default yields
            ``HH:MM:SS``; pass e.g. ``"%Y-%m-%d %H:%M:%S"`` when samples may
            span more than one day and the date is needed to tell them apart.

    Returns:
        The formatted timestamp as a string.
    """
    return time.strftime(fmt)
def save_data(file):
    """Dump the collected GPU and process records into two CSV files.

    ``file`` is used as a filename prefix: GPU records are written to
    ``<file>_gpu.csv`` and process records to ``<file>_processes.csv``, both
    without the DataFrame index column. A confirmation line is printed when
    both files have been written.
    """
    gpu_path = f"{file}_gpu.csv"
    processes_path = f"{file}_processes.csv"

    # Build both frames first, then write them out in order.
    frames = {
        gpu_path: pd.DataFrame(gpu_records),
        processes_path: pd.DataFrame(process_records),
    }
    for path, frame in frames.items():
        frame.to_csv(path, index=False)

    print(f"Data saved to {gpu_path} and {processes_path}")
def _parse_gpu_line(timestamp, line):
    """Turn one GPU CSV line into a record dict, or None if it is incomplete."""
    parts = line.split(', ')
    # Indices up to 7 are read below, so at least eight fields are required.
    if len(parts) < 8:
        return None
    # parts[0] (the timestamp reported in the line itself) is deliberately
    # skipped; the caller's timestamp is recorded instead.
    return {
        'Timestamp': timestamp,
        'GPU_Name': parts[1],
        'GPU_Index': parts[2],
        'GPU_Utilization': parts[3],
        'Memory_Utilization': parts[4],
        'Total_Memory_MB': parts[5],
        'Free_Memory_MB': parts[6],
        'Used_Memory_MB': parts[7],
    }


def _parse_process_line(timestamp, line):
    """Turn one process CSV line into a record dict, or None if unusable."""
    if not line.strip():  # Ignore empty lines
        return None
    parts = line.split(', ')
    # PID, name and memory are mandatory; skip anything shorter (for example
    # a diagnostic message) instead of failing with an IndexError.
    if len(parts) < 3:
        return None
    record = {
        'Timestamp': timestamp,
        'PID': parts[0],
        'Process_Name': parts[1],
        'Used_GPU_Memory_MB': parts[2],
    }
    # Only add 'GI_ID' if a fourth field is present.
    if len(parts) > 3:
        record['GI_ID'] = parts[3]
    return record


def monitor_gpu(file, interval):
    """Sample GPU and process information until monitoring is stopped.

    While the module-level ``running`` flag is true, each iteration calls
    ``get_gpu_info()``, stamps every parsed line with ``get_timestamp()`` and
    appends the resulting dicts to the module-level ``gpu_records`` and
    ``process_records`` lists. Lines with too few fields are skipped.

    Args:
        file: Filename prefix handed to ``save_data`` once the loop ends.
        interval: Seconds to sleep between two samples.
    """
    while running:
        gpu_data, process_data = get_gpu_info()
        timestamp = get_timestamp()

        # Save GPU information
        for line in gpu_data:
            gpu_record = _parse_gpu_line(timestamp, line)
            if gpu_record is not None:
                gpu_records.append(gpu_record)

        # Save process information
        for line in process_data:
            process_record = _parse_process_line(timestamp, line)
            if process_record is not None:
                process_records.append(process_record)

        # Pause before the next iteration
        time.sleep(interval)

    # Save data at the end of monitoring
    save_data(file)
if __name__ == "__main__":
    # Route both Ctrl+C (SIGINT) and kill (SIGTERM) to the same handler so the
    # monitoring loop can stop and the data can be saved.
    for stop_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(stop_signal, signal_handler)

    print("Starting GPU and associated process monitoring...\n")

    # Filename prefix for the CSV output and the pause (in seconds) between
    # two samples; adjust the interval as needed.
    output_file, interval = 'gpu_usage', 0.01
    monitor_gpu(output_file, interval)

    print("Monitoring finished.\n")