Nvidia-smi for MIG technology

Hi,

Using the MIG technology, I use the Nvidia-smi command to monitor different partitions within the GPU. However, I cannot obtain the GI-ID parameter in Python code, which allows me to identify which partition is running at any given time. Could someone tell me how it could be done?

Below is a snippet of the terminal output showing the parameter I want to obtain:

I attach the code I am using:

import subprocess
import pandas as pd
import time
import signal



# Global variables to control monitoring and store data
running = True  # cleared by signal_handler (SIGINT/SIGTERM) to stop the monitor loop
gpu_records = []  # accumulated per-GPU utilization samples (one dict per GPU per poll)
process_records = []  # accumulated per-process GPU usage samples (one dict per process per poll)

def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM: request a clean shutdown of the monitor loop.

    Only clears the ``running`` flag; ``monitor_gpu`` notices it, exits its
    loop, and saves the data itself.  Saving here as well (as the original
    code did with a hard-coded 'mig_usage' prefix) wrote the records twice,
    under a different file prefix than the 'gpu_usage' chosen in __main__.

    Args:
        signum: Signal number delivered by the OS.
        frame: Current stack frame (unused, required by the signal API).
    """
    global running
    running = False
    print("Termination signal received. Saving data and stopping monitoring...")

def get_gpu_info():
    """Query GPU and per-process usage by shelling out to ``nvidia-smi``.

    The compute-apps query now requests ``gpu_instance_id`` (the GI ID shown
    by plain ``nvidia-smi`` on MIG-enabled GPUs).  The original query omitted
    this field, so the 4th column that ``monitor_gpu`` stores as ``GI_ID``
    was never produced.  On non-MIG GPUs the field is reported as "N/A".

    Returns:
        tuple[list[str], list[str]]: (gpu_data, process_data) — raw CSV lines
        from the two queries, or ([], []) if nvidia-smi could not be run.
    """
    try:
        # Per-GPU utilization and memory figures.
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=timestamp,name,index,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used', '--format=csv,noheader,nounits'],
            capture_output=True, text=True
        )
        gpu_data = result.stdout.strip().split('\n')

        # Per-process usage, including the GPU instance (GI) ID for MIG.
        result_processes = subprocess.run(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_gpu_memory,gpu_instance_id', '--format=csv,noheader,nounits'],
            capture_output=True, text=True
        )
        process_data = result_processes.stdout.strip().split('\n')

        return gpu_data, process_data
    except (OSError, subprocess.SubprocessError) as e:
        # nvidia-smi missing from PATH, or the subprocess failed to launch.
        print(f"Error running nvidia-smi: {e}")
        return [], []

def get_timestamp():
    """Return the current local time as an ``HH:MM:SS`` string."""
    now = time.localtime()
    return f"{now.tm_hour:02d}:{now.tm_min:02d}:{now.tm_sec:02d}"

def save_data(file):
    """Write the accumulated GPU and process records to two CSV files.

    Args:
        file: Filename prefix; produces ``{file}_gpu.csv`` and
            ``{file}_processes.csv`` in the current directory.
    """
    outputs = (
        (gpu_records, f"{file}_gpu.csv"),
        (process_records, f"{file}_processes.csv"),
    )
    for records, path in outputs:
        pd.DataFrame(records).to_csv(path, index=False)
    print(f"Data saved to {file}_gpu.csv and {file}_processes.csv")

def monitor_gpu(file, interval):
    """Poll GPU and process usage every ``interval`` seconds until stopped.

    Runs while the global ``running`` flag is set (cleared by the signal
    handler), then writes all accumulated records to CSV via ``save_data``.

    Args:
        file: Prefix for the output CSV files.
        interval: Seconds to sleep between nvidia-smi polls.
    """
    global gpu_records, process_records

    while running:
        gpu_data, process_data = get_gpu_info()
        timestamp = get_timestamp()

        # GPU records: the query requests 8 comma-separated fields, so
        # require all 8 before indexing.  The original `len(parts) >= 4`
        # check still raised IndexError at parts[7] on short lines.
        for line in gpu_data:
            parts = line.split(', ')
            if len(parts) >= 8:
                gpu_records.append({
                    'Timestamp': timestamp,
                    'GPU_Name': parts[1],
                    'GPU_Index': parts[2],
                    'GPU_Utilization': parts[3],
                    'Memory_Utilization': parts[4],
                    'Total_Memory_MB': parts[5],
                    'Free_Memory_MB': parts[6],
                    'Used_Memory_MB': parts[7]
                })

        # Process records: the 4th field (GI ID) is only present on
        # MIG-enabled GPUs, so it is added conditionally.
        for line in process_data:
            if not line.strip():  # skip empty lines (e.g. no processes)
                continue
            parts = line.split(', ')
            if len(parts) < 3:  # malformed line: skip instead of IndexError
                continue
            process_record = {
                'Timestamp': timestamp,
                'PID': parts[0],
                'Process_Name': parts[1],
                'Used_GPU_Memory_MB': parts[2]
            }
            if len(parts) > 3:
                process_record['GI_ID'] = parts[3]
            process_records.append(process_record)

        # Pause before the next poll.
        time.sleep(interval)

    # Loop exited (shutdown signal received): persist everything once.
    save_data(file)

if __name__ == "__main__":
    # Route SIGINT (Ctrl+C) and SIGTERM (kill) to the shutdown handler.
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("Starting GPU and associated process monitoring...\n")
    output_file = 'gpu_usage'
    # NOTE(review): 0.01 s forks nvidia-smi ~100x/sec; consider >= 1 s.
    interval = 0.01  # Adjust the interval as needed
    monitor_gpu(output_file, interval)
    # Moved inside the guard: the original dedented print ran on import too.
    print("Monitoring finished.\n")

nvidia-smi is built on the NVML library, which has Python bindings available via PyPI.

The MIG-related NVML functions are documented here.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.