Hello,
I’m doing tests with MIG partitions, and I’ve encountered a problem that I don’t know how to address. I want to test how the size of a MIG partition affects the performance of a neural network, as shown in the attached code. For this, I want to create a MIG partition of a specific size, perform inference, destroy it, and then create the next one.
import os
import time
import torch
import torchvision.models as models
import csv
import matplotlib.pyplot as plt
def run_inference(model, device, num_inferences=10):
model.to(device)
model.eval()
dummy_input = torch.randn(1, 3, 224, 224, device=device)
# Warm-up
with torch.no_grad():
for _ in range(10):
model(dummy_input)
inference_times = []
with torch.no_grad():
for _ in range(num_inferences):
start_time = time.time()
model(dummy_input)
torch.cuda.synchronize()
end_time = time.time()
inference_times.append(end_time - start_time)
print("fin de inferencias")
avg_time = sum(inference_times) / num_inferences
throughput = num_inferences / sum(inference_times)
print(f"avg_tim: {avg_time} y throughput: {throughput}")
return avg_time, throughput, inference_times
def configure_mig(mig_size):
os.system("sudo nvidia-smi mig -i 0 -cgi {} -C".format(mig_size))
time.sleep(20) # Wait for MIG partitioning to take effect
def reset_mig():
torch.cuda.synchronize()
torch.cuda.empty_cache()
os.system("sudo nvidia-smi mig -dci && sudo nvidia-smi mig -dgi")
time.sleep(20)
def hard_reset_mig():
os.system("sudo nvidia-smi --gpu-reset -i 0")
time.sleep(20)
os.system("sudo nvidia-smi -i 0 -mig 1")
time.sleep(20)
def main():
os.system("sudo nvidia-smi -i 0 -mig 1")
mig_sizes = ["1g.10gb", "2g.10gb", "3g.20gb", "7g.40gb", None]
model_names = ["convnext_base", "convnext_large", "vit_l_32", "vit_b_16", "vgg16", "resnet18"]
results = []
with open("inference_results.csv", "w", newline="") as csvfile:
csv_writer = csv.writer(csvfile)
csv_writer.writerow(["Model", "MIG Partition", "Avg Inference Time (s)", "Throughput (img/s)", "Inference Times"])
hard_reset_mig()
for model_name in model_names:
model = getattr(models, model_name)(pretrained=True)
for mig_size in mig_sizes:
if mig_size:
configure_mig(mig_size)
device = "cuda:0"
else:
hard_reset_mig()
device = "cuda"
print("fin_preparación_MIG")
avg_time, throughput, inference_times = run_inference(model, device)
results.append((model_name, mig_size if mig_size else "No MIG", avg_time, throughput))
csv_writer.writerow([model_name, mig_size if mig_size else "No MIG", avg_time, throughput] + inference_times)
hard_reset_mig()
# Plot results
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
for model_name in model_names:
model_results = [res for res in results if res[0] == model_name]
labels, avg_times, throughputs = zip(*[(res[1], res[2], res[3]) for res in model_results])
ax1.plot(labels, avg_times, marker='o', label=f'{model_name} Avg Time', linestyle='--')
ax2.plot(labels, throughputs, marker='s', label=f'{model_name} Throughput')
ax1.set_ylabel('Avg Inference Time (s)', color='b')
ax2.set_ylabel('Throughput (img/s)', color='r')
ax1.set_xlabel('MIG Partition')
plt.title('Inference Performance Comparison Across Models and MIG Partitions')
ax1.legend(loc='upper left', fontsize='small')
ax2.legend(loc='upper right', fontsize='small')
plt.xticks(rotation=45)
plt.grid()
plt.save("test_mig.png")
if __name__ == "__main__":
main()
My problem occurs when I try to delete an already created MIG partition, as the test program running on the GPU continues to run and prevents me from doing so. As you can see in the code, I have tried restarting the GPU and re-enabling MIG instead of deleting the partition, but I still face the same problem. Does anyone have any idea how I could force a reboot or deletion of the partition, or alternatively, a different way of doing it?
Many thanks in advance and best regards.