We are trying to build an AI invoice parser using only 100 images from a single distributor, and we are facing the same problem in this scenario.
Below are the detailed logs from my Ubuntu system:
root@spark-1eaa:/home/administrator# sudo lspci -vv -s 01:00.0 | grep -i “SlotPowerLimit” ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset- SlotPowerLimit 0W pcilib: sysfs_read_vpd: read failed: No such device ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W root@spark-1eaa:/home/administrator# sudo lspci -vv -s 01:00.1 | grep -i “SlotPowerLimit” ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W root@spark-1eaa:/home/administrator# sudo lspci -vv -s 0002:01:00.0 | grep -i “SlotPowerLimit” ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W root@spark-1eaa:/home/administrator# sudo lspci -vv -s 0002:01:00.1 | grep -i “SlotPowerLimit” ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 0W root@spark-1eaa:/home/administrator# sudo dmesg | grep -i mlx [ 2.505157] mlx5_core 0000:01:00.0: Adding to iommu group 14 [ 2.507852] mlx5_core 0000:01:00.0: enabling device (0000 → 0002) [ 2.508011] mlx5_core 0000:01:00.0: firmware version: 28.45.4028 [ 2.508049] mlx5_core 0000:01:00.0: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) [ 2.860171] mlx5_core 0000:01:00.0: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps [ 2.861184] mlx5_core 0000:01:00.0: E-Switch: Total vports 10, per vport: max uc(128) max mc(2048) [ 2.879348] mlx5_core 0000:01:00.0: Port module event: module 0, Cable unplugged [ 2.881238] mlx5_core 0000:01:00.0: mlx5_pcie_event:296:(pid 162): Detected insufficient power on the PCIe slot (27W). 
[ 2.892018] mlx5_core 0000:01:00.0: mlx5e: IPSec ESP acceleration enabled [ 3.067828] mlx5_core 0000:01:00.0: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced) [ 3.070371] mlx5_core 0000:01:00.1: Adding to iommu group 15 [ 3.073280] mlx5_core 0000:01:00.1: enabling device (0000 → 0002) [ 3.073428] mlx5_core 0000:01:00.1: firmware version: 28.45.4028 [ 3.073457] mlx5_core 0000:01:00.1: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) [ 3.432750] mlx5_core 0000:01:00.1: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps [ 3.433479] mlx5_core 0000:01:00.1: E-Switch: Total vports 10, per vport: max uc(128) max mc(2048) [ 3.450648] mlx5_core 0000:01:00.1: Port module event: module 1, Cable unplugged [ 3.452057] mlx5_core 0000:01:00.1: mlx5_pcie_event:296:(pid 501): Detected insufficient power on the PCIe slot (27W). [ 3.468905] mlx5_core 0000:01:00.1: mlx5e: IPSec ESP acceleration enabled [ 3.662887] mlx5_core 0000:01:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced) [ 3.670420] mlx5_core 0002:01:00.0: Adding to iommu group 16 [ 3.675388] mlx5_core 0002:01:00.0: enabling device (0000 → 0002) [ 3.675551] mlx5_core 0002:01:00.0: firmware version: 28.45.4028 [ 3.675582] mlx5_core 0002:01:00.0: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) [ 4.045462] mlx5_core 0002:01:00.0: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps [ 4.046084] mlx5_core 0002:01:00.0: E-Switch: Total vports 10, per vport: max uc(128) max mc(2048) [ 4.060885] mlx5_core 0002:01:00.0: Port module event: module 0, Cable unplugged [ 4.063358] mlx5_core 0002:01:00.0: mlx5_pcie_event:296:(pid 485): Detected insufficient power on the PCIe slot (27W). 
[ 4.078030] mlx5_core 0002:01:00.0: mlx5e: IPSec ESP acceleration enabled [ 4.247667] mlx5_core 0002:01:00.0: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced) [ 4.250586] mlx5_core 0002:01:00.1: Adding to iommu group 17 [ 4.255323] mlx5_core 0002:01:00.1: enabling device (0000 → 0002) [ 4.255490] mlx5_core 0002:01:00.1: firmware version: 28.45.4028 [ 4.255524] mlx5_core 0002:01:00.1: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) [ 4.632185] mlx5_core 0002:01:00.1: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps [ 4.633921] mlx5_core 0002:01:00.1: E-Switch: Total vports 10, per vport: max uc(128) max mc(2048) [ 4.657372] mlx5_core 0002:01:00.1: Port module event: module 1, Cable unplugged [ 4.658331] mlx5_core 0002:01:00.1: mlx5_pcie_event:296:(pid 11): Detected insufficient power on the PCIe slot (27W). [ 4.672594] mlx5_core 0002:01:00.1: mlx5e: IPSec ESP acceleration enabled [ 4.854775] mlx5_core 0002:01:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced) [ 4.860600] mlx5_core 0002:01:00.0 enP2p1s0f0np0: renamed from eth2 [ 4.861202] mlx5_core 0000:01:00.1 enp1s0f1np1: renamed from eth1 [ 4.861774] mlx5_core 0000:01:00.0 enp1s0f0np0: renamed from eth0 [ 4.861992] mlx5_core 0002:01:00.1 enP2p1s0f1np1: renamed from eth3 [ 8.313154] mlx5_core 0002:01:00.0 enP2p1s0f0np0: Link down [ 8.607376] mlx5_core 0002:01:00.1 enP2p1s0f1np1: Link down [ 8.959706] mlx5_core 0000:01:00.0 enp1s0f0np0: Link down [ 9.259489] mlx5_core 0000:01:00.1 enp1s0f1np1: Link down root@spark-1eaa:/home/administrator# sudo dmesg | grep -i “link down” [ 8.313154] mlx5_core 0002:01:00.0 enP2p1s0f0np0: Link down [ 8.607376] mlx5_core 0002:01:00.1 enP2p1s0f1np1: Link down [ 8.959706] mlx5_core 0000:01:00.0 enp1s0f0np0: Link down [ 9.259489] mlx5_core 0000:01:00.1 enp1s0f1np1: Link down root@spark-1eaa:/home/administrator# lspci -t -[0000:00]—00.0-[01-0f]–±00.0 \-00.1 -[0002:00]—00.0-[01-0f]–±00.0 \-00.1 -[0004:00]—00.0-[01-0f]----00.0 
-[0007:00]—00.0-[01-0f]----00.0 -[0009:00]—00.0-[01-0f]----00.0 -[000f:00]—00.0-[01]----00.0 root@spark-1eaa:/home/administrator# sudo journalctl -k | grep -i pcie Dec 08 13:14:24 spark-1eaa kernel: ACPI: USB4 _OSC: OS supports USB3+ DisplayPort+ PCIe+ XDomain+ Dec 08 13:14:24 spark-1eaa kernel: ACPI: USB4 _OSC: OS controls USB3+ DisplayPort+ PCIe+ XDomain+ Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 0000:00:00.0: [10de:22ce] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 0000:01:00.0: [15b3:1021] type 00 class 0x020000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: pci 0000:01:00.1: [15b3:1021] type 00 class 0x020000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:01: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 0002:00:00.0: [10de:22ce] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 0002:01:00.0: [15b3:1021] type 00 class 0x020000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: pci 0002:01:00.1: [15b3:1021] type 00 class 0x020000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:02: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 0004:00:00.0: [10de:22ce] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 0004:01:00.0: [144d:a810] type 00 class 0x010802 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:04: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 0007:00:00.0: [10de:22d0] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 0007:01:00.0: [10ec:8127] type 00 class 0x020000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:06: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 
0009:00:00.0: [10de:22d0] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 0009:01:00.0: [14c3:7925] type 00 class 0x028000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: acpi PNP0A08:0b: _OSC: OS now controls [PCIeHotplug PME AER PCIeCapability LTR] Dec 08 13:14:24 spark-1eaa kernel: pci 000f:00:00.0: [10de:22d1] type 01 class 0x060400 PCIe Root Port Dec 08 13:14:24 spark-1eaa kernel: pci 000f:01:00.0: [10de:2e12] type 00 class 0x030000 PCIe Endpoint Dec 08 13:14:24 spark-1eaa kernel: pci 000f:01:00.0: 0.000 Gb/s available PCIe bandwidth, limited by Unknown x0 link at 000f:00:00.0 (capable of 32.000 Gb/s with 2.5 GT/s PCIe x16 link) Dec 08 13:14:24 spark-1eaa kernel: pcieport 0000:00:00.0: Adding to iommu group 0 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0000:00:00.0: PME: Signaling with IRQ 330 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0000:00:00.0: AER: enabled with IRQ 331 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0002:00:00.0: Adding to iommu group 1 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0002:00:00.0: PME: Signaling with IRQ 333 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0002:00:00.0: AER: enabled with IRQ 334 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0004:00:00.0: Adding to iommu group 2 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0004:00:00.0: PME: Signaling with IRQ 336 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0004:00:00.0: AER: enabled with IRQ 337 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0004:00:00.0: pciehp: Slot #4 AttnBtn- PwrCtrl- MRL- AttnInd- PwrInd- HotPlug+ Surprise+ Interlock- NoCompl+ IbPresDis- LLActRep+ Dec 08 13:14:24 spark-1eaa kernel: pcieport 0007:00:00.0: Adding to iommu group 3 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0007:00:00.0: PME: Signaling with IRQ 339 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0007:00:00.0: AER: enabled with IRQ 340 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0007:00:00.0: pciehp: Slot #7 AttnBtn- PwrCtrl- MRL- AttnInd- PwrInd- HotPlug+ Surprise+ Interlock- 
NoCompl+ IbPresDis- LLActRep+ Dec 08 13:14:24 spark-1eaa kernel: pcieport 0009:00:00.0: Adding to iommu group 4 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0009:00:00.0: PME: Signaling with IRQ 342 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0009:00:00.0: AER: enabled with IRQ 343 Dec 08 13:14:24 spark-1eaa kernel: pcieport 0009:00:00.0: pciehp: Slot #9 AttnBtn- PwrCtrl- MRL- AttnInd- PwrInd- HotPlug+ Surprise+ Interlock- NoCompl+ IbPresDis- LLActRep+ Dec 08 13:14:24 spark-1eaa kernel: pcieport 000f:00:00.0: Adding to iommu group 5 Dec 08 13:14:24 spark-1eaa kernel: pcieport 000f:00:00.0: PME: Signaling with IRQ 344 Dec 08 13:14:24 spark-1eaa kernel: pcieport 000f:00:00.0: AER: enabled with IRQ 346 Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.0: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.0: mlx5_pcie_event:296:(pid 162): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.1: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.1: mlx5_pcie_event:296:(pid 501): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.0: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.0: mlx5_pcie_event:296:(pid 485): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.1: 126.028 Gb/s available PCIe bandwidth (32.0 GT/s PCIe x4 link) Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.1: mlx5_pcie_event:296:(pid 11): Detected insufficient power on the PCIe slot (27W). 
root@spark-1eaa:/home/administrator# sudo journalctl -k | grep -i power Dec 08 13:14:24 spark-1eaa kernel: thermal_sys: Registered thermal governor ‘power_allocator’ Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P0RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R0RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P2RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R2RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P4RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R4RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P6RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R6RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P7RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R7RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P8RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R8RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.P9RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.R9RR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.USB5.RHUB.PRT2.PWFR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.PBRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.RBRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.PCRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.RCRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.PDRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.RDRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.PERR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.RERR: New power resource Dec 08 13:14:24 spark-1eaa kernel: ACPI: \_SB_.PFRR: New power resource Dec 08 13:14:24 spark-1eaa kernel: input: Power Button as 
/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C0C:00/input/input0 Dec 08 13:14:24 spark-1eaa kernel: ACPI: button: Power Button [PWRB] Dec 08 13:14:24 spark-1eaa kernel: PM: genpd: Disabling unused power domains Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.0: mlx5_pcie_event:296:(pid 162): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0000:01:00.1: mlx5_pcie_event:296:(pid 501): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.0: mlx5_pcie_event:296:(pid 485): Detected insufficient power on the PCIe slot (27W). Dec 08 13:14:24 spark-1eaa kernel: mlx5_core 0002:01:00.1: mlx5_pcie_event:296:(pid 11): Detected insufficient power on the PCIe slot (27W). root@spark-1eaa:/home/administrator#
While training the model, we use the following arguments:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
# First fine-tuning configuration: builds an SFTTrainer for the vision model.
#
# NOTE(review): in the original paste the `FastVisionModel.for_training(model)`
# call and the `trainer = SFTTrainer(` opener were commented out while the
# arguments and closing parentheses were left live, so the snippet could not
# parse. Both are restored here. If this configuration was meant to be disabled,
# comment out the whole block instead.
# String literals also used typographic quotes, which Python rejects; they are
# now plain ASCII quotes.
FastVisionModel.for_training(model)  # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # Must use!
    train_dataset = prepared_dataset,
    args = SFTConfig(
        # 2 samples per device x 8 accumulation steps = 16 samples per optimizer step.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        # max_steps = 30,
        num_train_epochs = 2,  # Set this instead of max_steps for full training runs
        learning_rate = 1e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # For Weights and Biases
        # load_best_model_at_end = True,  # MUST USE for early stopping
        # metric_for_best_model = "eval_loss",  # metric we want to early stop on
        # greater_is_better = False,  # the lower the eval loss, the better
        # save_strategy = "steps",
        # eval_strategy = "steps",
        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        max_length = 2048,
    ),
)
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
# Second fine-tuning configuration: SFTTrainer with a train/eval split and an
# early-stopping callback prepared (but not yet attached, see below).
# Typographic quotes around "adamw_8bit" and "linear" were replaced with ASCII
# quotes; the originals are a Python syntax error.
FastVisionModel.for_training(model)  # Enable for training!

# Stop once the monitored metric fails to improve by at least 0.001 for
# 3 consecutive evaluations.
# NOTE(review): this callback is constructed but never passed to the trainer;
# `callbacks=` and the eval/save/best-model settings below are all commented
# out. To use early stopping, enable those settings together with `callbacks=`.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience = 3,
    early_stopping_threshold = 0.001,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # Must use!
    train_dataset = prepared_train_dataset,
    eval_dataset = prepared_eval_dataset,
    args = SFTConfig(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 30,
        num_train_epochs = 3,  # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        fp16 = False,
        bf16 = True,
        seed = 3407,
        output_dir = "outputs3",
        report_to = "none",  # For Weights and Biases
        # load_best_model_at_end = True,  # MUST USE for early stopping
        # metric_for_best_model = "eval_loss",  # metric we want to early stop on
        # greater_is_better = False,  # the lower the eval loss, the better
        # save_strategy = "epoch",
        # save_steps = 15,
        # eval_strategy = "epoch",
        dataloader_num_workers = 8,
        # eval_steps = 15,
        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 8,
        max_length = 2048,
    ),
    # callbacks = [early_stopping_callback],
)