I get NVRM Xid errors 69 and 45 after about one hour of running jobs

Hi,

I have four RTX 3080 graphics cards, each running one process at a time. No screens are attached.

I always get an error that freezes the computer after roughly one hour. I have tried different xorg.conf configurations without success.

The temperatures are fine. These are the readings while the jobs are running:

nvidia-smi --query-gpu=name,gpu_bus_id,utilization.gpu,utilization.memory,temperature.gpu,fan.speed,power.draw --format=csv

name, pci.bus_id, utilization.gpu [%], utilization.memory [%], temperature.gpu, fan.speed [%], power.draw [W]
NVIDIA GeForce RTX 3080, 00000000:05:00.0, 100 %, 100 %, 65, 76 %, 269.89 W
NVIDIA GeForce RTX 3080, 00000000:08:00.0, 100 %, 100 %, 53, 65 %, 207.35 W
NVIDIA GeForce RTX 3080, 00000000:09:00.0, 100 %, 100 %, 62, 72 %, 270.00 W
NVIDIA GeForce RTX 3080, 00000000:0A:00.0, 100 %, 100 %, 57, 70 %, 249.55 W

Xorg configuration:

# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig:  version 470.57.02


Section "ServerLayout"
    Identifier     "Layout0"
    Screen      0  "intel"
    Screen      1  "Screen0"
    Screen      2  "Screen1" RightOf "Screen0"
    Screen      3  "Screen2" RightOf "Screen1"
    Screen      4  "Screen3" RightOf "Screen2"
EndSection

Section "Device"
   Identifier "intel"
   Driver "intel"
   BusID "PCI:0@0:2:0"
EndSection

Section "Screen"
   Identifier "intel"
   Device "intel"
EndSection

Section "Monitor"
    Identifier     "Monitor0"
    VendorName     "Unknown"
    ModelName      "Unknown"
    Option         "DPMS"
EndSection

Section "Monitor"
    Identifier     "Monitor1"
    VendorName     "Unknown"
    ModelName      "Unknown"
    Option         "DPMS"
EndSection

Section "Monitor"
    Identifier     "Monitor2"
    VendorName     "Unknown"
    ModelName      "Unknown"
    Option         "DPMS"
EndSection

Section "Monitor"
    Identifier     "Monitor3"
    VendorName     "Unknown"
    ModelName      "Unknown"
    Option         "DPMS"
EndSection

Section "Device"
    Identifier     "Device0"
    Driver         "nvidia"
    VendorName     "NVIDIA Corporation"
    BoardName      "NVIDIA GeForce RTX 3080"
    BusID          "PCI:5:0:0"
EndSection

Section "Device"
    Identifier     "Device1"
    Driver         "nvidia"
    VendorName     "NVIDIA Corporation"
    BoardName      "NVIDIA GeForce RTX 3080"
    BusID          "PCI:9:0:0"
EndSection

Section "Device"
    Identifier     "Device2"
    Driver         "nvidia"
    VendorName     "NVIDIA Corporation"
    BoardName      "NVIDIA GeForce RTX 3080"
    BusID          "PCI:8:0:0"
EndSection

Section "Device"
    Identifier     "Device3"
    Driver         "nvidia"
    VendorName     "NVIDIA Corporation"
    BoardName      "NVIDIA GeForce RTX 3080"
    BusID          "PCI:10:0:0"
EndSection

Section "Screen"
    Identifier     "Screen0"
    Device         "Device0"
    Monitor        "Monitor0"
    DefaultDepth    24
    Option         "AllowEmptyInitialConfiguration" "True"
    Option         "Coolbits" "28"
    SubSection     "Display"
        Depth       24
    EndSubSection
EndSection

Section "Screen"
    Identifier     "Screen1"
    Device         "Device1"
    Monitor        "Monitor1"
    DefaultDepth    24
    Option         "AllowEmptyInitialConfiguration" "True"
    Option         "Coolbits" "28"
    SubSection     "Display"
        Depth       24
    EndSubSection
EndSection

Section "Screen"
    Identifier     "Screen2"
    Device         "Device2"
    Monitor        "Monitor2"
    DefaultDepth    24
    Option         "AllowEmptyInitialConfiguration" "True"
    Option         "Coolbits" "28"
    SubSection     "Display"
        Depth       24
    EndSubSection
EndSection

Section "Screen"
    Identifier     "Screen3"
    Device         "Device3"
    Monitor        "Monitor3"
    DefaultDepth    24
    Option         "AllowEmptyInitialConfiguration" "True"
    Option         "Coolbits" "28"
    SubSection     "Display"
        Depth       24
    EndSubSection
EndSection

This is the output in the Syslog:

Thank you