Hi team,
I have been using a Slurm cluster on this flavour of NVIDIA BCM and it worked, but now it shows the error below when running some srun commands. Please refer to the output below and let me know if more information is required.
[root@master88 ~]# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
defq* up infinite 2 idle node[002,004]
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]# SLURM_DEBUG=2 srun --mpi=pmix_v3 -N 1 --ntasks=16 -w node004 --container-image=192.168.61.4:5000#/cosmoflow-nvidia:0.4 --container-name=cosmoflow-preprocess --container-workdir=/mnt/mxnet --container-mounts=/mnt/lustre:/mnt bash tools/init_datasets.sh /mnt/Cosmo-Small /mnt/processed 16
srun: select/cons_res: common_init: select/cons_res loaded
srun: select/cons_tres: common_init: select/cons_tres loaded
srun: select/linear: init: Linear node selection plugin loaded with argument 4
srun: debug: switch/none: init: switch NONE plugin loaded
srun: debug: spank: opening plugin stack /cm/shared/apps/slurm/var/etc/slurm/plugstack.conf
srun: debug: /cm/shared/apps/slurm/var/etc/slurm/plugstack.conf: 1: include "/cm/shared/apps/slurm/var/etc/slurm/plugstack.conf.d/*"
srun: debug: spank: opening plugin stack /cm/shared/apps/slurm/var/etc/slurm/plugstack.conf.d/pyxis.conf
srun: debug: spank: /cm/shared/apps/slurm/var/etc/slurm/plugstack.conf.d/pyxis.conf:1: Loaded plugin spank_pyxis.so
srun: debug: SPANK: appending plugin option "container-image"
srun: debug: SPANK: appending plugin option "container-mounts"
srun: debug: SPANK: appending plugin option "container-workdir"
srun: debug: SPANK: appending plugin option "container-name"
srun: debug: SPANK: appending plugin option "container-save"
srun: debug: SPANK: appending plugin option "container-mount-home"
srun: debug: SPANK: appending plugin option "no-container-mount-home"
srun: debug: SPANK: appending plugin option "container-remap-root"
srun: debug: SPANK: appending plugin option "no-container-remap-root"
srun: debug: SPANK: appending plugin option "container-entrypoint"
srun: debug: SPANK: appending plugin option "no-container-entrypoint"
srun: debug: SPANK: appending plugin option "container-writable"
srun: debug: SPANK: appending plugin option "container-readonly"
srun: launch/slurm: init: launch Slurm plugin loaded
srun: debug: mpi type = pmix_v3
srun: debug: mpi/pmix_v3: init: PMIx plugin loaded
srun: debug: propagating RLIMIT_CPU=18446744073709551615
srun: debug: propagating RLIMIT_FSIZE=18446744073709551615
srun: debug: propagating RLIMIT_DATA=18446744073709551615
srun: debug: propagating RLIMIT_STACK=18446744073709551615
srun: debug: propagating RLIMIT_CORE=0
srun: debug: propagating RLIMIT_RSS=18446744073709551615
srun: debug: propagating RLIMIT_NPROC=255101
srun: debug: propagating RLIMIT_NOFILE=131072
srun: debug: propagating RLIMIT_MEMLOCK=18446744073709551615
srun: debug: propagating RLIMIT_AS=18446744073709551615
srun: debug: propagating SLURM_PRIO_PROCESS=0
srun: debug: propagating UMASK=0022
srun: debug: Entering slurm_allocation_msg_thr_create()
srun: debug: port from net_stream_listen is 37225
srun: debug: Entering _msg_thr_internal
srun: debug: auth/munge: init: Munge authentication plugin loaded
srun: Waiting for nodes to boot (delay looping 450 times @ 0.100000 secs x index)
srun: Nodes node004 are ready for job
srun: jobid 15: nodes(1):`node004', cpu counts: 16(x1)
srun: debug: requesting job 15, user 0, nodes 1 including (node004)
srun: debug: cpus 16, tasks 16, name bash, relative 65534
srun: launch/slurm: launch_p_step_launch: CpuBindType=(null type)
srun: debug: Entering slurm_step_launch
srun: debug: mpi type = (null)
srun: debug: mpi/pmix_v3: pmixp_abort_agent_start: (null) [0]: pmixp_agent.c:376: Abort agent port: 39687
srun: debug: mpi/pmix_v3: p_mpi_hook_client_prelaunch: (null) [0]: mpi_pmix.c:224: setup process mapping in srun
srun: debug: mpi/pmix_v3: _pmix_abort_thread: (null) [0]: pmixp_agent.c:352: Start abort thread
srun: debug: Entering _msg_thr_create()
srun: debug: initialized stdio listening socket, port 43741
srun: debug: Started IO server thread (23456086173440)
srun: debug: Entering _launch_tasks
srun: launching StepId=15.0 on host node004, 16 tasks: [0-15]
srun: route/default: init: route default plugin loaded
srun: debug: launch returned msg_rc=0 err=0 type=8001
slurmstepd: error: pyxis: child 3229033 failed with error code: 1
slurmstepd: error: pyxis: couldn't execute enroot command
slurmstepd: error: pyxis: printing enroot log file:
slurmstepd: error: pyxis: couldn't get list of existing container filesystems
slurmstepd: error: pyxis: couldn't get list of containers
srun: launch/slurm: _task_start: Node node004, 16 tasks started
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: Failed to invoke spank plugin stack
slurmstepd: error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1
slurmstepd: error: Failed to invoke spank plugin stack
srun: launch/slurm: _task_finish: Received task exit notification for 16 tasks of StepId=15.0 (status=0x0100).
srun: error: node004: tasks 0-15: Exited with exit code 1
srun: debug: task 0 done
srun: debug: task 1 done
srun: debug: task 2 done
srun: debug: task 3 done
srun: debug: task 4 done
srun: debug: task 5 done
srun: debug: task 6 done
srun: debug: task 7 done
srun: debug: task 8 done
srun: debug: task 9 done
srun: debug: task 10 done
srun: debug: task 11 done
srun: debug: task 12 done
srun: debug: task 13 done
srun: debug: task 14 done
srun: debug: task 15 done
srun: debug: IO thread exiting
srun: debug: mpi/pmix_v3: _conn_readable: (null) [0]: pmixp_agent.c:103: false, shutdown
srun: debug: mpi/pmix_v3: _pmix_abort_thread: (null) [0]: pmixp_agent.c:354: Abort thread exit
srun: debug: Leaving _msg_thr_internal
[root@master88 ~]# cat /etc/centos-release
Rocky Linux release 8.6 (Green Obsidian)