@mdevries1 @gkloosterman, there is an issue when running NVIDIA container images on our BCM Kubernetes cluster: the containers fail with a libcuda error. Please see the logs below for details.
[root@master88 ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
imagenet 0/1 Error 0 8s
[root@master88 ~]# kubectl describe pod imagenet
Name: imagenet
Namespace: default
Priority: 0
Node: node002/192.168.61.92
Start Time: Thu, 20 Apr 2023 10:34:02 +0900
Labels: app=imagenet
devcloud=autorun
Annotations: cni.projectcalico.org/containerID: 71ce9e570739342da99c38af4fdf5c8ca17faddc3b3b5715c07bbe8b4394a7ef
cni.projectcalico.org/podIP:
cni.projectcalico.org/podIPs:
Status: Failed
IP: 172.29.112.137
IPs:
IP: 172.29.112.137
Containers:
imagenet:
Container ID: containerd://9816339387560a7108e51af6abc94a7928602993b43fbbf376db7300ebe7d4aa
Image: 192.168.61.4:5000/nvidia_rn50_mx:0.2
Image ID: 192.168.61.4:5000/nvidia_rn50_mx@sha256:3ce82feea033006c7f1488babc5f3839defa5df0b85177e9d7fabc23b91e8a11
Port: <none>
Host Port: <none>
Command:
/bin/sh
-c
Args:
./scripts/prepare_imagenet.sh /data/imagenet/train-val-recordio-passthrough/ILSVRC/Data/CLS-LOC/ /data/imagenet/train-val-recordio-passthrough/ILSVRC/Data/CLS-LOC/ 80
State: Terminated
Reason: Error
Exit Code: 1
Started: Thu, 20 Apr 2023 10:34:03 +0900
Finished: Thu, 20 Apr 2023 10:34:03 +0900
Ready: False
Restart Count: 0
Environment: <none>
Mounts:
/data/imagenet/train-val-recordio-passthrough from imagenet-volume (rw)
/tmp from imagenet-log (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-8875p (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
imagenet-volume:
Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
ClaimName: mlperf-imagenet-lustre-p1743-n4-gpu-image-pvc
ReadOnly: false
imagenet-log:
Type: HostPath (bare host directory volume)
Path: /mnt/
HostPathType:
kube-api-access-8875p:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: kubernetes.io/hostname=node002
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 20s default-scheduler Successfully assigned default/imagenet to node002
Warning DNSConfigForming 19s (x2 over 20s) kubelet Search Line limits were exceeded, some search paths have been omitted, the applied search line is: default.svc.cluster.local svc.cluster.local cluster.local cm.cluster brightcomputing.com idrac.cluster
Normal Pulled 19s kubelet Container image "192.168.61.4:5000/nvidia_rn50_mx:0.2" already present on machine
Normal Created 19s kubelet Created container imagenet
Normal Started 19s kubelet Started container imagenet
[root@master88 ~]# kubectl logs imagenet --timestamps
2023-04-20T10:34:03.377991374+09:00 Traceback (most recent call last):
2023-04-20T10:34:03.378009893+09:00 File "/opt/mxnet/tools/im2rec.py", line 26, in <module>
2023-04-20T10:34:03.378013003+09:00 import mxnet as mx
2023-04-20T10:34:03.378015060+09:00 File "/opt/mxnet/python/mxnet/__init__.py", line 23, in <module>
2023-04-20T10:34:03.378016898+09:00 from .context import Context, current_context, cpu, gpu, cpu_pinned
2023-04-20T10:34:03.378018634+09:00 File "/opt/mxnet/python/mxnet/context.py", line 23, in <module>
2023-04-20T10:34:03.378020679+09:00 from .base import classproperty, with_metaclass, _MXClassPropertyMetaClass
2023-04-20T10:34:03.378022980+09:00 File "/opt/mxnet/python/mxnet/base.py", line 356, in <module>
2023-04-20T10:34:03.378024591+09:00 _LIB = _load_lib()
2023-04-20T10:34:03.378026233+09:00 File "/opt/mxnet/python/mxnet/base.py", line 347, in _load_lib
2023-04-20T10:34:03.378027899+09:00 lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL)
2023-04-20T10:34:03.378029504+09:00 File "/usr/lib/python3.8/ctypes/__init__.py", line 373, in __init__
2023-04-20T10:34:03.378031151+09:00 self._handle = _dlopen(self._name, mode)
2023-04-20T10:34:03.378032848+09:00 OSError: /usr/lib/x86_64-linux-gnu/libcuda.so.1: file too short
@mdevries1 @gkloosterman, this looks like a libcuda version issue. How can we expose the host's libcuda to the pods so that they see the library version matching the NVIDIA driver installed on the host?
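My understanding is that the host's libcuda.so.1 is only injected into the container when the pod is started through the NVIDIA container runtime with a GPU allocated, and the pod above does not request any GPU resources at all (its QoS class is BestEffort). Below is a rough sketch of what I plan to try; the nvidia.com/gpu resource name assumes the standard NVIDIA device plugin is deployed, the commented-out runtimeClassName assumes an "nvidia" RuntimeClass exists on the cluster, and I have left out the /tmp hostPath mount for brevity:

kubectl delete pod imagenet
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: imagenet
  labels:
    app: imagenet
spec:
  restartPolicy: Never
  # runtimeClassName: nvidia        # only if the cluster defines an "nvidia" RuntimeClass
  nodeSelector:
    kubernetes.io/hostname: node002
  containers:
  - name: imagenet
    image: 192.168.61.4:5000/nvidia_rn50_mx:0.2
    command: ["/bin/sh", "-c"]
    args: ["./scripts/prepare_imagenet.sh /data/imagenet/train-val-recordio-passthrough/ILSVRC/Data/CLS-LOC/ /data/imagenet/train-val-recordio-passthrough/ILSVRC/Data/CLS-LOC/ 80"]
    resources:
      limits:
        nvidia.com/gpu: 1           # assumes the NVIDIA device plugin advertises this resource
    volumeMounts:
    - name: imagenet-volume
      mountPath: /data/imagenet/train-val-recordio-passthrough
  volumes:
  - name: imagenet-volume
    persistentVolumeClaim:
      claimName: mlperf-imagenet-lustre-p1743-n4-gpu-image-pvc
EOF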
The output below is from the master node and from the pod running on compute node node002.
[root@master88 ~]# find / -name libcuda.so.1
/cm/local/apps/cuda-driver/libs/515.65.01/lib/libcuda.so.1
/cm/local/apps/cuda-driver/libs/515.65.01/lib64/libcuda.so.1
/cm/shared/apps/cuda11.7/toolkit/11.7.1/compat/libcuda.so.1
/cm/shared/apps/cuda12.0/toolkit/12.0.1/compat/libcuda.so.1
/cm/images/default-image/cm/local/apps/cuda-driver/libs/515.65.01/lib/libcuda.so.1
/cm/images/default-image/cm/local/apps/cuda-driver/libs/515.65.01/lib64/libcuda.so.1
/cm/images/default-image-node002/cm/local/apps/cuda-driver/libs/525.85.12/lib/libcuda.so.1
/cm/images/default-image-node002/cm/local/apps/cuda-driver/libs/525.85.12/lib64/libcuda.so.1
/cm/images/default-image-node002/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/127/fs/usr/local/cuda-11.8/compat/lib.real/libcuda.so.1
/cm/images/default-image-node004/cm/local/apps/cuda-driver/libs/525.85.12/lib/libcuda.so.1
/cm/images/default-image-node004/cm/local/apps/cuda-driver/libs/525.85.12/lib64/libcuda.so.1
/cm/images/default-image-node003/cm/local/apps/cuda-driver/libs/525.85.12/lib/libcuda.so.1
/cm/images/default-image-node003/cm/local/apps/cuda-driver/libs/525.85.12/lib64/libcuda.so.1
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
imagenet 1/1 Running 0 3h56m
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]#
[root@master88 ~]# kubectl exec -it imagenet bash
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
root@imagenet:/workspace/rn50#
root@imagenet:/workspace/rn50#
root@imagenet:/workspace/rn50#
root@imagenet:/workspace/rn50# echo $LD_LIBRARY_PATH
/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/lib:/usr/local/cuda/lib64
root@imagenet:/workspace/rn50#
root@imagenet:/workspace/rn50#
root@imagenet:/workspace/rn50# find / -name libcuda.so.1
/usr/lib/x86_64-linux-gnu/libcuda.so.1
/usr/local/cuda-11.8/compat/lib.real/libcuda.so.1
/usr/local/cuda-11.8/targets/x86_64-linux/lib/libcuda.so.1
root@imagenet:/workspace/rn50#
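For what it is worth, this is a quick check I can run from that same shell to see which of those three copies is a real library and which is just a placeholder (my guess at why dlopen reports "file too short"):

# sizes and resolved targets of the three libcuda.so.1 copies found above
ls -lL /usr/lib/x86_64-linux-gnu/libcuda.so.1 /usr/local/cuda-11.8/compat/lib.real/libcuda.so.1 /usr/local/cuda-11.8/targets/x86_64-linux/lib/libcuda.so.1
# a valid library should print an ELF header here (assuming readelf/binutils is present in the image)
readelf -h /usr/lib/x86_64-linux-gnu/libcuda.so.1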
The output below is from compute node node002.
[root@node002 ~]# nvidia-smi
Thu Apr 20 20:19:39 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100 80G... On | 00000000:17:00.0 Off | 0 |
| N/A 37C P0 45W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA A100 80G... On | 00000000:65:00.0 Off | 0 |
| N/A 40C P0 43W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA A100 80G... On | 00000000:CA:00.0 Off | 0 |
| N/A 34C P0 42W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA A100 80G... On | 00000000:E3:00.0 Off | 0 |
| N/A 33C P0 41W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
[root@node002 ~]#
[root@node002 ~]#
[root@node002 ~]# find / -name libcuda.so.1
/cm/local/apps/cuda-driver/libs/525.85.12/lib/libcuda.so.1
/cm/local/apps/cuda-driver/libs/525.85.12/lib64/libcuda.so.1
/cm/shared/apps/cuda11.7/toolkit/11.7.1/compat/libcuda.so.1
/cm/shared/apps/cuda12.0/toolkit/12.0.1/compat/libcuda.so.1
/run/containerd/io.containerd.runtime.v2.task/k8s.io/25843041f7b4fa4263bcbea5983efd8535321d3052311a4d95eae486ed9a7e1e/rootfs/usr/lib/x86_64-linux-gnu/libcuda.so.1
/run/containerd/io.containerd.runtime.v2.task/k8s.io/25843041f7b4fa4263bcbea5983efd8535321d3052311a4d95eae486ed9a7e1e/rootfs/usr/local/cuda-11.8/compat/lib.real/libcuda.so.1
/run/containerd/io.containerd.runtime.v2.task/k8s.io/25843041f7b4fa4263bcbea5983efd8535321d3052311a4d95eae486ed9a7e1e/rootfs/usr/local/cuda-11.8/targets/x86_64-linux/lib/stubs/libcuda.so.1
/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/153/fs/usr/local/cuda-11.8/targets/x86_64-linux/lib/stubs/libcuda.so.1
/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/154/fs/usr/local/cuda-11.8/targets/x86_64-linux/lib/stubs/libcuda.so.1
/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/158/fs/usr/local/cuda-11.8/targets/x86_64-linux/lib/stubs/libcuda.so.1
/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/177/fs/usr/lib/x86_64-linux-gnu/libcuda.so.1
/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/127/fs/usr/local/cuda-11.8/compat/lib.real/libcuda.so.1
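Also, in case it is relevant: on node002 itself, this is how I believe I can see what the NVIDIA container runtime hook would inject (assuming the nvidia-container-toolkit / nvidia-container-cli is installed on the node):

# driver and device information as seen by the container toolkit
nvidia-container-cli info
# driver files the runtime hook would mount into a container
nvidia-container-cli list | grep -i libcuda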
@mdevries1 @gkloosterman, this issue is really hurting us. I am still not able to expose the host CUDA driver library to the pod so that mxnet can be imported.
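From the master I am also checking whether the GPU stack is wired in at the Kubernetes level (the names below assume the standard NVIDIA device plugin deployment, which may be named differently on BCM):

# is an nvidia RuntimeClass defined, and is the device plugin running?
kubectl get runtimeclass
kubectl -n kube-system get pods -o wide | grep -i nvidia
# does node002 advertise GPU resources to the scheduler?
kubectl describe node node002 | grep -i 'nvidia.com/gpu'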
Please help !!!
Have you updated the cmdaemon packages as they asked you to?
This was not troublesome for us to do, but we’re using a different flavour.
You might also want to take a peek here: About the Bright Cluster Manager User forum