edge01,docker,cri-docker,
root@zckmaster:/home/zck# kubectl get pods -A -o wide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default gpu-deployment-64f6b8cb49-vsr82 0/1 RunContainerError 55 (12s ago) 3h56m 10.88.0.31 edge01
kube-flannel kube-flannel-cloud-ds-vks67 1/1 Running 5 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system coredns-66f779496c-t96dj 1/1 Running 4 (8h ago) 3d8h 10.244.0.20 zckmaster
kube-system coredns-66f779496c-vwvhk 1/1 Running 4 (8h ago) 3d8h 10.244.0.19 zckmaster
kube-system etcd-zckmaster 1/1 Running 4 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system kube-apiserver-zckmaster 1/1 Running 8 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system kube-controller-manager-zckmaster 1/1 Running 6 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system kube-proxy-jgmjm 1/1 Running 4 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system kube-scheduler-zckmaster 1/1 Running 5 (8h ago) 3d8h 192.168.31.155 zckmaster
kube-system metrics-server-98cfbdb89-cv5jw 1/1 Running 2 (8h ago) 32h 192.168.31.155 zckmaster
kube-system nvidia-device-plugin-daemonset-24q4f 0/1 Terminating 0 2d6h nvidia-desktop
kube-system nvidia-device-plugin-daemonset-d6zq5 1/1 Running 0 3h58m 10.88.0.30 edge01
kubeedge cloudcore-6c6b957ff8-wkcrt 1/1 Running 16 (8h ago) 9h 192.168.31.155 zckmaster
kubeedge edge-eclipse-mosquitto-2dpf2 1/1 Running 6 (34h ago) 2d10h 192.168.31.175 nvidia-desktop
kubeedge edge-eclipse-mosquitto-ddw89 1/1 Running 3 (5h33m ago) 32h 192.168.31.175 edge01
kubeedge edge-eclipse-mosquitto-r4zwr 1/1 Running 1 (8h ago) 3d6h 192.168.159.186 zck-virtual-machine
root@zckmaster:/home/zck#
The pod gpu-deployment-64f6b8cb49-vsr82 is failing (RunContainerError / CrashLoopBackOff):
root@zckmaster:/home/zck# kubectl describe pod gpu-deployment-64f6b8cb49-vsr82
Name: gpu-deployment-64f6b8cb49-vsr82
Namespace: default
Priority: 0
Service Account: default
Node: edge01/192.168.31.175
Start Time: Fri, 25 Jul 2025 20:30:02 +0800
Labels: app=gpu
pod-template-hash=64f6b8cb49
Annotations:
Status: Running
IP: 10.88.0.31
IPs:
IP: 10.88.0.31
Controlled By: ReplicaSet/gpu-deployment-64f6b8cb49
Containers:
gpu-container:
Container ID: docker://a63efd122233f14aa86bf09c1effe02f4b62f46cb3888e0a272089cc838e427e
Image: mnist:2.0
Image ID: docker://sha256:152ccec105291c7dc8aecab2cd0344aa3b7e708bf1af0af351505571297ab038
Port: 80/TCP
Host Port: 0/TCP
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: ContainerCannotRun
Message: failed to create task for container: failed to create shim task: OCI runtime create failed: could not apply required modification to OCI specification: error modifying OCI spec: failed to inject CDI devices: unresolvable CDI devices nvidia.com/gpu=tegra: unknown
Exit Code: 128
Started: Sat, 26 Jul 2025 00:26:14 +0800
Finished: Sat, 26 Jul 2025 00:26:14 +0800
Ready: False
Restart Count: 55
Limits:
cpu: 1
memory: 512Mi
nvidia.com/gpu: 1
Requests:
cpu: 1
memory: 512Mi
nvidia.com/gpu: 1
Environment:
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-7xbb6 (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
kube-api-access-7xbb6:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional:
DownwardAPI: true
QoS Class: Guaranteed
Node-Selectors:
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
root@zckmaster:/home/zck#
The pod nvidia-device-plugin-daemonset-d6zq5 on edge01 is running; its logs are below:
root@zckmaster:/home/zck# kubectl logs nvidia-device-plugin-daemonset-d6zq5 -n kube-system
I0725 12:27:59.174360 1 main.go:199] Starting FS watcher.
I0725 12:27:59.174677 1 main.go:206] Starting OS watcher.
I0725 12:27:59.175839 1 main.go:221] Starting Plugins.
I0725 12:27:59.175876 1 main.go:278] Loading configuration.
I0725 12:27:59.179561 1 main.go:303] Updating config with default resource matching patterns.
I0725 12:27:59.179967 1 main.go:314]
Running with config:
{
“version”: “v1”,
“flags”: {
“migStrategy”: “none”,
“failOnInitError”: false,
“mpsRoot”: “”,
“nvidiaDriverRoot”: “/”,
“nvidiaDevRoot”: “/”,
“gdsEnabled”: false,
“mofedEnabled”: false,
“useNodeFeatureAPI”: null,
“deviceDiscoveryStrategy”: “auto”,
“plugin”: {
“passDeviceSpecs”: false,
“deviceListStrategy”: [
“envvar”
],
“deviceIDStrategy”: “uuid”,
“cdiAnnotationPrefix”: “cdi.k8s.io/”,
“nvidiaCTKPath”: “/usr/bin/nvidia-ctk”,
“containerDriverRoot”: “/driver-root”
}
},
“resources”: {
“gpus”: [
{
“pattern”: “*”,
“name”: “nvidia.com/gpu”
}
]
},
“sharing”: {
“timeSlicing”: {}
}
}
I0725 12:27:59.179988 1 main.go:317] Retrieving plugins.
I0725 12:27:59.200510 1 server.go:216] Starting GRPC server for ‘nvidia.com/gpu’
I0725 12:27:59.201963 1 server.go:147] Starting to serve ‘nvidia.com/gpu’ on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
I0725 12:27:59.205694 1 server.go:154] Registered device plugin for ‘nvidia.com/gpu’ with Kubelet
I0725 16:10:13.496282 1 main.go:246] inotify: /var/lib/kubelet/device-plugins/kubelet.sock created, restarting.
I0725 16:10:13.496431 1 main.go:353] Stopping plugins.
I0725 16:10:13.496455 1 server.go:185] Stopping to serve ‘nvidia.com/gpu’ on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
I0725 16:10:13.496807 1 main.go:221] Starting Plugins.
I0725 16:10:13.496825 1 main.go:278] Loading configuration.
I0725 16:10:13.497593 1 main.go:303] Updating config with default resource matching patterns.
I0725 16:10:13.497829 1 main.go:314]
Running with config:
{
“version”: “v1”,
“flags”: {
“migStrategy”: “none”,
“failOnInitError”: false,
“mpsRoot”: “”,
“nvidiaDriverRoot”: “/”,
“nvidiaDevRoot”: “/”,
“gdsEnabled”: false,
“mofedEnabled”: false,
“useNodeFeatureAPI”: null,
“deviceDiscoveryStrategy”: “auto”,
“plugin”: {
“passDeviceSpecs”: false,
“deviceListStrategy”: [
“envvar”
],
“deviceIDStrategy”: “uuid”,
“cdiAnnotationPrefix”: “cdi.k8s.io/”,
“nvidiaCTKPath”: “/usr/bin/nvidia-ctk”,
“containerDriverRoot”: “/driver-root”
}
},
“resources”: {
“gpus”: [
{
“pattern”: “*”,
“name”: “nvidia.com/gpu”
}
]
},
“sharing”: {
“timeSlicing”: {}
}
}
I0725 16:10:13.497854 1 main.go:317] Retrieving plugins.
I0725 16:10:13.517453 1 server.go:216] Starting GRPC server for ‘nvidia.com/gpu’
I0725 16:10:13.519003 1 server.go:147] Starting to serve ‘nvidia.com/gpu’ on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
I0725 16:10:13.525814 1 server.go:154] Registered device plugin for ‘nvidia.com/gpu’ with Kubelet
root@zckmaster:/home/zck#
I don't know what is wrong. Please help me.