Add liveness&readiness probes for NFD master and topologyUpdater

Signed-off-by: Omer Aplatony <omerap12@gmail.com>
kubernetes-sigs · Jul 19, 2024 · dd4e88b · dd4e88b
1 parent 51f2794
commit dd4e88b
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 37 deletions.
diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml b/deployment/helm/node-feature-discovery/templates/master.yaml
@@ -46,16 +46,9 @@ spec:
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
           livenessProbe:
-            grpc:
-              port: 8082
-            initialDelaySeconds: 10
-            periodSeconds: 10
+            {{- toYaml .Values.master.livenessProbe | nindent 12 }}
           readinessProbe:
-            grpc:
-              port: 8082
-            initialDelaySeconds: 5
-            periodSeconds: 10
-            failureThreshold: 10
+            {{- toYaml .Values.master.readinessProbe | nindent 12 }}
           ports:
           - containerPort: {{ .Values.master.port | default "8080" }}
             name: grpc

diff --git a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml
@@ -43,16 +43,9 @@ spec:
         image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
         imagePullPolicy: "{{ .Values.image.pullPolicy }}"
         livenessProbe:
-          grpc:
-            port: 8082
-          initialDelaySeconds: 10
-          periodSeconds: 10
+          {{- toYaml .Values.topologyUpdater.livenessProbe | nindent 10 }}
         readinessProbe:
-          grpc:
-            port: 8082
-          initialDelaySeconds: 5
-          periodSeconds: 10
-          failureThreshold: 10
+          {{- toYaml .Values.topologyUpdater.readinessProbe | nindent 10 }}
         env:
         - name: NODE_NAME
           valueFrom:

diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml
@@ -140,6 +140,18 @@ master:
               - key: "node-role.kubernetes.io/control-plane"
                 operator: In
                 values: [""]
+
+  livenessProbe:
+    grpc:
+      port: 8082
+    initialDelaySeconds: 10
+    periodSeconds: 10
+  readinessProbe:
+    grpc:
+      port: 8082
+    initialDelaySeconds: 5
+    periodSeconds: 10
+    failureThreshold: 10
 
 worker:
   enable: true
@@ -492,20 +504,19 @@ topologyUpdater:
       drop: [ "ALL" ]
     readOnlyRootFilesystem: true
     runAsUser: 0
-
-  # livenessProbe: {}
-    ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation.
-    # grpc:
-    #   port: 8082
-    # initialDelaySeconds: 10
-    # periodSeconds: 10
-  # readinessProbe: {}
-    ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation.
-    # grpc:
-    #   port: 8082
-    # initialDelaySeconds: 5
-    # periodSeconds: 10
-    # failureThreshold: 10
+
+  livenessProbe:
+    grpc:
+      port: 8082
+    initialDelaySeconds: 10
+    periodSeconds: 10
+
+  readinessProbe:
+    grpc:
+      port: 8082
+    initialDelaySeconds: 5
+    periodSeconds: 10
+    failureThreshold: 10
 
   resources:
     limits:

diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md
@@ -134,16 +134,18 @@ API's you need to install the prometheus operator in your cluster.
 | `master.rbac.create`                | bool    | true                             | Specifies whether to create [RBAC][rbac] configuration for nfd-master                                                                                                                                  |
 | `master.service.type`               | string  | ClusterIP                        | NFD master service type. **NOTE**: this parameter is related to the deprecated gRPC API and will be removed with it in a future release                                                                |
 | `master.service.port`               | integer | 8080                             | NFD master service port. **NOTE**: this parameter is related to the deprecated gRPC API and will be removed with it in a future release                                                                |
-| `master.resources.limits`           | dict    | {memory: 4Gi}                    | NFD master pod [resources limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits)                                                                  |
-| `master.resources.requests`         | dict    | {cpu: 100m, memory: 128Mi}       | NFD master pod [resources requests](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits). You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes. If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod.  [Natan Yellin 22/09/2022](https://home.robusta.dev/blog/kubernetes-memory-limit) that discusses this issue. |
-| `master.tolerations`                | dict    | _Schedule to control-plane node_ | NFD master pod [tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)                                                                                            |
-| `master.annotations`                | dict    | {}                               | NFD master pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)                                                                                           |
-| `master.affinity`                   | dict    |                                  | NFD master pod required [node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/)                                                               |
+| `master.resources.limits`           | dict    | {memory: 4Gi}                    | NFD master pod [resources limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits)                                                                  |
+| `master.resources.requests`         | dict    | {cpu: 100m, memory: 128Mi}       | NFD master pod [resources requests](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits). You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes. If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod.  [Natan Yellin 22/09/2022](https://home.robusta.dev/blog/kubernetes-memory-limit) that discusses this issue. |
+| `master.tolerations`                | dict    | _Schedule to control-plane node_ | NFD master pod [tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)                                                                                            |
+| `master.annotations`                | dict    | {}                               | NFD master pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)                                                                                           |
+| `master.affinity`                   | dict    |                                  | NFD master pod required [node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/)                                                               |
 | `master.deploymentAnnotations`      | dict    | {}                               | NFD master deployment [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)                                                                                    |
 | `master.nfdApiParallelism`          | integer | 10                               | Specifies the maximum number of concurrent node updates.                                                                                                                                               |
 | `master.config`                     | dict    |                                  | NFD master [configuration](../reference/master-configuration-reference)                                                                                                                                |
 | `master.args`                       | array   | []                               | Additional [command line arguments](../reference/master-commandline-reference.md) to pass to nfd-master                                                                                                |
 | `master.revisionHistoryLimit`       | integer |                                  | Specify how many old ReplicaSets for this Deployment you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#revision-history-limit)          |
+| `master.livenessProbe`              | dict    | {"grpc":{"port":8082},"initialDelaySeconds":10,"periodSeconds":10}                                 | NFD master pod [liveness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#liveness-probe)  |
+| `master.readinessProbe`             | dict    | {"grpc":{"port":8082},"initialDelaySeconds":5,"periodSeconds":10,"failureThreshold": 10}           | NFD master pod [readiness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#readiness-probe)|
 
 ### Worker pod parameters
 
@@ -200,6 +202,8 @@ API's you need to install the prometheus operator in your cluster.
 | `topologyUpdater.kubeletStateDir`             | string  | /var/lib/kubelet         | Specifies kubelet state directory path for watching state and checkpoint files. Empty value disables kubelet state tracking.                                                                     |
 | `topologyUpdater.args`                        | array   | []                       | Additional [command line arguments](../reference/topology-updater-commandline-reference.md) to pass to nfd-topology-updater                                                                      |
 | `topologyUpdater.revisionHistoryLimit`       | integer |                           | Specify how many old ControllerRevisions for this DaemonSet you want to retain. [revisionHistoryLimit](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/daemon-set-v1/#DaemonSetSpec)          |
+| `topologyUpdater.livenessProbe`              | dict    | {"grpc":{"port":8082},"initialDelaySeconds":10,"periodSeconds":10}                                 | Topology updater pod [liveness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#liveness-probe)  |
+| `topologyUpdater.readinessProbe`             | dict    | {"grpc":{"port":8082},"initialDelaySeconds":5,"periodSeconds":10,"failureThreshold": 10}           | Topology updater pod [readiness probe](https://kubernetes.io/docs/concepts/configuration/liveness-readiness-startup-probes/#readiness-probe)|
 
 ### Garbage collector parameters