From d588532c9ba4e6e70c7328ce8672e44cc26cd6b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kr=C3=BCger?= <ak@patientsky.com>
Date: Tue, 23 Apr 2019 23:46:02 +0200
Subject: [PATCH] Update probe timeouts, delays etc. (#4612)

* Fix merge conflict

* Add check delay

* Add more liveness and readiness options to metrics-server
---
 .../provision/templates/glusterfs-daemonset.json.j2  |  4 ++--
 .../provision/templates/heketi-bootstrap.json.j2     |  2 +-
 .../provision/templates/heketi-deployment.json.j2    |  2 +-
 .../ansible/templates/coredns-deployment.yml.j2      |  6 ++----
 .../ansible/templates/dashboard.yml.j2               |  2 --
 .../ansible/templates/nodelocaldns-daemonset.yml.j2  | 12 ++++++++++--
 .../templates/ds-ingress-nginx-controller.yml.j2     | 10 +++++-----
 .../templates/metrics-server-deployment.yaml.j2      | 10 ++++------
 .../calico/templates/calico-node.yml.j2              |  5 ++---
 .../canal/templates/canal-node.yaml.j2               |  2 +-
 .../network_plugin/cilium/templates/cilium-ds.yml.j2 |  5 +----
 .../kube-router/templates/kube-router.yml.j2         |  2 +-
 tests/files/gce_centos7-flannel-addons.yml           |  1 +
 tests/files/packet_centos7-flannel-addons.yml        |  3 +--
 14 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/contrib/network-storage/heketi/roles/provision/templates/glusterfs-daemonset.json.j2 b/contrib/network-storage/heketi/roles/provision/templates/glusterfs-daemonset.json.j2
index eddd57eb8..74c031ffe 100644
--- a/contrib/network-storage/heketi/roles/provision/templates/glusterfs-daemonset.json.j2
+++ b/contrib/network-storage/heketi/roles/provision/templates/glusterfs-daemonset.json.j2
@@ -69,7 +69,7 @@
                         },
                         "readinessProbe": {
                             "timeoutSeconds": 3,
-                            "initialDelaySeconds": 60,
+                            "initialDelaySeconds": 3,
                             "exec": {
                                 "command": [
                                     "/bin/bash",
@@ -80,7 +80,7 @@
                         },
                         "livenessProbe": {
                             "timeoutSeconds": 3,
-                            "initialDelaySeconds": 60,
+                            "initialDelaySeconds": 10,
                             "exec": {
                                 "command": [
                                     "/bin/bash",
diff --git a/contrib/network-storage/heketi/roles/provision/templates/heketi-bootstrap.json.j2 b/contrib/network-storage/heketi/roles/provision/templates/heketi-bootstrap.json.j2
index bdcf3e958..43048c6b6 100644
--- a/contrib/network-storage/heketi/roles/provision/templates/heketi-bootstrap.json.j2
+++ b/contrib/network-storage/heketi/roles/provision/templates/heketi-bootstrap.json.j2
@@ -106,7 +106,7 @@
                 },
                 "livenessProbe": {
                   "timeoutSeconds": 3,
-                  "initialDelaySeconds": 30,
+                  "initialDelaySeconds": 10,
                   "httpGet": {
                     "path": "/hello",
                     "port": 8080
diff --git a/contrib/network-storage/heketi/roles/provision/templates/heketi-deployment.json.j2 b/contrib/network-storage/heketi/roles/provision/templates/heketi-deployment.json.j2
index 5eb71718c..247f1fd9d 100644
--- a/contrib/network-storage/heketi/roles/provision/templates/heketi-deployment.json.j2
+++ b/contrib/network-storage/heketi/roles/provision/templates/heketi-deployment.json.j2
@@ -122,7 +122,7 @@
                 },
                 "livenessProbe": {
                   "timeoutSeconds": 3,
-                  "initialDelaySeconds": 30,
+                  "initialDelaySeconds": 10,
                   "httpGet": {
                     "path": "/hello",
                     "port": 8080
diff --git a/roles/kubernetes-apps/ansible/templates/coredns-deployment.yml.j2 b/roles/kubernetes-apps/ansible/templates/coredns-deployment.yml.j2
index fd7bfc9fa..c20e6cf16 100644
--- a/roles/kubernetes-apps/ansible/templates/coredns-deployment.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/coredns-deployment.yml.j2
@@ -91,19 +91,17 @@ spec:
             path: /health
             port: 8080
             scheme: HTTP
-          initialDelaySeconds: 60
           timeoutSeconds: 5
           successThreshold: 1
-          failureThreshold: 5
+          failureThreshold: 10
         readinessProbe:
           httpGet:
             path: /health
             port: 8080
             scheme: HTTP
-          initialDelaySeconds: 60
           timeoutSeconds: 5
           successThreshold: 1
-          failureThreshold: 5
+          failureThreshold: 10
       dnsPolicy: Default
       volumes:
         - name: config-volume
diff --git a/roles/kubernetes-apps/ansible/templates/dashboard.yml.j2 b/roles/kubernetes-apps/ansible/templates/dashboard.yml.j2
index f079bf122..c14d65af1 100644
--- a/roles/kubernetes-apps/ansible/templates/dashboard.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/dashboard.yml.j2
@@ -184,8 +184,6 @@ spec:
             scheme: HTTPS
             path: /
             port: 8443
-          initialDelaySeconds: 30
-          timeoutSeconds: 30
       volumes:
       - name: kubernetes-dashboard-certs
         secret:
diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
index 204a2838b..96e404edc 100644
--- a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
@@ -60,10 +60,18 @@ spec:
             path: /health
             port: 8080
             scheme: HTTP
-          initialDelaySeconds: 60
           timeoutSeconds: 5
           successThreshold: 1
-          failureThreshold: 3
+          failureThreshold: 10
+        readinessProbe:
+          httpGet:
+            host: {{ nodelocaldns_ip }}
+            path: /health
+            port: 8080
+            scheme: HTTP
+          timeoutSeconds: 5
+          successThreshold: 1
+          failureThreshold: 10
         volumeMounts:
         - name: config-volume
           mountPath: /etc/coredns
diff --git a/roles/kubernetes-apps/ingress_controller/ingress_nginx/templates/ds-ingress-nginx-controller.yml.j2 b/roles/kubernetes-apps/ingress_controller/ingress_nginx/templates/ds-ingress-nginx-controller.yml.j2
index 792a3f55c..06a9a9018 100644
--- a/roles/kubernetes-apps/ingress_controller/ingress_nginx/templates/ds-ingress-nginx-controller.yml.j2
+++ b/roles/kubernetes-apps/ingress_controller/ingress_nginx/templates/ds-ingress-nginx-controller.yml.j2
@@ -76,16 +76,15 @@ spec:
               path: /healthz
               port: 10254
               scheme: HTTP
-            initialDelaySeconds: 10
-            periodSeconds: 10
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
             successThreshold: 1
-            timeoutSeconds: 1
+            failureThreshold: 10
           readinessProbe:
-            failureThreshold: 3
             httpGet:
               path: /healthz
               port: 10254
               scheme: HTTP
-            periodSeconds: 10
+            timeoutSeconds: 5
             successThreshold: 1
-            timeoutSeconds: 1
+            failureThreshold: 10
diff --git a/roles/kubernetes-apps/metrics_server/templates/metrics-server-deployment.yaml.j2 b/roles/kubernetes-apps/metrics_server/templates/metrics-server-deployment.yaml.j2
index aa54bd373..69bb0f7ab 100644
--- a/roles/kubernetes-apps/metrics_server/templates/metrics-server-deployment.yaml.j2
+++ b/roles/kubernetes-apps/metrics_server/templates/metrics-server-deployment.yaml.j2
@@ -44,24 +44,22 @@ spec:
           name: https
           protocol: TCP
         livenessProbe:
-          failureThreshold: 3
           httpGet:
             path: /healthz
             port: https
             scheme: HTTPS
-          initialDelaySeconds: 30
-          periodSeconds: 30
           successThreshold: 1
+          initialDelaySeconds: 20
+          failureThreshold: 3
           timeoutSeconds: 10
         readinessProbe:
-          failureThreshold: 3
           httpGet:
             path: /healthz
             port: 443
             scheme: HTTPS
-          initialDelaySeconds: 30
-          periodSeconds: 30
           successThreshold: 1
+          initialDelaySeconds: 20
+          failureThreshold: 3
           timeoutSeconds: 10
         securityContext:
           # Currently non root is not supported:
diff --git a/roles/network_plugin/calico/templates/calico-node.yml.j2 b/roles/network_plugin/calico/templates/calico-node.yml.j2
index f851d3930..d8ea78a44 100644
--- a/roles/network_plugin/calico/templates/calico-node.yml.j2
+++ b/roles/network_plugin/calico/templates/calico-node.yml.j2
@@ -218,10 +218,10 @@ spec:
               host: 127.0.0.1
               path: /liveness
               port: 9099
-            periodSeconds: 10
-            initialDelaySeconds: 10
+            initialDelaySeconds: 5
             failureThreshold: 6
           readinessProbe:
+            failureThreshold: 6
 {% if calico_version is version('v3.3.0', '<') %}
             httpGet:
               host: 127.0.0.1
@@ -234,7 +234,6 @@ spec:
               - -bird-ready
               - -felix-ready
 {% endif %}
-            periodSeconds: 10
           volumeMounts:
             - mountPath: /lib/modules
               name: lib-modules
diff --git a/roles/network_plugin/canal/templates/canal-node.yaml.j2 b/roles/network_plugin/canal/templates/canal-node.yaml.j2
index 7d9da1cf2..7b3cba83e 100644
--- a/roles/network_plugin/canal/templates/canal-node.yaml.j2
+++ b/roles/network_plugin/canal/templates/canal-node.yaml.j2
@@ -253,7 +253,7 @@ spec:
               path: /liveness
               port: 9099
             periodSeconds: 10
-            initialDelaySeconds: 10
+            initialDelaySeconds: 5
             failureThreshold: 6
           readinessProbe:
 {% if calico_version is version('v3.3.0', '<')%}
diff --git a/roles/network_plugin/cilium/templates/cilium-ds.yml.j2 b/roles/network_plugin/cilium/templates/cilium-ds.yml.j2
index 21b167724..073da965d 100755
--- a/roles/network_plugin/cilium/templates/cilium-ds.yml.j2
+++ b/roles/network_plugin/cilium/templates/cilium-ds.yml.j2
@@ -132,10 +132,7 @@ spec:
               command:
                 - cilium
                 - status
-            # The initial delay for the liveness probe is intentionally large to
-            # avoid an endless kill & restart cycle if in the event that the initial
-            # bootstrapping takes longer than expected.
-            initialDelaySeconds: 120
+            initialDelaySeconds: 15
             failureThreshold: 10
             periodSeconds: 10
           readinessProbe:
diff --git a/roles/network_plugin/kube-router/templates/kube-router.yml.j2 b/roles/network_plugin/kube-router/templates/kube-router.yml.j2
index 2e50fd171..52fd47ae1 100644
--- a/roles/network_plugin/kube-router/templates/kube-router.yml.j2
+++ b/roles/network_plugin/kube-router/templates/kube-router.yml.j2
@@ -104,7 +104,7 @@ spec:
           httpGet:
             path: /healthz
             port: 20244
-          initialDelaySeconds: 10
+          initialDelaySeconds: 5
           periodSeconds: 3
         resources:
           requests:
diff --git a/tests/files/gce_centos7-flannel-addons.yml b/tests/files/gce_centos7-flannel-addons.yml
index d430d8436..cf9fc5d7a 100644
--- a/tests/files/gce_centos7-flannel-addons.yml
+++ b/tests/files/gce_centos7-flannel-addons.yml
@@ -22,6 +22,7 @@ kube_encrypt_secret_data: true
 cert_manager_enabled: true
 # Disabled temporarily
 metrics_server_enabled: false
+metrics_server_kubelet_insecure_tls: true
 kube_token_auth: true
 kube_basic_auth: true
 enable_nodelocaldns: false
diff --git a/tests/files/packet_centos7-flannel-addons.yml b/tests/files/packet_centos7-flannel-addons.yml
index 451f414e8..2979e6b14 100644
--- a/tests/files/packet_centos7-flannel-addons.yml
+++ b/tests/files/packet_centos7-flannel-addons.yml
@@ -19,8 +19,7 @@ ingress_nginx_enabled: true
 cert_manager_enabled: true
 # Disabled temporarily
 metrics_server_enabled: false
+metrics_server_kubelet_insecure_tls: true
 kube_token_auth: true
 kube_basic_auth: true
 enable_nodelocaldns: false
-
-vm_memory: 6144Mi
-- 
GitLab