From 039205560a5a38dac7e180ec4c46ce4edd404d39 Mon Sep 17 00:00:00 2001
From: Cristian Calin <6627509+cristicalin@users.noreply.github.com>
Date: Tue, 9 Nov 2021 19:57:47 +0200
Subject: [PATCH] nodelocaldns: allow a secondary pod for nodelocaldns for
 local-HA (#8100)

* nodelocaldns: allow a secondary pod for nodelocaldns for local-HA

* CI: add job to test nodelocaldns secondary
---
 .gitlab-ci/packet.yml                         |   5 +
 docs/dns-stack.md                             |  16 +++
 .../group_vars/k8s_cluster/k8s-cluster.yml    |   3 +
 roles/download/defaults/main.yml              |   2 +-
 .../kubernetes-apps/ansible/defaults/main.yml |   2 +
 roles/kubernetes-apps/ansible/tasks/main.yml  |   1 +
 .../ansible/tasks/nodelocaldns.yml            |  28 +++++
 .../templates/nodelocaldns-config.yml.j2      |  88 ++++++++++++++-
 .../templates/nodelocaldns-daemonset.yml.j2   |  32 ++++--
 .../nodelocaldns-second-daemonset.yml.j2      | 103 ++++++++++++++++++
 roles/kubespray-defaults/defaults/main.yaml   |   3 +
 ..._centos8-calico-nodelocaldns-secondary.yml |  15 +++
 12 files changed, 281 insertions(+), 17 deletions(-)
 create mode 100644 roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2
 create mode 100644 tests/files/packet_centos8-calico-nodelocaldns-secondary.yml

diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml
index 9b432a19a..6e72a4cd8 100644
--- a/.gitlab-ci/packet.yml
+++ b/.gitlab-ci/packet.yml
@@ -194,6 +194,11 @@ packet_amazon-linux-2-aio:
   extends: .packet_pr
   when: manual
 
+packet_centos8-calico-nodelocaldns-secondary:
+  stage: deploy-part2
+  extends: .packet_pr
+  when: manual
+
 packet_fedora34-kube-ovn-containerd:
   stage: deploy-part2
   extends: .packet_periodic
diff --git a/docs/dns-stack.md b/docs/dns-stack.md
index 7771c26bb..b6d2064a6 100644
--- a/docs/dns-stack.md
+++ b/docs/dns-stack.md
@@ -212,6 +212,22 @@ nodelocaldns_external_zones:
 
 See [dns_etchosts](#dns_etchosts-coredns) above.
 
+### Nodelocal DNS HA
+
+Under some circumstances the single POD nodelocaldns implementation may not be able to be replaced soon enough and a cluster upgrade or a nodelocaldns upgrade can cause DNS requests to time out for short intervals. If for any reason your applications cannot tolerate this behavior you can enable a redundant nodelocal DNS pod on each node:
+
+```yaml
+enable_nodelocaldns_secondary: true
+```
+
+**Note:** when the nodelocaldns secondary is enabled, the primary is instructed to no longer tear down the iptables rules it sets up to direct traffic to itself. In case both daemonsets have failing pods on the same node, this can cause a DNS blackout with traffic no longer being forwarded to the coredns central service as a fallback. Please ensure you account for this also if you decide to disable the nodelocaldns cache.
+
+There is a time delta (in seconds) allowed for the secondary nodelocaldns to survive in case both primary and secondary daemonsets are updated at the same time. It is advised to tune this variable after you have performed some tests in your own environment.
+
+```yaml
+nodelocaldns_secondary_skew_seconds: 5
+```
+
 ## Limitations
 
 * Kubespray has yet ways to configure Kubedns addon to forward requests SkyDns can
diff --git a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
index 4248832eb..dbd66d3dd 100644
--- a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
+++ b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
@@ -166,9 +166,12 @@ dns_mode: coredns
 # manual_dns_server: 10.x.x.x
 # Enable nodelocal dns cache
 enable_nodelocaldns: true
+enable_nodelocaldns_secondary: false
 nodelocaldns_ip: 169.254.25.10
 nodelocaldns_health_port: 9254
+nodelocaldns_second_health_port: 9256
 nodelocaldns_bind_metrics_host_ip: false
+nodelocaldns_secondary_skew_seconds: 5
 # nodelocaldns_external_zones:
 # - zones:
 #   - example.com
diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml
index 8e858bb3a..6d19e2324 100644
--- a/roles/download/defaults/main.yml
+++ b/roles/download/defaults/main.yml
@@ -610,7 +610,7 @@ coredns_image_is_namespaced: "{{ (kube_version is version('v1.21.0','>=')) or (c
 coredns_image_repo: "{{ kube_image_repo }}{{'/coredns/coredns' if (coredns_image_is_namespaced | bool) else '/coredns' }}"
 coredns_image_tag: "{{ coredns_version if (coredns_image_is_namespaced | bool) else (coredns_version | regex_replace('^v', '')) }}"
 
-nodelocaldns_version: "1.17.1"
+nodelocaldns_version: "1.21.1"
 nodelocaldns_image_repo: "{{ kube_image_repo }}/dns/k8s-dns-node-cache"
 nodelocaldns_image_tag: "{{ nodelocaldns_version }}"
 
diff --git a/roles/kubernetes-apps/ansible/defaults/main.yml b/roles/kubernetes-apps/ansible/defaults/main.yml
index 411260551..fa06b2e0d 100644
--- a/roles/kubernetes-apps/ansible/defaults/main.yml
+++ b/roles/kubernetes-apps/ansible/defaults/main.yml
@@ -17,6 +17,8 @@ nodelocaldns_cpu_requests: 100m
 nodelocaldns_memory_limit: 170Mi
 nodelocaldns_memory_requests: 70Mi
 nodelocaldns_ds_nodeselector: "kubernetes.io/os: linux"
+nodelocaldns_prometheus_port: 9253
+nodelocaldns_secondary_prometheus_port: 9255
 
 # Limits for dns-autoscaler
 dns_autoscaler_cpu_requests: 20m
diff --git a/roles/kubernetes-apps/ansible/tasks/main.yml b/roles/kubernetes-apps/ansible/tasks/main.yml
index 75ee477b0..d59f0e0b6 100644
--- a/roles/kubernetes-apps/ansible/tasks/main.yml
+++ b/roles/kubernetes-apps/ansible/tasks/main.yml
@@ -48,6 +48,7 @@
     - "{{ coredns_manifests.results | default({}) }}"
     - "{{ coredns_secondary_manifests.results | default({}) }}"
     - "{{ nodelocaldns_manifests.results | default({}) }}"
+    - "{{ nodelocaldns_second_manifests.results | default({}) }}"
   when:
     - dns_mode != 'none'
     - inventory_hostname == groups['kube_control_plane'][0]
diff --git a/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml b/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml
index ce79ceed4..4809aa9b8 100644
--- a/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml
+++ b/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml
@@ -43,3 +43,31 @@
   tags:
     - nodelocaldns
     - coredns
+
+- name: Kubernetes Apps | Lay Down nodelocaldns-secondary Template
+  template:  # renders the secondary DaemonSet manifest into kube_config_dir
+    src: "{{ item.file }}.j2"
+    dest: "{{ kube_config_dir }}/{{ item.file }}"
+  with_items:
+    - { name: nodelocaldns, file: nodelocaldns-second-daemonset.yml, type: daemonset }
+  register: nodelocaldns_second_manifests  # results are appended to the resource list in tasks/main.yml
+  vars:  # NOTE(review): nodelocaldns-second-daemonset.yml.j2 references neither variable below - confirm they are needed
+    forwardTarget: >-
+      {%- if secondaryclusterIP is defined and dns_mode == 'coredns_dual' -%}
+      {{ primaryClusterIP }} {{ secondaryclusterIP }}
+      {%- else -%}
+      {{ primaryClusterIP }}
+      {%- endif -%}
+    upstreamForwardTarget: >-
+      {%- if resolvconf_mode == 'host_resolvconf' and upstream_dns_servers is defined and upstream_dns_servers|length > 0 -%}
+      {{ upstream_dns_servers|join(' ') }}
+      {%- else -%}
+      /etc/resolv.conf
+      {%- endif -%}
+  when:  # all three conditions must hold
+    - enable_nodelocaldns
+    - enable_nodelocaldns_secondary  # the secondary is opt-in
+    - inventory_hostname == groups['kube_control_plane'] | first  # render once, on the first control-plane host
+  tags:
+    - nodelocaldns
+    - coredns
diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2
index 18abf8ea3..0244c04a4 100644
--- a/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2
@@ -17,7 +17,7 @@ data:
         loop
         bind {{ nodelocaldns_ip }}
         forward . {{ block['nameservers'] | join(' ') }}
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
         log
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
@@ -39,7 +39,7 @@ data:
         forward . {{ forwardTarget }} {
             force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
         health {{ nodelocaldns_ip }}:{{ nodelocaldns_health_port }}
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
@@ -56,7 +56,7 @@ data:
         forward . {{ forwardTarget }} {
             force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
     }
     ip6.arpa:53 {
         errors
@@ -67,7 +67,7 @@ data:
         forward . {{ forwardTarget }} {
             force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
     }
     .:53 {
         errors
@@ -76,13 +76,91 @@ data:
         loop
         bind {{ nodelocaldns_ip }}
         forward . {{ upstreamForwardTarget }}
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
           fallthrough
         }
 {% endif %}
     }
+{% if enable_nodelocaldns_secondary %}
+  Corefile-second: |
+{% if nodelocaldns_external_zones is defined and nodelocaldns_external_zones|length > 0 %}
+{% for block in nodelocaldns_external_zones %}
+    {{ block['zones'] | join(' ') }} {
+        errors
+        cache {{ block['cache'] | default(30) }}
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ block['nameservers'] | join(' ') }}
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+        log
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+{% endfor %}
+{% endif %}
+    {{ dns_domain }}:53 {
+        errors
+        cache {
+            success 9984 30
+            denial 9984 5
+        }
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+            force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+        health {{ nodelocaldns_ip }}:{{ nodelocaldns_second_health_port }}
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+    in-addr.arpa:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+            force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+    }
+    ip6.arpa:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+            force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+    }
+    .:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ upstreamForwardTarget }}
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+{% endif %}
 {% if dns_etchosts | default(None) %}
   hosts: |
     {{ dns_etchosts | indent(width=4, indentfirst=None) }}
diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
index 7abd28ffa..7c63e28fa 100644
--- a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2
@@ -16,7 +16,7 @@ spec:
         k8s-app: nodelocaldns
       annotations:
         prometheus.io/scrape: 'true'
-        prometheus.io/port: '9253'
+        prometheus.io/port: '{{ nodelocaldns_prometheus_port }}'
     spec:
       nodeSelector:
         {{ nodelocaldns_ds_nodeselector }}
@@ -38,16 +38,16 @@ spec:
           requests:
             cpu: {{ nodelocaldns_cpu_requests }}
             memory: {{ nodelocaldns_memory_requests }}
-        args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns" ]
-        securityContext:
-          privileged: true
-{% if nodelocaldns_bind_metrics_host_ip %}
-        env:
-          - name: MY_HOST_IP
-            valueFrom:
-              fieldRef:
-                fieldPath: status.hostIP
-{% endif %}
+        args:
+        - -localip
+        - {{ nodelocaldns_ip }}
+        - -conf
+        - /etc/coredns/Corefile
+        - -upstreamsvc
+        - coredns
+{% if enable_nodelocaldns_secondary %}
+        - -skipteardown
+{% else %}
         ports:
         - containerPort: 53
           name: dns
@@ -58,6 +58,16 @@ spec:
         - containerPort: 9253
           name: metrics
           protocol: TCP
+{% endif %}
+        securityContext:
+          privileged: true
+{% if nodelocaldns_bind_metrics_host_ip %}
+        env:
+          - name: MY_HOST_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+{% endif %}
         livenessProbe:
           httpGet:
             host: {{ nodelocaldns_ip }}
diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2
new file mode 100644
index 000000000..037bf446e
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2
@@ -0,0 +1,103 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nodelocaldns-second
+  namespace: kube-system
+  labels:
+    k8s-app: kube-dns
+    addonmanager.kubernetes.io/mode: Reconcile
+spec:
+  selector:
+    matchLabels:
+      k8s-app: nodelocaldns-second
+  template:
+    metadata:
+      labels:
+        k8s-app: nodelocaldns-second
+      annotations:
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '{{ nodelocaldns_secondary_prometheus_port }}'
+    spec:
+      nodeSelector:
+        {{ nodelocaldns_ds_nodeselector }}
+      priorityClassName: system-cluster-critical
+      serviceAccountName: nodelocaldns
+      hostNetwork: true
+      dnsPolicy: Default  # Don't use cluster DNS.
+      tolerations:
+      - effect: NoSchedule
+        operator: "Exists"
+      - effect: NoExecute
+        operator: "Exists"
+      containers:
+      - name: node-cache
+        image: "{{ nodelocaldns_image_repo }}:{{ nodelocaldns_image_tag }}"
+        resources:
+          limits:
+            memory: {{ nodelocaldns_memory_limit }}
+          requests:
+            cpu: {{ nodelocaldns_cpu_requests }}
+            memory: {{ nodelocaldns_memory_requests }}
+        args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns", "-skipteardown" ]
+        securityContext:
+          privileged: true
+{% if nodelocaldns_bind_metrics_host_ip %}
+        env:
+          - name: MY_HOST_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+{% endif %}
+        livenessProbe:  # must query this pod's own health endpoint, not the primary's
+          httpGet:
+            host: {{ nodelocaldns_ip }}
+            path: /health
+            port: {{ nodelocaldns_second_health_port }}  # health port declared in Corefile-second
+            scheme: HTTP
+          timeoutSeconds: 5
+          successThreshold: 1
+          failureThreshold: 10
+        readinessProbe:  # same endpoint as the liveness probe above
+          httpGet:
+            host: {{ nodelocaldns_ip }}
+            path: /health
+            port: {{ nodelocaldns_second_health_port }}  # nodelocaldns_health_port belongs to the primary Corefile
+            scheme: HTTP
+          timeoutSeconds: 5
+          successThreshold: 1
+          failureThreshold: 10
+        volumeMounts:
+        - name: config-volume
+          mountPath: /etc/coredns
+        - name: xtables-lock
+          mountPath: /run/xtables.lock
+        lifecycle:
+          preStop:
+            exec:
+              command:
+                - sh
+                - -c
+                - sleep {{ nodelocaldns_secondary_skew_seconds }} && kill -9 1
+      volumes:
+        - name: config-volume
+          configMap:
+            name: nodelocaldns
+            items:
+            - key: Corefile-second
+              path: Corefile
+{% if dns_etchosts | default(None) %}
+            - key: hosts
+              path: hosts
+{% endif %}
+        - name: xtables-lock
+          hostPath:
+            path: /run/xtables.lock
+            type: FileOrCreate
+      # Implement a time skew between the main nodelocaldns and this secondary.
+      # Since the two nodelocaldns instances share the :53 port, we want to keep
+      # at least one running at any time even if the manifests are replaced simultaneously
+      terminationGracePeriodSeconds: {{ nodelocaldns_secondary_skew_seconds }}
+  updateStrategy:
+    rollingUpdate:
+      maxUnavailable: {{ serial | default('20%') }}
+    type: RollingUpdate
diff --git a/roles/kubespray-defaults/defaults/main.yaml b/roles/kubespray-defaults/defaults/main.yaml
index 99aec470e..488e1ae5b 100644
--- a/roles/kubespray-defaults/defaults/main.yaml
+++ b/roles/kubespray-defaults/defaults/main.yaml
@@ -93,9 +93,12 @@ dns_mode: coredns
 
 # Enable nodelocal dns cache
 enable_nodelocaldns: true
+enable_nodelocaldns_secondary: false
 nodelocaldns_ip: 169.254.25.10
 nodelocaldns_health_port: 9254
+nodelocaldns_second_health_port: 9256
 nodelocaldns_bind_metrics_host_ip: false
+nodelocaldns_secondary_skew_seconds: 5
 
 # Should be set to a cluster IP if using a custom cluster DNS
 manual_dns_server: ""
diff --git a/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml b/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml
new file mode 100644
index 000000000..600ce6017
--- /dev/null
+++ b/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml
@@ -0,0 +1,15 @@
+---
+# Instance settings
+cloud_image: centos-8
+mode: default
+vm_memory: 3072Mi
+
+# Kubespray settings
+kube_network_plugin: calico
+deploy_netchecker: true
+dns_min_replicas: 1
+enable_nodelocaldns_secondary: true
+loadbalancer_apiserver_type: haproxy
+
+# required
+calico_iptables_backend: "Auto"
-- 
GitLab