From b5542465020288b85f546e6079957e27139ff8b0 Mon Sep 17 00:00:00 2001
From: Mac Chaffee <me@macchaffee.com>
Date: Sat, 26 Feb 2022 13:29:23 -0500
Subject: [PATCH] Fix host DNS config 1) being edited too soon and 2) not
 working with NM (#8575)

Signed-off-by: Mac Chaffee <me@macchaffee.com>
---
 cluster.yml                                   |  3 +-
 roles/kubernetes/preinstall/defaults/main.yml |  1 +
 roles/kubernetes/preinstall/handlers/main.yml |  3 +-
 .../preinstall/tasks/0040-set_facts.yml       | 29 +++++++++++++++----
 .../preinstall/tasks/0060-resolvconf.yml      |  5 +++-
 .../0062-networkmanager-unmanaged-devices.yml | 11 -------
 .../tasks/0063-networkmanager-dns.yml         |  8 ++---
 roles/kubernetes/preinstall/tasks/main.yml    |  6 ++--
 scale.yml                                     |  5 ++--
 upgrade-cluster.yml                           |  5 ++--
 10 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/cluster.yml b/cluster.yml
index e13575e9c..cc169f80b 100644
--- a/cluster.yml
+++ b/cluster.yml
@@ -118,7 +118,8 @@
     - { role: kubernetes-apps/external_provisioner, tags: external-provisioner }
     - { role: kubernetes-apps, tags: apps }
 
-- hosts: k8s_cluster
+- name: Apply resolv.conf changes now that cluster DNS is up
+  hosts: k8s_cluster
   gather_facts: False
   any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
   environment: "{{ proxy_disable_env }}"
diff --git a/roles/kubernetes/preinstall/defaults/main.yml b/roles/kubernetes/preinstall/defaults/main.yml
index 10f22cb5b..fc17b79d4 100644
--- a/roles/kubernetes/preinstall/defaults/main.yml
+++ b/roles/kubernetes/preinstall/defaults/main.yml
@@ -3,6 +3,7 @@
 ignore_assert_errors: false
 
 epel_enabled: false
+# Kubespray sets this to true after clusterDNS is running to apply changes to the host resolv.conf
 dns_late: false
 
 common_required_pkgs:
diff --git a/roles/kubernetes/preinstall/handlers/main.yml b/roles/kubernetes/preinstall/handlers/main.yml
index 667465b6f..078833186 100644
--- a/roles/kubernetes/preinstall/handlers/main.yml
+++ b/roles/kubernetes/preinstall/handlers/main.yml
@@ -23,12 +23,11 @@
   command: /usr/bin/coreos-cloudinit --from-file {{ resolveconf_cloud_init_conf }}
   when: ansible_os_family in ["Flatcar", "Flatcar Container Linux by Kinvolk"]
 
-- name: Preinstall | update resolvconf for Fedora CoreOS
+- name: Preinstall | update resolvconf for networkmanager
   command: /bin/true
   notify:
     - Preinstall | reload NetworkManager
     - Preinstall | reload kubelet
-  when: is_fedora_coreos
 
 - name: Preinstall | reload NetworkManager
   service:
diff --git a/roles/kubernetes/preinstall/tasks/0040-set_facts.yml b/roles/kubernetes/preinstall/tasks/0040-set_facts.yml
index fce7c485c..6c5ba4b89 100644
--- a/roles/kubernetes/preinstall/tasks/0040-set_facts.yml
+++ b/roles/kubernetes/preinstall/tasks/0040-set_facts.yml
@@ -67,6 +67,14 @@
 
   when: resolvconf_stat.stat.exists is defined and resolvconf_stat.stat.exists
 
+- name: NetworkManager | Check if host has NetworkManager
+  # noqa 303 Should we use service_facts for this?
+  command: systemctl is-active --quiet NetworkManager.service
+  register: networkmanager_enabled
+  failed_when: false
+  changed_when: false
+  check_mode: false
+
 - name: check systemd-resolved
   # noqa 303 Should we use service_facts for this?
   command: systemctl is-active systemd-resolved
@@ -98,8 +106,7 @@
 
 - name: check if early DNS configuration stage
   set_fact:
-    dns_early: >-
-      {%- if kubelet_configured.stat.exists -%}false{%- else -%}true{%- endif -%}
+    dns_early: "{{ not kubelet_configured.stat.exists }}"
 
 - name: target resolv.conf files
   set_fact:
@@ -177,12 +184,24 @@
         {{ upstream_dns_servers|default([]) }}
       {%- endif -%}
 
-- name: generate nameservers to resolvconf
+# This task should only run after cluster/nodelocal DNS is up, otherwise all DNS lookups will timeout
+- name: generate nameservers for resolvconf, including cluster DNS
   set_fact:
-    nameserverentries:
-      nameserver {{ ( ( [nodelocaldns_ip] if enable_nodelocaldns else []) + coredns_server|d([]) + nameservers|d([]) + cloud_resolver|d([]) + configured_nameservers|d([])) | unique | join(',nameserver ') }}
+    nameserverentries: |-
+      {{ ( ( [nodelocaldns_ip] if enable_nodelocaldns else []) + coredns_server|d([]) + nameservers|d([]) + cloud_resolver|d([]) + configured_nameservers|d([])) | unique | join(',') }}
     supersede_nameserver:
       supersede domain-name-servers {{ ( coredns_server|d([]) + nameservers|d([]) + cloud_resolver|d([])) | unique | join(', ') }};
+  when: not dns_early or dns_late
+
+# This task should run instead of the above task when cluster/nodelocal DNS hasn't
+# been deployed yet (like scale.yml/cluster.yml) or when it's down (reset.yml)
+- name: generate nameservers for resolvconf, not including cluster DNS
+  set_fact:
+    nameserverentries: |-
+      {{ ( nameservers|d([]) + cloud_resolver|d([]) + configured_nameservers|d([])) | unique | join(',') }}
+    supersede_nameserver:
+      supersede domain-name-servers {{ ( nameservers|d([]) + cloud_resolver|d([])) | unique | join(', ') }};
+  when: dns_early and not dns_late
 
 - name: gather os specific variables
   include_vars: "{{ item }}"
diff --git a/roles/kubernetes/preinstall/tasks/0060-resolvconf.yml b/roles/kubernetes/preinstall/tasks/0060-resolvconf.yml
index 65b55d7fb..2759b53e1 100644
--- a/roles/kubernetes/preinstall/tasks/0060-resolvconf.yml
+++ b/roles/kubernetes/preinstall/tasks/0060-resolvconf.yml
@@ -7,9 +7,12 @@
   blockinfile:
     path: "{{ resolvconffile }}"
     block: |-
-      {% for item in [domainentry] + [searchentries] + nameserverentries.split(',') -%}
+      {% for item in [domainentry] + [searchentries] -%}
       {{ item }}
       {% endfor %}
+      {% for item in nameserverentries.split(',') %}
+      nameserver {{ item }}
+      {% endfor %}
       options ndots:{{ ndots }}
       options timeout:2
       options attempts:2
diff --git a/roles/kubernetes/preinstall/tasks/0062-networkmanager-unmanaged-devices.yml b/roles/kubernetes/preinstall/tasks/0062-networkmanager-unmanaged-devices.yml
index 4afde1fdb..1cd56d496 100644
--- a/roles/kubernetes/preinstall/tasks/0062-networkmanager-unmanaged-devices.yml
+++ b/roles/kubernetes/preinstall/tasks/0062-networkmanager-unmanaged-devices.yml
@@ -1,18 +1,9 @@
 ---
-- name: NetworkManager | Check if host has NetworkManager
-  # noqa 303 Should we use service_facts for this?
-  command: systemctl is-active --quiet NetworkManager.service
-  register: nm_check
-  failed_when: false
-  changed_when: false
-  check_mode: false
-
 - name: NetworkManager | Ensure NetworkManager conf.d dir
   file:
     path: "/etc/NetworkManager/conf.d"
     state: directory
     recurse: yes
-  when: nm_check.rc == 0
 
 - name: NetworkManager | Prevent NetworkManager from managing Calico interfaces (cali*/tunl*/vxlan.calico)
   copy:
@@ -22,7 +13,6 @@
     dest: /etc/NetworkManager/conf.d/calico.conf
     mode: 0644
   when:
-    - nm_check.rc == 0
     - kube_network_plugin == "calico"
   notify: Preinstall | reload NetworkManager
 
@@ -35,5 +25,4 @@
       unmanaged-devices+=interface-name:kube-ipvs0;interface-name:nodelocaldns
     dest: /etc/NetworkManager/conf.d/k8s.conf
     mode: 0644
-  when: nm_check.rc == 0
   notify: Preinstall | reload NetworkManager
diff --git a/roles/kubernetes/preinstall/tasks/0063-networkmanager-dns.yml b/roles/kubernetes/preinstall/tasks/0063-networkmanager-dns.yml
index 50f933a2e..851f236ac 100644
--- a/roles/kubernetes/preinstall/tasks/0063-networkmanager-dns.yml
+++ b/roles/kubernetes/preinstall/tasks/0063-networkmanager-dns.yml
@@ -4,10 +4,10 @@
     path: /etc/NetworkManager/conf.d/dns.conf
     section: global-dns-domain-*
     option: servers
-    value: "{{ ( coredns_server + nameservers|d([]) + cloud_resolver|d([])) | unique | join(',') }}"
+    value: "{{ nameserverentries }}"
     mode: '0600'
     backup: yes
-  notify: Preinstall | update resolvconf for Fedora CoreOS
+  notify: Preinstall | update resolvconf for networkmanager
 
 - name: NetworkManager | Add DNS search to NM configuration
   ini_file:
@@ -17,7 +17,7 @@
     value: "{{ ([ 'default.svc.' + dns_domain, 'svc.' + dns_domain ] + searchdomains|default([])) | join(',') }}"
     mode: '0600'
     backup: yes
-  notify: Preinstall | update resolvconf for Fedora CoreOS
+  notify: Preinstall | update resolvconf for networkmanager
 
 - name: NetworkManager | Add DNS options to NM configuration
   ini_file:
@@ -27,4 +27,4 @@
     value: "ndots:{{ ndots }};timeout:2;attempts:2;"
     mode: '0600'
     backup: yes
-  notify: Preinstall | update resolvconf for Fedora CoreOS
+  notify: Preinstall | update resolvconf for networkmanager
diff --git a/roles/kubernetes/preinstall/tasks/main.yml b/roles/kubernetes/preinstall/tasks/main.yml
index 1e3383cde..2429cd1ad 100644
--- a/roles/kubernetes/preinstall/tasks/main.yml
+++ b/roles/kubernetes/preinstall/tasks/main.yml
@@ -25,7 +25,7 @@
     - dns_mode != 'none'
     - resolvconf_mode == 'host_resolvconf'
     - systemd_resolved_enabled.rc != 0
-    - not is_fedora_coreos
+    - networkmanager_enabled.rc != 0
   tags:
     - bootstrap-os
     - resolvconf
@@ -40,6 +40,8 @@
     - resolvconf
 
 - import_tasks: 0062-networkmanager-unmanaged-devices.yml
+  when:
+    - networkmanager_enabled.rc == 0
   tags:
     - bootstrap-os
 
@@ -47,7 +49,7 @@
   when:
     - dns_mode != 'none'
     - resolvconf_mode == 'host_resolvconf'
-    - is_fedora_coreos
+    - networkmanager_enabled.rc == 0
   tags:
     - bootstrap-os
     - resolvconf
diff --git a/scale.yml b/scale.yml
index de1d9b5fe..089201ef4 100644
--- a/scale.yml
+++ b/scale.yml
@@ -99,10 +99,11 @@
     - { role: kubernetes/node-label, tags: node-label }
     - { role: network_plugin, tags: network }
 
-- hosts: k8s_cluster
+- name: Apply resolv.conf changes now that cluster DNS is up
+  hosts: k8s_cluster
   gather_facts: False
   any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
   environment: "{{ proxy_disable_env }}"
   roles:
     - { role: kubespray-defaults }
-    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
+    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf, dns_late: true }
diff --git a/upgrade-cluster.yml b/upgrade-cluster.yml
index 010d64271..c4e1fa4c6 100644
--- a/upgrade-cluster.yml
+++ b/upgrade-cluster.yml
@@ -155,10 +155,11 @@
     - { role: kubespray-defaults }
     - { role: kubernetes-apps, tags: apps }
 
-- hosts: k8s_cluster
+- name: Apply resolv.conf changes now that cluster DNS is up
+  hosts: k8s_cluster
   gather_facts: False
   any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
   environment: "{{ proxy_disable_env }}"
   roles:
     - { role: kubespray-defaults }
-    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
+    - { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf, dns_late: true }
-- 
GitLab