From 68718dcb6f00a93f7f9177c41ef7002acf5d686d Mon Sep 17 00:00:00 2001
From: Max Gautier <mg@max.gautier.name>
Date: Fri, 15 Nov 2024 07:34:52 +0100
Subject: [PATCH] Stricter kubeadm validation (config and runtime checks)
 (#11710)

* kubeadm: do not ignore preflight errors blindly

The "ignoring all errors" seems to date back to the inception of the
kubeadm support (it was --skip-preflight-check before).

This can mask real errors and prevent users from seeing them.

Do not ignore any errors by default and make the set of ignored errors
configurable.

* download/kubeadm: remove redundant task

The mode is already set by the previous `copy` task.

* Validate kubeadm configs

This should help to fail early when we have invalid kubeadm configs (from
a kubespray bug or a misconfiguration).

* kubeadm-upgrade: remove unnecessary bool cast

* Convert kubeadm join discovery timeout to v1beta4 config

* CI: Ignore kubeadm:Mem errors on some setup.
---
 roles/download/tasks/prep_kubeadm_images.yml  | 21 ++++-----
 .../control-plane/tasks/kubeadm-secondary.yml |  3 +-
 .../control-plane/tasks/kubeadm-setup.yml     |  3 +-
 .../control-plane/tasks/kubeadm-upgrade.yml   |  8 ++--
 .../templates/kubeadm-controlplane.yaml.j2    |  8 +++-
 .../kubeadm/tasks/kubeadm_etcd_node.yml       |  1 +
 roles/kubernetes/kubeadm/tasks/main.yml       | 43 +++++--------------
 .../kubeadm/templates/kubeadm-client.conf.j2  |  8 +++-
 .../kubeadm_common/defaults/main.yml          |  3 ++
 .../packet_ubuntu22-calico-all-in-one.yml     |  2 +
 .../packet_ubuntu24-calico-etcd-datastore.yml |  2 +
 11 files changed, 49 insertions(+), 53 deletions(-)

diff --git a/roles/download/tasks/prep_kubeadm_images.yml b/roles/download/tasks/prep_kubeadm_images.yml
index c1e8c6cdc..8725393ed 100644
--- a/roles/download/tasks/prep_kubeadm_images.yml
+++ b/roles/download/tasks/prep_kubeadm_images.yml
@@ -7,14 +7,6 @@
     - not skip_downloads | default(false)
     - downloads.kubeadm.enabled
 
-- name: Prep_kubeadm_images | Create kubeadm config
-  template:
-    src: "kubeadm-images.yaml.j2"
-    dest: "{{ kube_config_dir }}/kubeadm-images.yaml"
-    mode: "0644"
-  when:
-    - not skip_kubeadm_images | default(false)
-
 - name: Prep_kubeadm_images | Copy kubeadm binary from download dir to system path
   copy:
     src: "{{ downloads.kubeadm.dest }}"
@@ -22,11 +14,14 @@
     mode: "0755"
     remote_src: true
 
-- name: Prep_kubeadm_images | Set kubeadm binary permissions
-  file:
-    path: "{{ bin_dir }}/kubeadm"
-    mode: "0755"
-    state: file
+- name: Prep_kubeadm_images | Create kubeadm config
+  template:
+    src: "kubeadm-images.yaml.j2"
+    dest: "{{ kube_config_dir }}/kubeadm-images.yaml"
+    mode: "0644"
+    validate: "{{ bin_dir }}/kubeadm config validate --config %s"
+  when:
+    - not skip_kubeadm_images | default(false)
 
 - name: Prep_kubeadm_images | Generate list of required images
   shell: "set -o pipefail && {{ bin_dir }}/kubeadm config images list --config={{ kube_config_dir }}/kubeadm-images.yaml | grep -Ev 'coredns|pause'"
diff --git a/roles/kubernetes/control-plane/tasks/kubeadm-secondary.yml b/roles/kubernetes/control-plane/tasks/kubeadm-secondary.yml
index 58a229816..6dbf4df1b 100644
--- a/roles/kubernetes/control-plane/tasks/kubeadm-secondary.yml
+++ b/roles/kubernetes/control-plane/tasks/kubeadm-secondary.yml
@@ -36,6 +36,7 @@
     dest: "{{ kube_config_dir }}/kubeadm-controlplane.yaml"
     mode: "0640"
     backup: true
+    validate: "{{ bin_dir }}/kubeadm config validate --config %s"
   when:
     - inventory_hostname != first_kube_control_plane
     - not kubeadm_already_run.stat.exists
@@ -87,7 +88,7 @@
   command: >-
     {{ bin_dir }}/kubeadm join
     --config {{ kube_config_dir }}/kubeadm-controlplane.yaml
-    --ignore-preflight-errors=all
+    --ignore-preflight-errors={{ kubeadm_ignore_preflight_errors | join(',') }}
     --skip-phases={{ kubeadm_join_phases_skip | join(',') }}
   environment:
     PATH: "{{ bin_dir }}:{{ ansible_env.PATH }}"
diff --git a/roles/kubernetes/control-plane/tasks/kubeadm-setup.yml b/roles/kubernetes/control-plane/tasks/kubeadm-setup.yml
index 5e8a677e5..ae7b7506f 100644
--- a/roles/kubernetes/control-plane/tasks/kubeadm-setup.yml
+++ b/roles/kubernetes/control-plane/tasks/kubeadm-setup.yml
@@ -93,6 +93,7 @@
     src: "kubeadm-config.{{ kubeadm_config_api_version }}.yaml.j2"
     dest: "{{ kube_config_dir }}/kubeadm-config.yaml"
     mode: "0640"
+    validate: "{{ bin_dir }}/kubeadm config validate --config %s"
 
 - name: Kubeadm | Create directory to store admission control configurations
   file:
@@ -168,7 +169,7 @@
     timeout -k {{ kubeadm_init_timeout }} {{ kubeadm_init_timeout }}
     {{ bin_dir }}/kubeadm init
     --config={{ kube_config_dir }}/kubeadm-config.yaml
-    --ignore-preflight-errors=all
+    --ignore-preflight-errors={{ kubeadm_ignore_preflight_errors | join(',') }}
     --skip-phases={{ kubeadm_init_phases_skip | join(',') }}
     {{ kube_external_ca_mode | ternary('', '--upload-certs') }}
   register: kubeadm_init
diff --git a/roles/kubernetes/control-plane/tasks/kubeadm-upgrade.yml b/roles/kubernetes/control-plane/tasks/kubeadm-upgrade.yml
index c9dbabd44..d8afd69dc 100644
--- a/roles/kubernetes/control-plane/tasks/kubeadm-upgrade.yml
+++ b/roles/kubernetes/control-plane/tasks/kubeadm-upgrade.yml
@@ -15,9 +15,9 @@
     {{ bin_dir }}/kubeadm
     upgrade apply -y {{ kube_version }}
     --certificate-renewal={{ kubeadm_upgrade_auto_cert_renewal }}
-    --ignore-preflight-errors=all
+    --ignore-preflight-errors={{ kubeadm_ignore_preflight_errors | join(',') }}
     --allow-experimental-upgrades
-    --etcd-upgrade={{ (etcd_deployment_type == "kubeadm") | bool | lower }}
+    --etcd-upgrade={{ (etcd_deployment_type == "kubeadm") | lower }}
     {% if kubeadm_patches | length > 0 %}--patches={{ kubeadm_patches_dir }}{% endif %}
     --force
   register: kubeadm_upgrade
@@ -36,9 +36,9 @@
     {{ bin_dir }}/kubeadm
     upgrade apply -y {{ kube_version }}
     --certificate-renewal={{ kubeadm_upgrade_auto_cert_renewal }}
-    --ignore-preflight-errors=all
+    --ignore-preflight-errors={{ kubeadm_ignore_preflight_errors | join(',') }}
     --allow-experimental-upgrades
-    --etcd-upgrade={{ (etcd_deployment_type == "kubeadm") | bool | lower }}
+    --etcd-upgrade={{ (etcd_deployment_type == "kubeadm") | lower }}
     {% if kubeadm_patches | length > 0 %}--patches={{ kubeadm_patches_dir }}{% endif %}
     --force
   register: kubeadm_upgrade
diff --git a/roles/kubernetes/control-plane/templates/kubeadm-controlplane.yaml.j2 b/roles/kubernetes/control-plane/templates/kubeadm-controlplane.yaml.j2
index 24a6c23c0..d057256fc 100644
--- a/roles/kubernetes/control-plane/templates/kubeadm-controlplane.yaml.j2
+++ b/roles/kubernetes/control-plane/templates/kubeadm-controlplane.yaml.j2
@@ -14,8 +14,14 @@ discovery:
     token: {{ kubeadm_token }}
     unsafeSkipCAVerification: true
 {% endif %}
-  timeout: {{ discovery_timeout }}
   tlsBootstrapToken: {{ kubeadm_token }}
+{# TODO: drop the if when we drop support for k8s<1.31 #}
+{% if kubeadm_config_api_version == 'v1beta3' %}
+  timeout: {{ discovery_timeout }}
+{% else %}
+timeouts:
+  discovery: {{ discovery_timeout }}
+{% endif %}
 controlPlane:
   localAPIEndpoint:
     advertiseAddress: {{ kube_apiserver_address }}
diff --git a/roles/kubernetes/kubeadm/tasks/kubeadm_etcd_node.yml b/roles/kubernetes/kubeadm/tasks/kubeadm_etcd_node.yml
index 38fd8378b..2805e192e 100644
--- a/roles/kubernetes/kubeadm/tasks/kubeadm_etcd_node.yml
+++ b/roles/kubernetes/kubeadm/tasks/kubeadm_etcd_node.yml
@@ -9,6 +9,7 @@
     src: "kubeadm-client.conf.j2"
     dest: "{{ kube_config_dir }}/kubeadm-cert-controlplane.conf"
     mode: "0640"
+    validate: "{{ bin_dir }}/kubeadm config validate --config %s"
   vars:
     kubeadm_cert_controlplane: true
 
diff --git a/roles/kubernetes/kubeadm/tasks/main.yml b/roles/kubernetes/kubeadm/tasks/main.yml
index 6b575df0f..bc69d7824 100644
--- a/roles/kubernetes/kubeadm/tasks/main.yml
+++ b/roles/kubernetes/kubeadm/tasks/main.yml
@@ -77,6 +77,7 @@
     dest: "{{ kube_config_dir }}/kubeadm-client.conf"
     backup: true
     mode: "0640"
+    validate: "{{ bin_dir }}/kubeadm config validate --config %s"
   when: ('kube_control_plane' not in group_names)
 
 - name: Join to cluster if needed
@@ -85,38 +86,16 @@
   when:
     - ('kube_control_plane' not in group_names)
     - not kubelet_conf.stat.exists
-  block:
-
-    - name: Join to cluster
-      command: >-
-        timeout -k {{ kubeadm_join_timeout }} {{ kubeadm_join_timeout }}
-        {{ bin_dir }}/kubeadm join
-        --config {{ kube_config_dir }}/kubeadm-client.conf
-        --ignore-preflight-errors=DirAvailable--etc-kubernetes-manifests
-        --skip-phases={{ kubeadm_join_phases_skip | join(',') }}
-      register: kubeadm_join
-      changed_when: kubeadm_join is success
-
-  rescue:
-
-    - name: Join to cluster with ignores
-      command: >-
-        timeout -k {{ kubeadm_join_timeout }} {{ kubeadm_join_timeout }}
-        {{ bin_dir }}/kubeadm join
-        --config {{ kube_config_dir }}/kubeadm-client.conf
-        --ignore-preflight-errors=all
-        --skip-phases={{ kubeadm_join_phases_skip | join(',') }}
-      register: kubeadm_join
-      changed_when: kubeadm_join is success
-
-  always:
-
-    - name: Display kubeadm join stderr if any
-      when: kubeadm_join is failed
-      debug:
-        msg: |
-          Joined with warnings
-          {{ kubeadm_join.stderr_lines }}
+  vars:
+    ignored:
+      - DirAvailable--etc-kubernetes-manifests
+      - "{{ kubeadm_ignore_preflight_errors }}"
+  command: >-
+    timeout -k {{ kubeadm_join_timeout }} {{ kubeadm_join_timeout }}
+    {{ bin_dir }}/kubeadm join
+    --config {{ kube_config_dir }}/kubeadm-client.conf
+    --ignore-preflight-errors={{ ignored | flatten | join(',') }}
+    --skip-phases={{ kubeadm_join_phases_skip | join(',') }}
 
 - name: Update server field in kubelet kubeconfig
   lineinfile:
diff --git a/roles/kubernetes/kubeadm/templates/kubeadm-client.conf.j2 b/roles/kubernetes/kubeadm/templates/kubeadm-client.conf.j2
index a4e014ca7..3735936f9 100644
--- a/roles/kubernetes/kubeadm/templates/kubeadm-client.conf.j2
+++ b/roles/kubernetes/kubeadm/templates/kubeadm-client.conf.j2
@@ -20,8 +20,14 @@ discovery:
     unsafeSkipCAVerification: true
 {% endif %}
 {% endif %}
-  timeout: {{ discovery_timeout }}
   tlsBootstrapToken: {{ kubeadm_token }}
+{# TODO: drop the if when we drop support for k8s<1.31 #}
+{% if kubeadm_config_api_version == 'v1beta3' %}
+  timeout: {{ discovery_timeout }}
+{% else %}
+timeouts:
+  discovery: {{ discovery_timeout }}
+{% endif %}
 caCertPath: {{ kube_cert_dir }}/ca.crt
 {% if kubeadm_cert_controlplane is defined and kubeadm_cert_controlplane %}
 controlPlane:
diff --git a/roles/kubernetes/kubeadm_common/defaults/main.yml b/roles/kubernetes/kubeadm_common/defaults/main.yml
index acbcdcf5f..7051f730c 100644
--- a/roles/kubernetes/kubeadm_common/defaults/main.yml
+++ b/roles/kubernetes/kubeadm_common/defaults/main.yml
@@ -18,3 +18,6 @@ kubeadm_patches: []
 #        example.com/prod_level: "{{ prod_level }}"
 # - ...
 # Patches are applied in the order they are specified.
+
+# List of errors to ignore during kubeadm preflight checks
+kubeadm_ignore_preflight_errors: []
diff --git a/tests/files/packet_ubuntu22-calico-all-in-one.yml b/tests/files/packet_ubuntu22-calico-all-in-one.yml
index 615530107..5d3db3794 100644
--- a/tests/files/packet_ubuntu22-calico-all-in-one.yml
+++ b/tests/files/packet_ubuntu22-calico-all-in-one.yml
@@ -6,6 +6,8 @@ vm_memory: 1600
 
 # Kubespray settings
 auto_renew_certificates: true
+kubeadm_ignore_preflight_errors:
+  - Mem
 
 # Currently ipvs not available on KVM: https://packages.ubuntu.com/search?suite=focal&arch=amd64&mode=exactfilename&searchon=contents&keywords=ip_vs_sh.ko
 kube_proxy_mode: iptables
diff --git a/tests/files/packet_ubuntu24-calico-etcd-datastore.yml b/tests/files/packet_ubuntu24-calico-etcd-datastore.yml
index 7a27ce566..f4a5dada0 100644
--- a/tests/files/packet_ubuntu24-calico-etcd-datastore.yml
+++ b/tests/files/packet_ubuntu24-calico-etcd-datastore.yml
@@ -6,6 +6,8 @@ vm_memory: 1600
 
 # Kubespray settings
 auto_renew_certificates: true
+kubeadm_ignore_preflight_errors:
+  - Mem
 
 # Currently ipvs not available on KVM: https://packages.ubuntu.com/search?suite=noble&arch=amd64&mode=exactfilename&searchon=contents&keywords=ip_vs_sh.ko
 kube_proxy_mode: iptables
-- 
GitLab