From c1bc4615fe67b49ade9abbb91b08dff522542634 Mon Sep 17 00:00:00 2001
From: "Christopher J. Ruwe" <cjr@cruwe.de>
Date: Tue, 15 May 2018 14:34:03 +0000
Subject: [PATCH] assert that number of pods on node does not exceed CIDR
 address range

The maximum number of pods on a given node is set by the --max-pods=k
kubelet flag. When the node's pod address space is exhausted, no more pods
can be scheduled, even if the node still has capacity from the --max-pods
perspective.

The special case that a pod is scheduled and uses the node IP in the
host network namespace is too "soft" to derive a guarantee.

Comparing kubelet_max_pods with kube_network_node_prefix, when the latter is
defined, makes it possible to assert that the pod limit fits the CIDR range.
---
 roles/kubernetes/node/defaults/main.yml            |  4 ++++
 .../node/templates/kubelet.kubeadm.env.j2          |  1 +
 .../node/templates/kubelet.standard.env.j2         |  1 +
 .../preinstall/tasks/verify-settings.yml           | 14 ++++++++++++++
 4 files changed, 20 insertions(+)

diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml
index c85de5ad0..9a3a08e5b 100644
--- a/roles/kubernetes/node/defaults/main.yml
+++ b/roles/kubernetes/node/defaults/main.yml
@@ -71,6 +71,10 @@ kube_apiserver_node_port_range: "30000-32767"
 
 kubelet_load_modules: false
 
+# Maximum number of pods that can run on a single node.
+# The default matches the kubelet's own built-in default.
+kubelet_max_pods: 110
+
 ## Support custom flags to be passed to kubelet
 kubelet_custom_flags: []
 
diff --git a/roles/kubernetes/node/templates/kubelet.kubeadm.env.j2 b/roles/kubernetes/node/templates/kubelet.kubeadm.env.j2
index acc7411e3..fa2db11f4 100644
--- a/roles/kubernetes/node/templates/kubelet.kubeadm.env.j2
+++ b/roles/kubernetes/node/templates/kubelet.kubeadm.env.j2
@@ -33,6 +33,7 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
 --pod-infra-container-image={{ pod_infra_image_repo }}:{{ pod_infra_image_tag }} \
 --node-status-update-frequency={{ kubelet_status_update_frequency }} \
 --cgroup-driver={{ kubelet_cgroup_driver|default(kubelet_cgroup_driver_detected) }} \
+--max-pods={{ kubelet_max_pods }} \
 --docker-disable-shared-pid={{ kubelet_disable_shared_pid }} \
 --anonymous-auth=false \
 --read-only-port={{ kube_read_only_port }} \
diff --git a/roles/kubernetes/node/templates/kubelet.standard.env.j2 b/roles/kubernetes/node/templates/kubelet.standard.env.j2
index 19100c1a7..83d657f7e 100644
--- a/roles/kubernetes/node/templates/kubelet.standard.env.j2
+++ b/roles/kubernetes/node/templates/kubelet.standard.env.j2
@@ -28,6 +28,7 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
 {% endif %}
 --cgroup-driver={{ kubelet_cgroup_driver|default(kubelet_cgroup_driver_detected) }} \
 --cgroups-per-qos={{ kubelet_cgroups_per_qos }} \
+--max-pods={{ kubelet_max_pods }} \
 {% if kube_version | version_compare('v1.8', '<') %}
 --experimental-fail-swap-on={{ kubelet_fail_swap_on|default(true)}} \
 {% else %}
diff --git a/roles/kubernetes/preinstall/tasks/verify-settings.yml b/roles/kubernetes/preinstall/tasks/verify-settings.yml
index 5f647101d..0f7c8bdc3 100644
--- a/roles/kubernetes/preinstall/tasks/verify-settings.yml
+++ b/roles/kubernetes/preinstall/tasks/verify-settings.yml
@@ -61,6 +61,20 @@
   ignore_errors: "{{ ignore_assert_errors }}"
   when: inventory_hostname in groups['kube-node']
 
+# This assertion fails on the safe side: pods using the host network namespace
+# need no address from the node's pod CIDR, so more pods than addresses could
+# run. Their number is unknown at provisioning time, so they are factored out.
+# A /N node prefix provides 2^(32-N) addresses, minus network and broadcast.
+# NOTICE: the check blatantly ignores the inet6-case
+- name: Guarantee that enough network address space is available for all pods
+  assert:
+    that: "{{ kubelet_max_pods | default(110) | int <= (2 ** (32 - kube_network_node_prefix | int)) - 2 }}"
+    msg: "Do not schedule more pods on a node than inet addresses are available."
+  ignore_errors: "{{ ignore_assert_errors }}"
+  when:
+    - inventory_hostname in groups['kube-node']
+    - kube_network_node_prefix is defined
+
 - name: Stop if ip var does not match local ips
   assert:
     that: ip in ansible_all_ipv4_addresses
-- 
GitLab