From 1c4db6132d9a2bf79e8d72c09cbdb12f3fef572a Mon Sep 17 00:00:00 2001
From: "Shelming.Song" <Shelming.Song@gmail.com>
Date: Sat, 31 Dec 2022 00:05:30 +0800
Subject: [PATCH] optimize cgroups settings for node reserved (#9209)

* optimize cgroups settings for node reserved

* fix

* set cgroup slice for multi container engine

* set cgroup slice for crio

* add reserved cgroups variables to sample files

* Compatible with cgroup path for different container managers

* add cgroups doc

* fix markdown
---
 docs/cgroups.md                               | 72 +++++++++++++++++++
 .../group_vars/k8s_cluster/k8s-cluster.yml    | 27 +++++++
 .../templates/containerd.service.j2           |  4 ++
 .../templates/cri-dockerd.service.j2          |  4 ++
 .../cri-o/templates/crio.conf.j2              |  4 ++
 .../docker/templates/docker.service.j2        |  4 ++
 roles/kubernetes/node/defaults/main.yml       | 12 +++-
 .../templates/kubelet-config.v1beta1.yaml.j2  |  6 +-
 .../node/templates/kubelet.service.j2         | 18 +++++
 9 files changed, 147 insertions(+), 4 deletions(-)
 create mode 100644 docs/cgroups.md

diff --git a/docs/cgroups.md b/docs/cgroups.md
new file mode 100644
index 000000000..30ca7778e
--- /dev/null
+++ b/docs/cgroups.md
@@ -0,0 +1,72 @@
+# cgroups
+
+To avoid resource contention between containers and to limit their impact on the host, the kubelet relies on cgroups to restrict the resource usage of containers in Kubernetes.
+
+## Enforcing Node Allocatable
+
+You can use `kubelet_enforce_node_allocatable` to set node allocatable enforcement.
+
+```yaml
+# A comma separated list of levels of node allocatable enforcement to be enforced by kubelet.
+kubelet_enforce_node_allocatable: "pods"
+# kubelet_enforce_node_allocatable: "pods,kube-reserved"
+# kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved"
+```
+
+Note that enforcing `kube-reserved` or `system-reserved` requires `kube_reserved_cgroups` or `system_reserved_cgroups`, respectively, to be set.
+
+Here is an example:
+
+```yaml
+kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved"
+
+# Reserve this space for kube resources
+# Set to true to reserve resources for kube daemons
+kube_reserved: true
+kube_reserved_cgroups_for_service_slice: kube.slice
+kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
+kube_memory_reserved: 256Mi
+kube_cpu_reserved: 100m
+# kube_ephemeral_storage_reserved: 2Gi
+# kube_pid_reserved: "1000"
+# Reservation for master hosts
+kube_master_memory_reserved: 512Mi
+kube_master_cpu_reserved: 200m
+# kube_master_ephemeral_storage_reserved: 2Gi
+# kube_master_pid_reserved: "1000"
+
+# Set to true to reserve resources for system daemons
+system_reserved: true
+system_reserved_cgroups_for_service_slice: system.slice
+system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
+system_memory_reserved: 512Mi
+system_cpu_reserved: 500m
+# system_ephemeral_storage_reserved: 2Gi
+# system_pid_reserved: "1000"
+# Reservation for master hosts
+system_master_memory_reserved: 256Mi
+system_master_cpu_reserved: 250m
+# system_master_ephemeral_storage_reserved: 2Gi
+# system_master_pid_reserved: "1000"
+```
+
+After the setup, the cgroups hierarchy is as follows:
+
+```bash
+/ (Cgroups Root)
+├── kubepods.slice
+│   ├── ...
+│   ├── kubepods-besteffort.slice
+│   ├── kubepods-burstable.slice
+│   └── ...
+├── kube.slice
+│   ├── ...
+│   ├── {{container_manager}}.service
+│   ├── kubelet.service
+│   └── ...
+├── system.slice
+│   └── ...
+└── ...
+```
+
+You can learn more in the [official Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/).
diff --git a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
index b4c1de7dc..189157d59 100644
--- a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
+++ b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml
@@ -261,9 +261,36 @@ podsecuritypolicy_enabled: false
 # Acceptable options are 'pods', 'system-reserved', 'kube-reserved' and ''. Default is "".
 # kubelet_enforce_node_allocatable: pods
 
+## Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
+# kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service"
+# kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service"
+
+## Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
+# kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
+# kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
+
+## Optionally reserve resources for kube daemons.
+# kube_reserved: true
+## Uncomment to override default values
+## The following two items need to be set when kube_reserved is true
+# kube_reserved_cgroups_for_service_slice: kube.slice
+# kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
+# kube_memory_reserved: 256Mi
+# kube_cpu_reserved: 100m
+# kube_ephemeral_storage_reserved: 2Gi
+# kube_pid_reserved: "1000"
+## Reservation for master hosts
+# kube_master_memory_reserved: 512Mi
+# kube_master_cpu_reserved: 200m
+# kube_master_ephemeral_storage_reserved: 2Gi
+# kube_master_pid_reserved: "1000"
+
 ## Optionally reserve resources for OS system daemons.
 # system_reserved: true
 ## Uncomment to override default values
+## The following two items need to be set when system_reserved is true
+# system_reserved_cgroups_for_service_slice: system.slice
+# system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
 # system_memory_reserved: 512Mi
 # system_cpu_reserved: 500m
 # system_ephemeral_storage_reserved: 2Gi
diff --git a/roles/container-engine/containerd/templates/containerd.service.j2 b/roles/container-engine/containerd/templates/containerd.service.j2
index adebcf218..06b229084 100644
--- a/roles/container-engine/containerd/templates/containerd.service.j2
+++ b/roles/container-engine/containerd/templates/containerd.service.j2
@@ -36,6 +36,10 @@ LimitMEMLOCK={{ containerd_limit_mem_lock }}
 # Only systemd 226 and above support this version.
 TasksMax=infinity
 OOMScoreAdjust=-999
+# Set the cgroup slice of the service so that kube reserved takes effect
+{% if kube_reserved is defined and kube_reserved|bool %}
+Slice={{ kube_reserved_cgroups_for_service_slice }}
+{% endif %}
 
 [Install]
 WantedBy=multi-user.target
diff --git a/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2 b/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2
index 078f66651..ec128150f 100644
--- a/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2
+++ b/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2
@@ -35,6 +35,10 @@ LimitCORE=infinity
 TasksMax=infinity
 Delegate=yes
 KillMode=process
+# Set the cgroup slice of the service so that kube reserved takes effect
+{% if kube_reserved is defined and kube_reserved|bool %}
+Slice={{ kube_reserved_cgroups_for_service_slice }}
+{% endif %}
 
 [Install]
 WantedBy=multi-user.target
diff --git a/roles/container-engine/cri-o/templates/crio.conf.j2 b/roles/container-engine/cri-o/templates/crio.conf.j2
index 1a25e0929..d209b2bef 100644
--- a/roles/container-engine/cri-o/templates/crio.conf.j2
+++ b/roles/container-engine/cri-o/templates/crio.conf.j2
@@ -113,8 +113,12 @@ conmon = "{{ crio_conmon }}"
 {% if crio_cgroup_manager == "cgroupfs" %}
 conmon_cgroup = "pod"
 {% else %}
+{% if kube_reserved is defined and kube_reserved|bool %}
+conmon_cgroup = "{{ kube_reserved_cgroups_for_service_slice }}"
+{% else %}
 conmon_cgroup = "system.slice"
 {% endif %}
+{% endif %}
 
 # Environment variable list for the conmon process, used for passing necessary
 # environment variables to conmon or the runtime.
diff --git a/roles/container-engine/docker/templates/docker.service.j2 b/roles/container-engine/docker/templates/docker.service.j2
index fd1d06121..539c3a5c4 100644
--- a/roles/container-engine/docker/templates/docker.service.j2
+++ b/roles/container-engine/docker/templates/docker.service.j2
@@ -42,6 +42,10 @@ TimeoutStartSec=1min
 Restart=on-failure
 StartLimitBurst=3
 StartLimitInterval=60s
+# Set the cgroup slice of the service so that kube reserved takes effect
+{% if kube_reserved is defined and kube_reserved|bool %}
+Slice={{ kube_reserved_cgroups_for_service_slice }}
+{% endif %}
 
 [Install]
 WantedBy=multi-user.target
diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml
index 8be61744f..0c6b57b8b 100644
--- a/roles/kubernetes/node/defaults/main.yml
+++ b/roles/kubernetes/node/defaults/main.yml
@@ -12,11 +12,11 @@ kube_resolv_conf: "/etc/resolv.conf"
 kubelet_enforce_node_allocatable: "\"\""
 
 # Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
-kubelet_runtime_cgroups: "/systemd/system.slice"
-kubelet_kubelet_cgroups: "/systemd/system.slice"
+kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service"
+kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service"
 
 # Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
-kubelet_runtime_cgroups_cgroupfs: "/system.slice/containerd.service"
+kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
 kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
 
 ### fail with swap on (default true)
@@ -32,6 +32,10 @@ kubelet_secure_addresses: >-
   {%- endfor -%}
 
 # Reserve this space for kube resources
+# Set to true to reserve resources for kube daemons
+kube_reserved: false
+kube_reserved_cgroups_for_service_slice: kube.slice
+kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
 kube_memory_reserved: 256Mi
 kube_cpu_reserved: 100m
 # kube_ephemeral_storage_reserved: 2Gi
@@ -44,6 +48,8 @@ kube_master_cpu_reserved: 200m
 
 # Set to true to reserve resources for system daemons
 system_reserved: false
+system_reserved_cgroups_for_service_slice: system.slice
+system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
 system_memory_reserved: 512Mi
 system_cpu_reserved: 500m
 # system_ephemeral_storage_reserved: 2Gi
diff --git a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
index 9982f62aa..885fc2ed7 100644
--- a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
+++ b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
@@ -60,6 +60,8 @@ clusterDNS:
 - {{ dns_address }}
 {% endfor %}
 {# Node reserved CPU/memory #}
+{% if kube_reserved|bool %}
+kubeReservedCgroup: {{ kube_reserved_cgroups }}
 kubeReserved:
 {% if is_kube_master|bool %}
   cpu: {{ kube_master_cpu_reserved }}
@@ -80,7 +82,9 @@ kubeReserved:
   pid: "{{ kube_pid_reserved }}"
 {% endif %}
 {% endif %}
-{% if system_reserved is defined and system_reserved %}
+{% endif %}
+{% if system_reserved|bool %}
+systemReservedCgroup: {{ system_reserved_cgroups }}
 systemReserved:
 {% if is_kube_master|bool %}
   cpu: {{ system_master_cpu_reserved }}
diff --git a/roles/kubernetes/node/templates/kubelet.service.j2 b/roles/kubernetes/node/templates/kubelet.service.j2
index feb837424..9df98e09e 100644
--- a/roles/kubernetes/node/templates/kubelet.service.j2
+++ b/roles/kubernetes/node/templates/kubelet.service.j2
@@ -10,6 +10,24 @@ Wants={{ container_manager }}.service
 
 [Service]
 EnvironmentFile=-{{ kube_config_dir }}/kubelet.env
+{% if system_reserved|bool %}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ system_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ system_reserved_cgroups_for_service_slice }}
+{% endif %}
+{% if kube_reserved|bool %}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ kube_reserved_cgroups_for_service_slice }}
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ kube_reserved_cgroups_for_service_slice }}
+{% endif %}
 ExecStart={{ bin_dir }}/kubelet \
 		$KUBE_LOGTOSTDERR \
 		$KUBE_LOG_LEVEL \
-- 
GitLab