From 726711513f0cc95e0372429526ec0ccbb2335516 Mon Sep 17 00:00:00 2001
From: Ilya Margolin <ilya@ulani.de>
Date: Fri, 23 Sep 2022 19:38:27 +0200
Subject: [PATCH] [containerd] Allow configuring base_runtime_spec per
 containerd runtime (#9302)

and supply a default runtime spec.
---
 docs/containerd.md                            |  61 +++++
 .../containerd/defaults/main.yml              |   4 +
 .../containerd/files/cri-base.json            | 214 ++++++++++++++++++
 .../containerd/tasks/main.yml                 |   9 +
 .../containerd/templates/config.toml.j2       |   4 +
 5 files changed, 292 insertions(+)
 create mode 100644 roles/container-engine/containerd/files/cri-base.json

diff --git a/docs/containerd.md b/docs/containerd.md
index 32de17683..847f7c9ca 100644
--- a/docs/containerd.md
+++ b/docs/containerd.md
@@ -39,4 +39,65 @@ containerd_registries:
 image_command_tool: crictl
 ```
 
+### Containerd Runtimes
+
+Containerd supports multiple runtime configurations that can be used with the
+Kubernetes [RuntimeClass] feature. See [runtime classes in containerd] for the
+details of containerd configuration.
+
+In kubespray, the default runtime name is "runc", and it can be configured with the `containerd_runc_runtime` dictionary:
+
+```yaml
+containerd_runc_runtime:
+  name: runc
+  type: "io.containerd.runc.v2"
+  engine: ""
+  root: ""
+  options:
+    systemdCgroup: "false"
+    binaryName: '"/usr/local/bin/my-runc"'  # option values are rendered verbatim into config.toml, so strings need embedded TOML quotes
+  base_runtime_spec: cri-base.json
+```
+
+Further runtimes can be configured with `containerd_additional_runtimes`, which
+is a list of such dictionaries.
+
+The default runtime can be changed by setting `containerd_default_runtime`.
+
+#### base_runtime_spec
+
+The `base_runtime_spec` key in a runtime dictionary can be used to explicitly
+specify a runtime spec JSON file. We ship a default one, which was generated
+with `ctr oci spec > /etc/containerd/cri-base.json`. It will be used if you set
+`base_runtime_spec: cri-base.json`. The main advantage of doing so is the presence
+of the `rlimits` section in this configuration, which restricts the maximum number
+of file descriptors (open files) per container to 1024.
+
+You can tune many more [settings][runtime-spec] by supplying your own file name and content with `containerd_base_runtime_specs`:
+
+```yaml
+containerd_base_runtime_specs:
+  cri-spec-custom.json: |
+    {
+      "ociVersion": "1.0.2-dev",
+      "process": {
+        "user": {
+          "uid": 0,
+    ...
+```
+
+The files in this dict will be placed in the containerd config directory,
+`/etc/containerd` by default. The files can then be referenced by filename in a
+runtime:
+
+```yaml
+containerd_runc_runtime:
+  name: runc
+  base_runtime_spec: cri-spec-custom.json
+  ...
+```
+
 [containerd]: https://containerd.io/
+[RuntimeClass]: https://kubernetes.io/docs/concepts/containers/runtime-class/
+[runtime classes in containerd]: https://github.com/containerd/containerd/blob/main/docs/cri/config.md#runtime-classes
+[runtime-spec]: https://github.com/opencontainers/runtime-spec
diff --git a/roles/container-engine/containerd/defaults/main.yml b/roles/container-engine/containerd/defaults/main.yml
index 403f1a9c4..af5f54379 100644
--- a/roles/container-engine/containerd/defaults/main.yml
+++ b/roles/container-engine/containerd/defaults/main.yml
@@ -12,6 +12,7 @@ containerd_runc_runtime:
   type: "io.containerd.runc.v2"
   engine: ""
   root: ""
+  # base_runtime_spec: cri-base.json # use this to limit number of file descriptors per container
   options:
     systemdCgroup: "{{ containerd_use_systemd_cgroup | ternary('true', 'false') }}"
 
@@ -22,6 +23,9 @@ containerd_additional_runtimes: []
 #    engine: ""
 #    root: ""
 
+containerd_base_runtime_specs:  # spec file name -> file content; each entry is written into containerd_cfg_dir
+  cri-base.json: "{{ lookup('file', 'cri-base.json') }}"
+
 containerd_grpc_max_recv_message_size: 16777216
 containerd_grpc_max_send_message_size: 16777216
 
diff --git a/roles/container-engine/containerd/files/cri-base.json b/roles/container-engine/containerd/files/cri-base.json
new file mode 100644
index 000000000..f022438a4
--- /dev/null
+++ b/roles/container-engine/containerd/files/cri-base.json
@@ -0,0 +1,214 @@
+{
+    "ociVersion": "1.0.2-dev",
+    "process": {
+        "user": {
+            "uid": 0,
+            "gid": 0
+        },
+        "cwd": "/",
+        "capabilities": {
+            "bounding": [
+                "CAP_CHOWN",
+                "CAP_DAC_OVERRIDE",
+                "CAP_FSETID",
+                "CAP_FOWNER",
+                "CAP_MKNOD",
+                "CAP_NET_RAW",
+                "CAP_SETGID",
+                "CAP_SETUID",
+                "CAP_SETFCAP",
+                "CAP_SETPCAP",
+                "CAP_NET_BIND_SERVICE",
+                "CAP_SYS_CHROOT",
+                "CAP_KILL",
+                "CAP_AUDIT_WRITE"
+            ],
+            "effective": [
+                "CAP_CHOWN",
+                "CAP_DAC_OVERRIDE",
+                "CAP_FSETID",
+                "CAP_FOWNER",
+                "CAP_MKNOD",
+                "CAP_NET_RAW",
+                "CAP_SETGID",
+                "CAP_SETUID",
+                "CAP_SETFCAP",
+                "CAP_SETPCAP",
+                "CAP_NET_BIND_SERVICE",
+                "CAP_SYS_CHROOT",
+                "CAP_KILL",
+                "CAP_AUDIT_WRITE"
+            ],
+            "inheritable": [
+                "CAP_CHOWN",
+                "CAP_DAC_OVERRIDE",
+                "CAP_FSETID",
+                "CAP_FOWNER",
+                "CAP_MKNOD",
+                "CAP_NET_RAW",
+                "CAP_SETGID",
+                "CAP_SETUID",
+                "CAP_SETFCAP",
+                "CAP_SETPCAP",
+                "CAP_NET_BIND_SERVICE",
+                "CAP_SYS_CHROOT",
+                "CAP_KILL",
+                "CAP_AUDIT_WRITE"
+            ],
+            "permitted": [
+                "CAP_CHOWN",
+                "CAP_DAC_OVERRIDE",
+                "CAP_FSETID",
+                "CAP_FOWNER",
+                "CAP_MKNOD",
+                "CAP_NET_RAW",
+                "CAP_SETGID",
+                "CAP_SETUID",
+                "CAP_SETFCAP",
+                "CAP_SETPCAP",
+                "CAP_NET_BIND_SERVICE",
+                "CAP_SYS_CHROOT",
+                "CAP_KILL",
+                "CAP_AUDIT_WRITE"
+            ]
+        },
+        "rlimits": [
+            {
+                "type": "RLIMIT_NOFILE",
+                "hard": 1024,
+                "soft": 1024
+            }
+        ],
+        "noNewPrivileges": true
+    },
+    "root": {
+        "path": "rootfs"
+    },
+    "mounts": [
+        {
+            "destination": "/proc",
+            "type": "proc",
+            "source": "proc",
+            "options": [
+                "nosuid",
+                "noexec",
+                "nodev"
+            ]
+        },
+        {
+            "destination": "/dev",
+            "type": "tmpfs",
+            "source": "tmpfs",
+            "options": [
+                "nosuid",
+                "strictatime",
+                "mode=755",
+                "size=65536k"
+            ]
+        },
+        {
+            "destination": "/dev/pts",
+            "type": "devpts",
+            "source": "devpts",
+            "options": [
+                "nosuid",
+                "noexec",
+                "newinstance",
+                "ptmxmode=0666",
+                "mode=0620",
+                "gid=5"
+            ]
+        },
+        {
+            "destination": "/dev/shm",
+            "type": "tmpfs",
+            "source": "shm",
+            "options": [
+                "nosuid",
+                "noexec",
+                "nodev",
+                "mode=1777",
+                "size=65536k"
+            ]
+        },
+        {
+            "destination": "/dev/mqueue",
+            "type": "mqueue",
+            "source": "mqueue",
+            "options": [
+                "nosuid",
+                "noexec",
+                "nodev"
+            ]
+        },
+        {
+            "destination": "/sys",
+            "type": "sysfs",
+            "source": "sysfs",
+            "options": [
+                "nosuid",
+                "noexec",
+                "nodev",
+                "ro"
+            ]
+        },
+        {
+            "destination": "/run",
+            "type": "tmpfs",
+            "source": "tmpfs",
+            "options": [
+                "nosuid",
+                "strictatime",
+                "mode=755",
+                "size=65536k"
+            ]
+        }
+    ],
+    "linux": {
+        "resources": {
+            "devices": [
+                {
+                    "allow": false,
+                    "access": "rwm"
+                }
+            ]
+        },
+        "cgroupsPath": "/default",
+        "namespaces": [
+            {
+                "type": "pid"
+            },
+            {
+                "type": "ipc"
+            },
+            {
+                "type": "uts"
+            },
+            {
+                "type": "mount"
+            },
+            {
+                "type": "network"
+            }
+        ],
+        "maskedPaths": [
+            "/proc/acpi",
+            "/proc/asound",
+            "/proc/kcore",
+            "/proc/keys",
+            "/proc/latency_stats",
+            "/proc/timer_list",
+            "/proc/timer_stats",
+            "/proc/sched_debug",
+            "/sys/firmware",
+            "/proc/scsi"
+        ],
+        "readonlyPaths": [
+            "/proc/bus",
+            "/proc/fs",
+            "/proc/irq",
+            "/proc/sys",
+            "/proc/sysrq-trigger"
+        ]
+    }
+}
diff --git a/roles/container-engine/containerd/tasks/main.yml b/roles/container-engine/containerd/tasks/main.yml
index e2c447607..5415059f3 100644
--- a/roles/container-engine/containerd/tasks/main.yml
+++ b/roles/container-engine/containerd/tasks/main.yml
@@ -84,6 +84,15 @@
   notify: restart containerd
   when: http_proxy is defined or https_proxy is defined
 
+- name: containerd | Write base_runtime_specs
+  copy:
+    content: "{{ item.value }}"  # spec file content (dict value)
+    dest: "{{ containerd_cfg_dir }}/{{ item.key }}"  # dict key is the file name
+    owner: "root"
+    mode: "0644"  # quoted so YAML cannot reinterpret the octal literal as an integer
+  with_dict: "{{ containerd_base_runtime_specs | default({}) }}"  # no-op when the dict is undefined
+  notify: restart containerd
+
 - name: containerd | Copy containerd config file
   template:
     src: config.toml.j2
diff --git a/roles/container-engine/containerd/templates/config.toml.j2 b/roles/container-engine/containerd/templates/config.toml.j2
index 6ab414dc0..7ffe37045 100644
--- a/roles/container-engine/containerd/templates/config.toml.j2
+++ b/roles/container-engine/containerd/templates/config.toml.j2
@@ -27,6 +27,10 @@ oom_score = {{ containerd_oom_score }}
           runtime_type = "{{ runtime.type }}"
           runtime_engine = "{{ runtime.engine }}"
           runtime_root = "{{ runtime.root }}"
+{% if runtime.base_runtime_spec is defined %}
+          base_runtime_spec = "{{ containerd_cfg_dir }}/{{ runtime.base_runtime_spec }}"
+{% endif %}
+
           [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.{{ runtime.name }}.options]
 {% for key, value in runtime.options.items() %}
             {{ key }} = {{ value }}
-- 
GitLab