From 5a8cf824f60c37dc79c3b1947bd727726424668c Mon Sep 17 00:00:00 2001
From: Ilya Margolin <ilya@ulani.de>
Date: Tue, 8 Nov 2022 15:44:32 +0100
Subject: [PATCH] [containerd] Simplify limiting number of open files per
 container (#9319)

by setting a default runtime spec with a patch for RLIMIT_NOFILE.

- Introduces containerd_base_runtime_spec_rlimit_nofile.
- Generates base_runtime_spec on-the-fly, to use the containerd version
  of the node.
---
 docs/containerd.md                            |  19 +-
 .../containerd/defaults/main.yml              |  13 +-
 .../containerd/files/cri-base.json            | 214 ------------------
 .../containerd/tasks/main.yml                 |  10 +
 4 files changed, 32 insertions(+), 224 deletions(-)
 delete mode 100644 roles/container-engine/containerd/files/cri-base.json

diff --git a/docs/containerd.md b/docs/containerd.md
index 847f7c9ca..5b20e7f16 100644
--- a/docs/containerd.md
+++ b/docs/containerd.md
@@ -64,14 +64,17 @@ is a list of such dictionaries.
 
 Default runtime can be changed by setting `containerd_default_runtime`.
 
-#### base_runtime_spec
-
-`base_runtime_spec` key in a runtime dictionary can be used to explicitly
-specify a runtime spec json file. We ship the default one which is generated
-with `ctr oci spec > /etc/containerd/cri-base.json`. It will be used if you set
-`base_runtime_spec: cri-base.json`. The main advantage of doing so is the presence of
-`rlimits` section in this configuration, which will restrict the maximum number
-of file descriptors(open files) per container to 1024.
+#### Base runtime specs and limiting number of open files
+
+The `base_runtime_spec` key in a runtime dictionary is used to explicitly
+specify a runtime spec JSON file. The `runc` runtime has it set to
+`cri-base.json`, which is generated with
+`ctr oci spec > /etc/containerd/cri-base.json` and updated to include a custom
+setting for the maximum number of file descriptors per container.
+
+You can change the maximum number of file descriptors per container for the
+default `runc` runtime by setting the
+`containerd_base_runtime_spec_rlimit_nofile` variable.
 
 You can tune many more [settings][runtime-spec] by supplying your own file name and content with `containerd_base_runtime_specs`:
 
diff --git a/roles/container-engine/containerd/defaults/main.yml b/roles/container-engine/containerd/defaults/main.yml
index 5f82fae59..cc630ff20 100644
--- a/roles/container-engine/containerd/defaults/main.yml
+++ b/roles/container-engine/containerd/defaults/main.yml
@@ -15,7 +15,7 @@ containerd_runc_runtime:
   type: "io.containerd.runc.v2"
   engine: ""
   root: ""
-  # base_runtime_spec: cri-base.json # use this to limit number of file descriptors per container
+  base_runtime_spec: cri-base.json
   options:
     systemdCgroup: "{{ containerd_use_systemd_cgroup | ternary('true', 'false') }}"
 
@@ -26,8 +26,17 @@ containerd_additional_runtimes: []
 #    engine: ""
 #    root: ""
 
+containerd_base_runtime_spec_rlimit_nofile: 16384  # RLIMIT_NOFILE (hard and soft) per container
+
+containerd_default_base_runtime_spec_patch:  # combined over the generated default spec for cri-base.json
+  process:
+    rlimits:
+      - type: RLIMIT_NOFILE
+        hard: "{{ containerd_base_runtime_spec_rlimit_nofile }}"
+        soft: "{{ containerd_base_runtime_spec_rlimit_nofile }}"
+
 containerd_base_runtime_specs:
-  cri-base.json: "{{ lookup('file', 'cri-base.json') }}"
+  cri-base.json: "{{ containerd_default_base_runtime_spec | combine(containerd_default_base_runtime_spec_patch, recursive=true) }}"
 
 containerd_grpc_max_recv_message_size: 16777216
 containerd_grpc_max_send_message_size: 16777216
diff --git a/roles/container-engine/containerd/files/cri-base.json b/roles/container-engine/containerd/files/cri-base.json
deleted file mode 100644
index f022438a4..000000000
--- a/roles/container-engine/containerd/files/cri-base.json
+++ /dev/null
@@ -1,214 +0,0 @@
-{
-    "ociVersion": "1.0.2-dev",
-    "process": {
-        "user": {
-            "uid": 0,
-            "gid": 0
-        },
-        "cwd": "/",
-        "capabilities": {
-            "bounding": [
-                "CAP_CHOWN",
-                "CAP_DAC_OVERRIDE",
-                "CAP_FSETID",
-                "CAP_FOWNER",
-                "CAP_MKNOD",
-                "CAP_NET_RAW",
-                "CAP_SETGID",
-                "CAP_SETUID",
-                "CAP_SETFCAP",
-                "CAP_SETPCAP",
-                "CAP_NET_BIND_SERVICE",
-                "CAP_SYS_CHROOT",
-                "CAP_KILL",
-                "CAP_AUDIT_WRITE"
-            ],
-            "effective": [
-                "CAP_CHOWN",
-                "CAP_DAC_OVERRIDE",
-                "CAP_FSETID",
-                "CAP_FOWNER",
-                "CAP_MKNOD",
-                "CAP_NET_RAW",
-                "CAP_SETGID",
-                "CAP_SETUID",
-                "CAP_SETFCAP",
-                "CAP_SETPCAP",
-                "CAP_NET_BIND_SERVICE",
-                "CAP_SYS_CHROOT",
-                "CAP_KILL",
-                "CAP_AUDIT_WRITE"
-            ],
-            "inheritable": [
-                "CAP_CHOWN",
-                "CAP_DAC_OVERRIDE",
-                "CAP_FSETID",
-                "CAP_FOWNER",
-                "CAP_MKNOD",
-                "CAP_NET_RAW",
-                "CAP_SETGID",
-                "CAP_SETUID",
-                "CAP_SETFCAP",
-                "CAP_SETPCAP",
-                "CAP_NET_BIND_SERVICE",
-                "CAP_SYS_CHROOT",
-                "CAP_KILL",
-                "CAP_AUDIT_WRITE"
-            ],
-            "permitted": [
-                "CAP_CHOWN",
-                "CAP_DAC_OVERRIDE",
-                "CAP_FSETID",
-                "CAP_FOWNER",
-                "CAP_MKNOD",
-                "CAP_NET_RAW",
-                "CAP_SETGID",
-                "CAP_SETUID",
-                "CAP_SETFCAP",
-                "CAP_SETPCAP",
-                "CAP_NET_BIND_SERVICE",
-                "CAP_SYS_CHROOT",
-                "CAP_KILL",
-                "CAP_AUDIT_WRITE"
-            ]
-        },
-        "rlimits": [
-            {
-                "type": "RLIMIT_NOFILE",
-                "hard": 1024,
-                "soft": 1024
-            }
-        ],
-        "noNewPrivileges": true
-    },
-    "root": {
-        "path": "rootfs"
-    },
-    "mounts": [
-        {
-            "destination": "/proc",
-            "type": "proc",
-            "source": "proc",
-            "options": [
-                "nosuid",
-                "noexec",
-                "nodev"
-            ]
-        },
-        {
-            "destination": "/dev",
-            "type": "tmpfs",
-            "source": "tmpfs",
-            "options": [
-                "nosuid",
-                "strictatime",
-                "mode=755",
-                "size=65536k"
-            ]
-        },
-        {
-            "destination": "/dev/pts",
-            "type": "devpts",
-            "source": "devpts",
-            "options": [
-                "nosuid",
-                "noexec",
-                "newinstance",
-                "ptmxmode=0666",
-                "mode=0620",
-                "gid=5"
-            ]
-        },
-        {
-            "destination": "/dev/shm",
-            "type": "tmpfs",
-            "source": "shm",
-            "options": [
-                "nosuid",
-                "noexec",
-                "nodev",
-                "mode=1777",
-                "size=65536k"
-            ]
-        },
-        {
-            "destination": "/dev/mqueue",
-            "type": "mqueue",
-            "source": "mqueue",
-            "options": [
-                "nosuid",
-                "noexec",
-                "nodev"
-            ]
-        },
-        {
-            "destination": "/sys",
-            "type": "sysfs",
-            "source": "sysfs",
-            "options": [
-                "nosuid",
-                "noexec",
-                "nodev",
-                "ro"
-            ]
-        },
-        {
-            "destination": "/run",
-            "type": "tmpfs",
-            "source": "tmpfs",
-            "options": [
-                "nosuid",
-                "strictatime",
-                "mode=755",
-                "size=65536k"
-            ]
-        }
-    ],
-    "linux": {
-        "resources": {
-            "devices": [
-                {
-                    "allow": false,
-                    "access": "rwm"
-                }
-            ]
-        },
-        "cgroupsPath": "/default",
-        "namespaces": [
-            {
-                "type": "pid"
-            },
-            {
-                "type": "ipc"
-            },
-            {
-                "type": "uts"
-            },
-            {
-                "type": "mount"
-            },
-            {
-                "type": "network"
-            }
-        ],
-        "maskedPaths": [
-            "/proc/acpi",
-            "/proc/asound",
-            "/proc/kcore",
-            "/proc/keys",
-            "/proc/latency_stats",
-            "/proc/timer_list",
-            "/proc/timer_stats",
-            "/proc/sched_debug",
-            "/sys/firmware",
-            "/proc/scsi"
-        ],
-        "readonlyPaths": [
-            "/proc/bus",
-            "/proc/fs",
-            "/proc/irq",
-            "/proc/sys",
-            "/proc/sysrq-trigger"
-        ]
-    }
-}
diff --git a/roles/container-engine/containerd/tasks/main.yml b/roles/container-engine/containerd/tasks/main.yml
index 6bb536413..50efd4add 100644
--- a/roles/container-engine/containerd/tasks/main.yml
+++ b/roles/container-engine/containerd/tasks/main.yml
@@ -84,6 +84,16 @@
   notify: restart containerd
   when: http_proxy is defined or https_proxy is defined
 
+- name: containerd | Generate default base_runtime_spec
+  command: "{{ containerd_bin_dir }}/ctr oci spec"
+  register: ctr_oci_spec  # stdout is parsed by the following set_fact task
+  changed_when: false  # output is only captured, so never report a change
+  check_mode: false  # run even in check mode so the registered output exists
+
+- name: containerd | Store generated default base_runtime_spec
+  set_fact:  # parse the JSON printed by the registered `ctr oci spec` run into a fact
+    containerd_default_base_runtime_spec: "{{ ctr_oci_spec.stdout | from_json }}"
+
 - name: containerd | Write base_runtime_specs
   copy:
     content: "{{ item.value }}"
-- 
GitLab