From 0d4f57aa22efa93a038e361dce745a07058b9dd7 Mon Sep 17 00:00:00 2001
From: Max Gautier <mg@max.gautier.name>
Date: Fri, 17 Nov 2023 20:01:23 +0100
Subject: [PATCH] Validate systemd unit files (#10597)

* Validate systemd unit files

This ensures that we fail early if we have a bad systemd unit file
(syntax error, using a feature not available in the local systemd version, etc.)

* Hack to check systemd version for service files validation

factory-reset.target was introduced in systemd 250, the same version as the
aliasing feature we need for verifying systemd services with ansible.
So we only actually execute the validation if that target is present.

This is a horrible hack which should be reverted as soon as we drop
support for distributions with systemd<250.
---
 roles/container-engine/containerd/tasks/main.yml  | 3 +++
 roles/container-engine/cri-dockerd/tasks/main.yml | 3 +++
 roles/etcd/tasks/configure.yml                    | 6 ++++++
 roles/kubernetes/control-plane/tasks/main.yml     | 3 +++
 roles/kubernetes/node/tasks/kubelet.yml           | 3 +++
 5 files changed, 18 insertions(+)

diff --git a/roles/container-engine/containerd/tasks/main.yml b/roles/container-engine/containerd/tasks/main.yml
index 43aa68952..f1b977717 100644
--- a/roles/container-engine/containerd/tasks/main.yml
+++ b/roles/container-engine/containerd/tasks/main.yml
@@ -61,6 +61,9 @@
     src: containerd.service.j2
     dest: /etc/systemd/system/containerd.service
     mode: 0644
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:containerd.service'"
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
   notify: Restart containerd
 
 - name: Containerd | Ensure containerd directories exist
diff --git a/roles/container-engine/cri-dockerd/tasks/main.yml b/roles/container-engine/cri-dockerd/tasks/main.yml
index f8965fd04..730e379eb 100644
--- a/roles/container-engine/cri-dockerd/tasks/main.yml
+++ b/roles/container-engine/cri-dockerd/tasks/main.yml
@@ -18,6 +18,9 @@
     src: "{{ item }}.j2"
     dest: "/etc/systemd/system/{{ item }}"
     mode: 0644
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:{{ item }}'"
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
   with_items:
     - cri-dockerd.service
     - cri-dockerd.socket
diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml
index f1d6a4872..438dbc7df 100644
--- a/roles/etcd/tasks/configure.yml
+++ b/roles/etcd/tasks/configure.yml
@@ -51,6 +51,9 @@
     dest: /etc/systemd/system/etcd.service
     backup: yes
     mode: 0644
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:etcd-{{ etcd_deployment_type }}.service'"
   when: is_etcd_master and etcd_cluster_setup
 
 - name: Configure | Copy etcd-events.service systemd file
@@ -59,6 +62,9 @@
     dest: /etc/systemd/system/etcd-events.service
     backup: yes
     mode: 0644
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:etcd-events-{{ etcd_deployment_type }}.service'"
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
   when: is_etcd_master and etcd_events_cluster_setup
 
 - name: Configure | reload systemd
diff --git a/roles/kubernetes/control-plane/tasks/main.yml b/roles/kubernetes/control-plane/tasks/main.yml
index 8f57a04b4..50eccbd07 100644
--- a/roles/kubernetes/control-plane/tasks/main.yml
+++ b/roles/kubernetes/control-plane/tasks/main.yml
@@ -113,6 +113,9 @@
     src: "{{ item }}.j2"
     dest: "/etc/systemd/system/{{ item }}"
     mode: 0644
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:{{item}}'"
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
   with_items:
     - k8s-certs-renew.service
     - k8s-certs-renew.timer
diff --git a/roles/kubernetes/node/tasks/kubelet.yml b/roles/kubernetes/node/tasks/kubelet.yml
index ee01d06cf..d8ff9e230 100644
--- a/roles/kubernetes/node/tasks/kubelet.yml
+++ b/roles/kubernetes/node/tasks/kubelet.yml
@@ -34,6 +34,9 @@
     dest: "/etc/systemd/system/kubelet.service"
     backup: "yes"
     mode: 0600
+    validate: "sh -c '[ -f /usr/bin/systemd/system/factory-reset.target ] || exit 0 && systemd-analyze verify %s:kubelet.service'"
+    # FIXME: check that systemd version >= 250 (factory-reset.target was introduced in that release)
+    # Remove once we drop support for systemd < 250
   notify: Node | restart kubelet
   tags:
     - kubelet
-- 
GitLab