From 1513254622e47eb38a2a888312e19d16c60aa585 Mon Sep 17 00:00:00 2001
From: Farshad Asadpour <asadpoor.f@gmail.com>
Date: Thu, 27 Mar 2025 16:40:34 +0330
Subject: [PATCH] fix(remove-node): Ensure safety and validation for node
 removal process (#12085)

This commit enhances the node removal playbook's reliability and safety by implementing the following changes:

1. **Node Validation**: Added a validation step using assert to ensure the `node` variable is defined and contains nodes. If the list is empty or undefined, the playbook fails early, preventing accidental operations on the entire cluster.

2. **Removed Defaulting for Hosts**: Updated the plays so that they no longer fall back to critical groups (e.g., `etcd:k8s_cluster:calico_rr`) when `node` is unset. The `hosts` fallback is now the placeholder `this_is_unreachable`, which is not expected to match any inventory host, so together with the up-front validation the plays act only on the nodes the user names explicitly and avoid unintended targeting.

3. **Explicit User Confirmation**: Enhanced the confirmation prompt to clarify the scope of the operation. The admin is now required to explicitly confirm node state deletion, ensuring a deliberate decision before proceeding.

These improvements strengthen the reliability and safety of the `remove-node.yml` playbook by eliminating ambiguous behavior, preventing misconfigurations, and ensuring clear interaction during node removal tasks.
---
 docs/getting_started/getting-started.md |  2 ++
 playbooks/remove_node.yml               | 16 +++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/docs/getting_started/getting-started.md b/docs/getting_started/getting-started.md
index 77fdf244f..18050dc4b 100644
--- a/docs/getting_started/getting-started.md
+++ b/docs/getting_started/getting-started.md
@@ -59,6 +59,8 @@ ansible-playbook -i inventory/mycluster/hosts.yml remove-node.yml -b -v \
 --extra-vars "node=nodename,nodename2"
 ```
 
+> Note: The playbook does not currently support removing the first control plane node or the first etcd node. These nodes are essential for maintaining cluster operations and must remain intact.
+
 If a node is completely unreachable by ssh, add `--extra-vars reset_nodes=false`
 to skip the node reset step. If one node is unavailable, but others you wish
 to remove are able to connect via SSH, you could set `reset_nodes=false` as a host
diff --git a/playbooks/remove_node.yml b/playbooks/remove_node.yml
index 212bc0f4e..9fa2c550a 100644
--- a/playbooks/remove_node.yml
+++ b/playbooks/remove_node.yml
@@ -1,9 +1,19 @@
 ---
+- name: Validate nodes for removal
+  hosts: localhost
+  gather_facts: false  # pure variable check; matches the other plays in this file
+  tasks:
+    - name: Assert that nodes are specified for removal
+      assert:
+        that:  # default('') makes an unset `node` fail the same check as an empty one
+          - node | default('') | length > 0
+        msg: "No nodes specified for removal. The `node` variable must be set explicitly."
+
 - name: Common tasks for every playbooks
   import_playbook: boilerplate.yml
 
 - name: Confirm node removal
-  hosts: "{{ node | default('etcd:k8s_cluster:calico_rr') }}"
+  hosts: "{{ node | default('this_is_unreachable') }}"
   gather_facts: false
   tasks:
     - name: Confirm Execution
@@ -24,7 +34,7 @@
   when: reset_nodes | default(True) | bool
 
 - name: Reset node
-  hosts: "{{ node | default('kube_node') }}"
+  hosts: "{{ node | default('this_is_unreachable') }}"
   gather_facts: false
   environment: "{{ proxy_disable_env }}"
   pre_tasks:
@@ -40,7 +50,7 @@
 
 # Currently cannot remove first control plane node or first etcd node
 - name: Post node removal
-  hosts: "{{ node | default('kube_control_plane[1:]:etcd[1:]') }}"
+  hosts: "{{ node | default('this_is_unreachable') }}"
   gather_facts: false
   environment: "{{ proxy_disable_env }}"
   roles:
-- 
GitLab