From 10c30ea5b1e1c08c7715aa00c0e2bbecba05a14d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Utku=20=C3=96zdemir?= <uoz@protonmail.com>
Date: Wed, 20 Oct 2021 10:51:58 +0300
Subject: [PATCH] Add fallback to node drain using --disable-eviction flag
 (#8094)

* Add fallback to node drain using --disable-eviction flag

Signed-off-by: Utku Ozdemir <uoz@protonmail.com>

* Move drain fallback tasks to separate file

Signed-off-by: Utku Ozdemir <uoz@protonmail.com>

* Add delegate_facts to fix the drain fallback

Signed-off-by: Utku Ozdemir <uoz@protonmail.com>

* Fix ansible-lint error

Signed-off-by: Utku Ozdemir <uoz@protonmail.com>

* Move drain fallback into block

Signed-off-by: Utku Ozdemir <uoz@protonmail.com>
---
 roles/upgrade/pre-upgrade/defaults/main.yml |  6 ++++
 roles/upgrade/pre-upgrade/tasks/main.yml    | 37 +++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/roles/upgrade/pre-upgrade/defaults/main.yml b/roles/upgrade/pre-upgrade/defaults/main.yml
index 642c4cb81..900b834ee 100644
--- a/roles/upgrade/pre-upgrade/defaults/main.yml
+++ b/roles/upgrade/pre-upgrade/defaults/main.yml
@@ -6,6 +6,12 @@ drain_nodes: true
 drain_retries: 3
 drain_retry_delay_seconds: 10
 
+drain_fallback_enabled: false  # true: a failed drain is retried with --disable-eviction
+drain_fallback_grace_period: 300  # passed as --grace-period to the fallback drain
+drain_fallback_timeout: 360s  # passed as --timeout to the fallback drain
+drain_fallback_retries: 0  # retries of the fallback drain command
+drain_fallback_retry_delay_seconds: 10  # delay between fallback drain retries
+
 upgrade_node_always_cordon: false
 upgrade_node_uncordon_after_drain_failure: true
 upgrade_node_fail_if_drain_fails: true
diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml
index 192c73875..36d06224e 100644
--- a/roles/upgrade/pre-upgrade/tasks/main.yml
+++ b/roles/upgrade/pre-upgrade/tasks/main.yml
@@ -73,15 +73,48 @@
         {{ bin_dir }}/kubectl drain
         --force
         --ignore-daemonsets
-        --grace-period {{ drain_grace_period }}
-        --timeout {{ drain_timeout }}
+        --grace-period {{ hostvars['localhost']['drain_grace_period_after_failure'] | default(drain_grace_period) }}
+        --timeout {{ hostvars['localhost']['drain_timeout_after_failure'] | default(drain_timeout) }}
         --delete-emptydir-data {{ kube_override_hostname|default(inventory_hostname) }}
         {% if drain_pod_selector %}--pod-selector '{{ drain_pod_selector }}'{% endif %}
       when: drain_nodes
       register: result
+      failed_when:  # both must hold: a non-zero rc only fails when no fallback is enabled
+        - result.rc != 0
+        - not drain_fallback_enabled
       until: result.rc == 0
       retries: "{{ drain_retries }}"
       delay: "{{ drain_retry_delay_seconds }}"
+
+    - name: Drain fallback
+      block:
+        - name: Set facts after regular drain has failed
+          set_fact:
+            drain_grace_period_after_failure: "{{ drain_fallback_grace_period }}"
+            drain_timeout_after_failure: "{{ drain_fallback_timeout }}"
+          delegate_to: localhost  # the regular drain reads these via hostvars['localhost']
+          delegate_facts: true
+          # No run_once: its `when` would be judged by the first host of the batch only.
+
+        - name: Drain node - fallback with disabled eviction
+          command: >-
+            {{ bin_dir }}/kubectl drain
+            --force
+            --ignore-daemonsets
+            --grace-period {{ drain_fallback_grace_period }}
+            --timeout {{ drain_fallback_timeout }}
+            --delete-emptydir-data {{ kube_override_hostname|default(inventory_hostname) }}
+            {% if drain_pod_selector %}--pod-selector '{{ drain_pod_selector }}'{% endif %}
+            --disable-eviction
+          register: drain_fallback_result
+          until: drain_fallback_result.rc == 0
+          retries: "{{ drain_fallback_retries }}"
+          delay: "{{ drain_fallback_retry_delay_seconds }}"
+      when:
+        - drain_nodes
+        - drain_fallback_enabled
+        - result.rc != 0  # rc registered by the regular drain command
+
   rescue:
     - name: Set node back to schedulable
       command: "{{ bin_dir }}/kubectl --kubeconfig {{ kube_config_dir }}/admin.conf uncordon {{ inventory_hostname }}"
-- 
GitLab