From d378d789cff37c51cf237093e24ebc83f3867a72 Mon Sep 17 00:00:00 2001
From: David Louks <2402775+dlouks@users.noreply.github.com>
Date: Tue, 26 Jan 2021 13:10:31 -0600
Subject: [PATCH] Add retries to drain during upgrade. Allow leaving nodes
 cordoned after drain failure. Allow continuing upgrade if drain fails.
 (#7206)

---
Notes (text between the "---" line and the diffstat is not part of the commit
message):

New defaults introduced here:
  drain_retries, drain_retry_delay_seconds
    Used as the `retries` and `delay` of the drain task, which is repeated
    until the command exits with rc 0.
  upgrade_node_uncordon_after_drain_failure
    Gates the rescue task that uncordons the node after a failed drain; set to
    false to leave the node cordoned.
  upgrade_node_fail_if_drain_fails
    Gates the "Fail after rescue" task; set to false to continue the upgrade
    even though the drain failed.

Deviation from upstream #7206: the rescue uncordon now targets
kube_override_hostname|default(inventory_hostname), the same node name the
drain command uses, instead of the bare inventory_hostname.

 roles/upgrade/pre-upgrade/defaults/main.yml |  5 +++++
 roles/upgrade/pre-upgrade/tasks/main.yml    | 11 ++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/roles/upgrade/pre-upgrade/defaults/main.yml b/roles/upgrade/pre-upgrade/defaults/main.yml
index 0179ebbc6..ddff1ea55 100644
--- a/roles/upgrade/pre-upgrade/defaults/main.yml
+++ b/roles/upgrade/pre-upgrade/defaults/main.yml
@@ -3,6 +3,11 @@ drain_grace_period: 300
 drain_timeout: 360s
 drain_pod_selector: ""
 drain_nodes: true
+drain_retries: 3
+drain_retry_delay_seconds: 10
+
+upgrade_node_uncordon_after_drain_failure: true
+upgrade_node_fail_if_drain_fails: true
 
 upgrade_node_confirm: false
 upgrade_node_pause_seconds: 0
diff --git a/roles/upgrade/pre-upgrade/tasks/main.yml b/roles/upgrade/pre-upgrade/tasks/main.yml
index 18f44470a..bf436d360 100644
--- a/roles/upgrade/pre-upgrade/tasks/main.yml
+++ b/roles/upgrade/pre-upgrade/tasks/main.yml
@@ -77,14 +77,19 @@
         --timeout {{ drain_timeout }}
         --delete-local-data {{ kube_override_hostname|default(inventory_hostname) }}
         {% if drain_pod_selector %}--pod-selector '{{ drain_pod_selector }}'{% endif %}
-      when:
-        - drain_nodes
+      when: drain_nodes
+      register: result
+      until: result.rc == 0
+      retries: "{{ drain_retries }}"
+      delay: "{{ drain_retry_delay_seconds }}"
   rescue:
     - name: Set node back to schedulable
-      command: "{{ bin_dir }}/kubectl --kubeconfig /etc/kubernetes/admin.conf uncordon {{ inventory_hostname }}"
+      command: "{{ bin_dir }}/kubectl --kubeconfig /etc/kubernetes/admin.conf uncordon {{ kube_override_hostname|default(inventory_hostname) }}"
+      when: upgrade_node_uncordon_after_drain_failure
     - name: Fail after rescue
       fail:
         msg: "Failed to drain node {{ inventory_hostname }}"
+      when: upgrade_node_fail_if_drain_fails
   delegate_to: "{{ groups['kube-master'][0] }}"
   when:
     - needs_cordoning
-- 
GitLab