From 390764c2b4e5777c74ef52ab5e4199ea34151e69 Mon Sep 17 00:00:00 2001
From: Bogdan Dobrelya <bdobrelia@mirantis.com>
Date: Thu, 15 Sep 2016 11:23:27 +0200
Subject: [PATCH] Add retry_stagger var for failed download/pushes.

* Add the retry_stagger var to tweak push and retry time strategies.
* Add large deployments related docs.

Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
---
 docs/large-deploymets.md                   | 19 +++++++++++++++++++
 inventory/group_vars/all.yml               |  2 ++
 roles/docker/tasks/main.yml                |  4 ++--
 roles/download/tasks/main.yml              |  6 +++---
 roles/etcd/tasks/install.yml               |  2 +-
 roles/kubernetes/master/tasks/main.yml     |  2 +-
 roles/kubernetes/preinstall/tasks/main.yml |  2 +-
 roles/network_plugin/calico/tasks/main.yml |  2 +-
 8 files changed, 30 insertions(+), 9 deletions(-)
 create mode 100644 docs/large-deploymets.md

diff --git a/docs/large-deploymets.md b/docs/large-deploymets.md
new file mode 100644
index 000000000..2a36c3ebc
--- /dev/null
+++ b/docs/large-deploymets.md
@@ -0,0 +1,19 @@
+Large deployments of K8s
+========================
+
+For large-scale deployments, consider the following configuration changes:
+
+* Tune [ansible settings](http://docs.ansible.com/ansible/intro_configuration.html)
+  for `forks` and `timeout` vars to fit large numbers of nodes being deployed.
+
+* Override containers' `foo_image_repo` vars to point to an intranet registry.
+
+* Set ``download_run_once: true`` to download binaries and container images
+  only once and then push them to nodes in batches.
+
+* Adjust the `retry_stagger` global var as appropriate. It should keep the
+  load on the delegate (the first K8s master node) sane when failed push or
+  download operations are retried.
+
+For example, when deploying 200 nodes, you may want to run ansible with
+``--forks=50`` and ``--timeout=600``, and set ``retry_stagger: 60``.
diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml
index 91fab4c06..2de01828c 100644
--- a/inventory/group_vars/all.yml
+++ b/inventory/group_vars/all.yml
@@ -7,6 +7,8 @@ bin_dir: /usr/local/bin
 # Where the binaries will be downloaded.
 # Note: ensure that you've enough disk space (about 1G)
 local_release_dir: "/tmp/releases"
+# Upper bound of the random stagger added to the delay between retries of failed ops like pushing/downloading
+retry_stagger: 5
 
 # Uncomment this line for CoreOS only.
 # Directory where python binary is installed
diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml
index 826e16978..1d237f5e9 100644
--- a/roles/docker/tasks/main.yml
+++ b/roles/docker/tasks/main.yml
@@ -30,7 +30,7 @@
   register: keyserver_task_result
   until: keyserver_task_result|success
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   with_items: "{{ docker_repo_key_info.repo_keys }}"
   when: ansible_os_family != "CoreOS"
 
@@ -58,7 +58,7 @@
   register: docker_task_result
   until: docker_task_result|success
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   with_items: "{{ docker_package_info.pkgs }}"
   when: (ansible_os_family != "CoreOS") and (docker_package_info.pkgs|length > 0)
 
diff --git a/roles/download/tasks/main.yml b/roles/download/tasks/main.yml
index 73622f06d..6329a1108 100644
--- a/roles/download/tasks/main.yml
+++ b/roles/download/tasks/main.yml
@@ -22,7 +22,7 @@
   register: get_url_result
   until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   when: "{{ download.enabled|bool and not download.container|bool }}"
   delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
   run_once: "{{ download_run_once|bool }}"
@@ -63,7 +63,7 @@
   register: pull_task_result
   until: pull_task_result.rc == 0
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   when: "{{ download.enabled|bool and download.container|bool }}"
   delegate_to: "{{ groups['kube-master'][0] if download_run_once|bool else omit }}"
   run_once: "{{ download_run_once|bool }}"
@@ -85,7 +85,7 @@
   register: get_task
   until: get_task|success
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   when: ansible_os_family != "CoreOS" and inventory_hostname != groups['kube-master'][0] and download_run_once|bool
 
 - name: Download | load container images
diff --git a/roles/etcd/tasks/install.yml b/roles/etcd/tasks/install.yml
index 959133c29..aa7f32ca3 100644
--- a/roles/etcd/tasks/install.yml
+++ b/roles/etcd/tasks/install.yml
@@ -20,7 +20,7 @@
   register: etcd_task_result
   until: etcd_task_result.rc == 0
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   changed_when: false
 
 #Plan B: looks nicer, but requires docker-py on all hosts:
diff --git a/roles/kubernetes/master/tasks/main.yml b/roles/kubernetes/master/tasks/main.yml
index e8dfe08fc..ff6abcb13 100644
--- a/roles/kubernetes/master/tasks/main.yml
+++ b/roles/kubernetes/master/tasks/main.yml
@@ -12,7 +12,7 @@
   register: kube_task_result
   until: kube_task_result.rc == 0
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   changed_when: false
 
 - name: Write kube-apiserver manifest
diff --git a/roles/kubernetes/preinstall/tasks/main.yml b/roles/kubernetes/preinstall/tasks/main.yml
index 8c22b73bf..8c2aecec5 100644
--- a/roles/kubernetes/preinstall/tasks/main.yml
+++ b/roles/kubernetes/preinstall/tasks/main.yml
@@ -104,7 +104,7 @@
   register: pkgs_task_result
   until: pkgs_task_result|success
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   with_items: "{{required_pkgs | default([]) | union(common_required_pkgs|default([]))}}"
   when: ansible_os_family != "CoreOS"
 
diff --git a/roles/network_plugin/calico/tasks/main.yml b/roles/network_plugin/calico/tasks/main.yml
index 1ce6c79d3..ff7bc32ae 100644
--- a/roles/network_plugin/calico/tasks/main.yml
+++ b/roles/network_plugin/calico/tasks/main.yml
@@ -48,7 +48,7 @@
   register: cni_task_result
   until: cni_task_result.rc == 0
   retries: 4
-  delay: "{{ 20 | random + 3 }}"
+  delay: "{{ retry_stagger | random + 3 }}"
   changed_when: false
   when: use_hyperkube_cni
 
-- 
GitLab