diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index faea911c80c03ab4c0df22002865335835445abd..436fdab41a773bc80fab42503ceedacb4719c1be 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,8 @@ variables:
   RESET_CHECK: "false"
   UPGRADE_TEST: "false"
   LOG_LEVEL: "-vv"
+  RECOVER_CONTROL_PLANE_TEST: "false"
+  RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"
 
 before_script:
   - ./tests/scripts/rebase.sh
diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml
index 9aa398ee3a983ce22972c19d6bd9a55c1851773f..86164c392ae934824364e08ecef4c6dfcc0b896e 100644
--- a/.gitlab-ci/packet.yml
+++ b/.gitlab-ci/packet.yml
@@ -124,3 +124,19 @@ packet_amazon-linux-2-aio:
   stage: deploy-part2
   extends: .packet
   when: manual
+
+packet_ubuntu18-calico-ha-recover:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:],kube-master[1:]"
+
+packet_ubuntu18-calico-ha-recover-noquorum:
+  stage: deploy-part2
+  extends: .packet
+  when: on_success
+  variables:
+    RECOVER_CONTROL_PLANE_TEST: "true"
+    RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[1:],kube-master[1:]"
diff --git a/docs/recover-control-plane.md b/docs/recover-control-plane.md
index 90f7895897b72e691065876012ede76f9a972ead..d24a4c73ab5e359e176ed119d213cf1f9bb879a8 100644
--- a/docs/recover-control-plane.md
+++ b/docs/recover-control-plane.md
@@ -17,37 +17,23 @@ Examples of what broken means in this context:
 
 __Note that you need at least one functional node to be able to recover using this method.__
 
-## If etcd quorum is intact
+## Runbook
 
-* Set the etcd member names of the broken node(s) in the variable "old\_etcd\_members", this variable is used to remove the broken nodes from the etcd cluster.
-```old_etcd_members=etcd2,etcd3```
-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+* Move any broken etcd nodes into the "broken\_etcd" group and make sure the "etcd\_member\_name" variable is set.
+* Move any broken master nodes into the "broken\_kube-master" group.
 
-Then run the playbook with ```--limit etcd,kube-master```
+Then run the playbook with ```--limit etcd,kube-master``` and increase the number of etcd retries by setting ```-e etcd_retries=10``` or something even larger. The number of retries required is difficult to predict.
 
-When finished you should have a fully working and highly available control plane again.
+When finished you should have a fully working control plane again.
 
-## If etcd quorum is lost
+## Recover from lost quorum
 
-* If you reuse identities for your etcd nodes add the inventory names for those nodes to the variable "old\_etcds". This will remove any previously generated certificates for those nodes.
-```old_etcds=etcd2.example.com,etcd3.example.com```
-* If you would like to remove the broken node objects from the kubernetes cluster add their inventory names to the variable "old\_kube\_masters"
-```old_kube_masters=master2.example.com,master3.example.com```
+The playbook attempts to figure out if the etcd quorum is intact. If quorum is lost it will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot set the path to that snapshot in the "etcd\_snapshot" variable.
 
-Then run the playbook with ```--limit etcd,kube-master```
-
-When finished you should have a fully working and highly available control plane again.
-
-The playbook will attempt to take a snapshot from the first node in the "etcd" group and restore from that. If you would like to restore from an alternate snapshot set the path to that snapshot in the "etcd\_snapshot" variable.
-
-```etcd_snapshot=/tmp/etcd_snapshot```
+```-e etcd_snapshot=/tmp/etcd_snapshot```
 
 ## Caveats
 
-* The playbook has only been tested on control planes where the etcd and kube-master nodes are the same, the playbook will warn if run on a cluster with separate etcd and kube-master nodes.
 * The playbook has only been tested with fairly small etcd databases.
 * If your new control plane nodes have new ip addresses you may have to change settings in various places.
 * There may be disruptions while running the playbook.
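For reference, the runbook above translates to an inventory roughly like the following sketch. All host names, the member name and the inventory path are hypothetical: node1/node2 survive, node3 is broken, node4 is its freshly provisioned replacement.

```bash
# Hypothetical recovery inventory for the runbook in docs/recover-control-plane.md.
cat > inventory/recover.ini <<'EOF'
[etcd]
node1
node2
node4

[kube-master]
node1
node2
node4

[broken_etcd]
node3 etcd_member_name=etcd3

[broken_kube-master]
node3
EOF

# Run the recovery limited to the control plane groups, with a generous retry budget.
ansible-playbook -i inventory/recover.ini --limit etcd,kube-master \
  -e etcd_retries=10 recover-control-plane.yml
```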
diff --git a/recover-control-plane.yml b/recover-control-plane.yml
index cd6482efb3386d4050be03b1b57c5d006909400f..cd6bfde2b641b387cc2d3664cc0dcce1980a00fe 100644
--- a/recover-control-plane.yml
+++ b/recover-control-plane.yml
@@ -22,7 +22,6 @@
 - hosts: "{{ groups['etcd'] | first }}"
   roles:
     - { role: kubespray-defaults}
-    - { role: recover_control_plane/pre-recover }
     - { role: recover_control_plane/etcd }
 
 - hosts: "{{ groups['kube-master'] | first }}"
diff --git a/roles/etcd/defaults/main.yml b/roles/etcd/defaults/main.yml
index 48a68b61cec5e74e03488567e19efcc5af237271..ac38f6d7fc70c4d77e7aa16a5977ba04f0101c59 100644
--- a/roles/etcd/defaults/main.yml
+++ b/roles/etcd/defaults/main.yml
@@ -62,3 +62,6 @@ etcd_secure_client: true
 
 # Enable peer client cert authentication
 etcd_peer_client_auth: true
+
+# Number of loop retries
+etcd_retries: 4
diff --git a/roles/etcd/tasks/configure.yml b/roles/etcd/tasks/configure.yml
index e3f9c31dd6c32bb2be52aec66e99e3620813322e..d87917176c5c4743eb934b620dff337eba40dc10 100644
--- a/roles/etcd/tasks/configure.yml
+++ b/roles/etcd/tasks/configure.yml
@@ -67,7 +67,7 @@
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_client_url }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_cluster_is_healthy
   until: etcd_cluster_is_healthy.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   ignore_errors: false
   changed_when: false
@@ -88,7 +88,7 @@
   shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_client_url }} cluster-health | grep -q 'cluster is healthy'"
   register: etcd_events_cluster_is_healthy
   until: etcd_events_cluster_is_healthy.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   ignore_errors: false
   changed_when: false
diff --git a/roles/etcd/tasks/install_docker.yml b/roles/etcd/tasks/install_docker.yml
index 7859134b013cbcec4ff63391e9e4033d5c63fbfa..6c38ad9f3e88f9d8a47ca79352391875aaff09a0 100644
--- a/roles/etcd/tasks/install_docker.yml
+++ b/roles/etcd/tasks/install_docker.yml
@@ -6,7 +6,7 @@
     {{ docker_bin_dir }}/docker rm -f etcdctl-binarycopy"
   register: etcd_task_result
   until: etcd_task_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   changed_when: false
   when: etcd_cluster_setup
diff --git a/roles/etcd/tasks/join_etcd-events_member.yml b/roles/etcd/tasks/join_etcd-events_member.yml
index b75460c418fa6acd7529f93b0ef91eb0a1a7ab96..0f214302e99df3a8c7251bbe3bd56b799424f45b 100644
--- a/roles/etcd/tasks/join_etcd-events_member.yml
+++ b/roles/etcd/tasks/join_etcd-events_member.yml
@@ -3,7 +3,7 @@
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} member add {{ etcd_member_name }} {{ etcd_events_peer_url }}"
   register: member_add_result
   until: member_add_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   when: target_node == inventory_hostname
   environment:
diff --git a/roles/etcd/tasks/join_etcd_member.yml b/roles/etcd/tasks/join_etcd_member.yml
index d512eb78a3b238b0725b86253cb0adf645364b7f..928d22642b2b1fa372ae581c0e40410e1a481aa9 100644
--- a/roles/etcd/tasks/join_etcd_member.yml
+++ b/roles/etcd/tasks/join_etcd_member.yml
@@ -3,7 +3,7 @@
   shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}"
   register: member_add_result
   until: member_add_result.rc == 0
-  retries: 4
+  retries: "{{ etcd_retries }}"
   delay: "{{ retry_stagger | random + 3 }}"
   when: target_node == inventory_hostname
   environment:
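All of the hard-coded `retries: 4` loops above now honor `etcd_retries`. As a rough, hypothetical shell equivalent of what Ansible's `until`/`retries`/`delay` combination does for the cluster-health check (the endpoint address is made up, and `retry_stagger` is assumed to be 5):

```bash
# Approximate behaviour of:
#   until: etcd_cluster_is_healthy.rc == 0
#   retries: "{{ etcd_retries }}"
#   delay: "{{ retry_stagger | random + 3 }}"
retries="${ETCD_RETRIES:-4}"
for attempt in $(seq 1 "$retries"); do
  if etcdctl --no-sync --endpoints="https://10.0.0.1:2379" cluster-health \
      | grep -q 'cluster is healthy'; then
    break
  fi
  # "retry_stagger | random + 3" sleeps between 3 and retry_stagger+3 seconds
  sleep $(( (RANDOM % 5) + 3 ))
done
```

Raising `etcd_retries` therefore stretches the total time each health or join loop is allowed to take, which is what a recovering cluster needs.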
diff --git a/roles/recover_control_plane/etcd/tasks/main.yml b/roles/recover_control_plane/etcd/tasks/main.yml
index d1d2d1fa582a17878479218a5e2f02fc3989c78a..92c275a1f0951b4a4d117ce88ba0cdb133cb7a5e 100644
--- a/roles/recover_control_plane/etcd/tasks/main.yml
+++ b/roles/recover_control_plane/etcd/tasks/main.yml
@@ -1,7 +1,78 @@
 ---
-- include_tasks: prepare.yml
+- name: Get etcd endpoint health
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} endpoint health"
+  register: etcd_endpoint_health
+  ignore_errors: true
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+
+- name: Set healthy fact
+  set_fact:
+    healthy: "{{ etcd_endpoint_health.stderr | match('Error: unhealthy cluster') }}"
+  when:
+    - groups['broken_etcd']
+
+- name: Set has_quorum fact
+  set_fact:
+    has_quorum: "{{ etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length >= etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length }}"
 
 - include_tasks: recover_lost_quorum.yml
   when:
-    - has_etcdctl
-    - not etcd_cluster_is_healthy
+    - groups['broken_etcd']
+    - not has_quorum
+
+- name: Remove etcd data dir
+  file:
+    path: "{{ etcd_data_dir }}"
+    state: absent
+  delegate_to: "{{ item }}"
+  with_items: "{{ groups['broken_etcd'] }}"
+  when:
+    - groups['broken_etcd']
+    - has_quorum
+
+- name: Delete old certificates
+  # noqa 302 - rm is ok here for now
+  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
+  with_items: "{{ groups['broken_etcd'] }}"
+  register: delete_old_certificates
+  ignore_errors: true
+  when: groups['broken_etcd']
+
+- name: Fail if unable to delete old certificates
+  fail:
+    msg: "Unable to delete old certificates for: {{ item.item }}"
+  loop: "{{ delete_old_certificates.results }}"
+  changed_when: false
+  when:
+    - groups['broken_etcd']
+    - "item.rc != 0 and not 'No such file or directory' in item.stderr"
+
+- name: Get etcd cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem member list"
+  register: member_list
+  changed_when: false
+  check_mode: no
+  environment:
+    - ETCDCTL_API: 3
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+
+- name: Remove broken cluster members
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
+  environment:
+    - ETCDCTL_API: 3
+  with_nested:
+    - "{{ groups['broken_etcd'] }}"
+    - "{{ member_list.stdout_lines }}"
+  when:
+    - groups['broken_etcd']
+    - not healthy
+    - has_quorum
+    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]
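The member removal above keys off the comma-separated `member list` output. A shell sketch of the same parsing, with a made-up member ID, name and URLs:

```bash
# etcdctl v3 "member list" prints lines shaped like:
#   <id>, <status>, <name>, <peer URL>, <client URL>
line='1609b5a3a1a5a594, started, etcd2, https://10.0.0.2:2380, https://10.0.0.2:2379'

# item[1].replace(' ','').split(',')[0] -> field 1: the member ID to remove
member_id="$(echo "$line" | tr -d ' ' | cut -d, -f1)"
# item[1].replace(' ','').split(',')[2] -> field 3: the name matched against etcd_member_name
member_name="$(echo "$line" | tr -d ' ' | cut -d, -f3)"

ETCDCTL_API=3 etcdctl member remove "$member_id"
```

This is why the docs insist that every host in `broken_etcd` carries an `etcd_member_name` variable: it is the join key between the inventory and the live member list.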
}}-key.pem member list" + register: member_list + changed_when: false + check_mode: no + environment: + - ETCDCTL_API: 3 + when: + - groups['broken_etcd'] + - not healthy + - has_quorum + +- name: Remove broken cluster members + shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}" + environment: + - ETCDCTL_API: 3 + with_nested: + - "{{ groups['broken_etcd'] }}" + - "{{ member_list.stdout_lines }}" + when: + - groups['broken_etcd'] + - not healthy + - has_quorum + - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2] diff --git a/roles/recover_control_plane/etcd/tasks/prepare.yml b/roles/recover_control_plane/etcd/tasks/prepare.yml deleted file mode 100644 index d3cacb9345b8657fc2187d7b2986b4fa4a368a66..0000000000000000000000000000000000000000 --- a/roles/recover_control_plane/etcd/tasks/prepare.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -- name: Delete old certificates - # noqa 302 - rm is ok here for now - shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*" - with_items: "{{ old_etcds.split(',') }}" - register: delete_old_cerificates - ignore_errors: true - when: old_etcds is defined - -- name: Fail if unable to delete old certificates - fail: - msg: "Unable to delete old certificates for: {{ item.item }}" - loop: "{{ delete_old_cerificates.results }}" - changed_when: false - when: - - old_etcds is defined - - "item.rc != 0 and not 'No such file or directory' in item.stderr" - -- name: Get etcd cluster members - shell: "{{ bin_dir }}/etcdctl member list" - register: member_list - changed_when: false - check_mode: no - environment: - - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem" - when: - - has_etcdctl - - etcd_cluster_is_healthy - - old_etcd_members is defined - -- name: Remove old cluster members - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}" - environment: - - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem - - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem" - - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem" - with_nested: - - "{{ old_etcd_members.split(',') }}" - - "{{ member_list.stdout_lines }}" - when: - - has_etcdctl - - etcd_cluster_is_healthy - - old_etcd_members is defined - - item[0] == item[1].replace(' ','').split(',')[2] diff --git a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml index beb8b0daf9c82862d6006e9a3ea51755c2477053..fdd9d0b5fe62e94e53faab44c2a51dcaa30894d6 100644 --- a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml +++ b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml @@ -1,11 +1,8 @@ --- - name: Save etcd snapshot - shell: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db" + shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot save /tmp/snapshot.db" environment: - ETCDCTL_API: 3 - - ETCDCTL_CA_FILE: 
diff --git a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
index beb8b0daf9c82862d6006e9a3ea51755c2477053..fdd9d0b5fe62e94e53faab44c2a51dcaa30894d6 100644
--- a/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
+++ b/roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
@@ -1,11 +1,8 @@
 ---
 - name: Save etcd snapshot
-  shell: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot save /tmp/snapshot.db"
   environment:
     - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
   when: etcd_snapshot is not defined
 
 - name: Transfer etcd snapshot to host
@@ -25,12 +22,9 @@
     state: absent
 
 - name: Restore etcd snapshot
-  shell: "{{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}"
+  shell: "{{ bin_dir }}/etcdctl --cacert {{ etcd_cert_dir }}/ca.pem --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem snapshot restore /tmp/snapshot.db --name {{ etcd_member_name }} --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }} --initial-cluster-token k8s_etcd --initial-advertise-peer-urls {{ etcd_peer_url }} --data-dir {{ etcd_data_dir }}"
   environment:
     - ETCDCTL_API: 3
-    - ETCDCTL_CA_FILE: /etc/ssl/etcd/ssl/ca.pem
-    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
-    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
 
 - name: Remove etcd snapshot
   file:
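Condensed, the lost-quorum path performs an etcd v3 snapshot round trip. A sketch with example member name, URLs, certificate paths and data dir standing in for the templated values:

```bash
# Take a snapshot on the first surviving etcd node
# (skipped when an etcd_snapshot path is supplied with -e).
ETCDCTL_API=3 etcdctl \
  --cacert /etc/ssl/etcd/ssl/ca.pem \
  --cert /etc/ssl/etcd/ssl/admin-node1.pem \
  --key /etc/ssl/etcd/ssl/admin-node1-key.pem \
  snapshot save /tmp/snapshot.db

# Rebuild a fresh single-member cluster from that snapshot; the other
# members are then re-joined by the regular etcd role.
ETCDCTL_API=3 etcdctl snapshot restore /tmp/snapshot.db \
  --name etcd1 \
  --initial-cluster etcd1=https://10.0.0.1:2380 \
  --initial-cluster-token k8s_etcd \
  --initial-advertise-peer-urls https://10.0.0.1:2380 \
  --data-dir /var/lib/etcd
```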
"test -e {{ bin_dir }}/etcdctl" - register: test_etcdctl - -- name: Set has_etcdctl fact - set_fact: - has_etcdctl: "{{ test_etcdctl.rc == 0 | bool }}" - -- name: Check if etcd cluster is healthy - shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'" - register: etcd_cluster_health - ignore_errors: true - changed_when: false - check_mode: no - environment: - ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem" - ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem" - ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem" - when: has_etcdctl - -- name: Set etcd_cluster_is_healthy fact - set_fact: - etcd_cluster_is_healthy: "{{ etcd_cluster_health.rc == 0 | bool }}" - -- name: Abort if etcd cluster is healthy and old_etcd_members is undefined - assert: - that: "{{ old_etcd_members is defined }}" - msg: "'old_etcd_members' must be defined when the etcd cluster has quorum." - when: etcd_cluster_is_healthy - -- name: Warn for untested recovery - debug: - msg: Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk. - delay: 30 - when: not control_plane_is_converged diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml index 6f7d0cdf6b4e9b32ed1e8971e9aa48eef2b56e9e..bf4e974e3a5714c91b6e86524ceeb243d69b5eda 100644 --- a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml +++ b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml @@ -5,7 +5,7 @@ - name: Set VM count needed for CI test_id set_fact: - vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}" + vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale', 'ha-recover', 'ha-recover-noquorum'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}" - import_tasks: create-vms.yml when: diff --git a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 index 82293e0cdec3ae4530efc9fa9c7b15e6ba616695..b842c97a7b83e4f61d5f7565e97cdf0327154e12 100644 --- a/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 +++ b/tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 @@ -45,6 +45,45 @@ instance-1 [vault] instance-1 +{% elif mode == "ha-recover" %} +[kube-master] +instance-1 +instance-2 + +[kube-node] +instance-3 + +[etcd] +instance-3 +instance-1 +instance-2 + +[broken_kube-master] +instance-2 + +[broken_etcd] +instance-2 etcd_member_name=etcd3 +{% elif mode == "ha-recover-noquorum" %} +[kube-master] +instance-3 +instance-1 +instance-2 + +[kube-node] +instance-3 + +[etcd] +instance-3 +instance-1 +instance-2 + +[broken_kube-master] +instance-1 +instance-2 + +[broken_etcd] +instance-1 etcd_member_name=etcd2 +instance-2 etcd_member_name=etcd3 {% endif %} [k8s-cluster:children] diff --git a/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml new file mode 100644 index 0000000000000000000000000000000000000000..a011af01f716baa15fa244811383d8787cadcbf5 --- /dev/null +++ b/tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml @@ -0,0 +1,10 @@ +--- +# Instance settings +cloud_image: ubuntu-1804 +mode: ha-recover-noquorum +vm_memory: 1600Mi + +# Kubespray settings +kube_network_plugin: calico +deploy_netchecker: true +dns_min_replicas: 1 diff 
diff --git a/tests/files/packet_ubuntu18-calico-ha-recover.yml b/tests/files/packet_ubuntu18-calico-ha-recover.yml
new file mode 100644
index 0000000000000000000000000000000000000000..079440a30cabe570c0d442806f36133e467258f6
--- /dev/null
+++ b/tests/files/packet_ubuntu18-calico-ha-recover.yml
@@ -0,0 +1,10 @@
+---
+# Instance settings
+cloud_image: ubuntu-1804
+mode: ha-recover
+vm_memory: 1600Mi
+
+# Kubespray settings
+kube_network_plugin: calico
+deploy_netchecker: true
+dns_min_replicas: 1
diff --git a/tests/scripts/testcases_run.sh b/tests/scripts/testcases_run.sh
index 69782f8628456e965ca1de813eb6610c28b972e2..81df1a1292caab2be767504f9c5820096574a7a8 100755
--- a/tests/scripts/testcases_run.sh
+++ b/tests/scripts/testcases_run.sh
@@ -47,6 +47,12 @@ if [ "${UPGRADE_TEST}" != "false" ]; then
   ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" $PLAYBOOK
 fi
 
+# Test control plane recovery
+if [ "${RECOVER_CONTROL_PLANE_TEST}" != "false" ]; then
+  ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} --limit "${RECOVER_CONTROL_PLANE_TEST_GROUPS}:!fake_hosts" -e reset_confirmation=yes reset.yml
+  ansible-playbook ${LOG_LEVEL} -e @${CI_TEST_VARS} -e local_release_dir=${PWD}/downloads -e ansible_python_interpreter=${PYPATH} -e etcd_retries=10 --limit etcd,kube-master:!fake_hosts recover-control-plane.yml
+fi
+
 # Tests Cases
 ## Test Master API
 ansible-playbook -e ansible_python_interpreter=${PYPATH} --limit "all:!fake_hosts" tests/testcases/010_check-apiserver.yml $LOG_LEVEL
diff --git a/tests/templates/inventory-aws.j2 b/tests/templates/inventory-aws.j2
index 92f107f653d198d2c3df9611660d13f3ebd2f662..3ed86eb963ef25aa0e24df35ade63db2c8a155b6 100644
--- a/tests/templates/inventory-aws.j2
+++ b/tests/templates/inventory-aws.j2
@@ -25,3 +25,9 @@ kube-master
 calico-rr
 
 [calico-rr]
+
+[broken_kube-master]
+node2
+
+[broken_etcd]
+node2
diff --git a/tests/templates/inventory-do.j2 b/tests/templates/inventory-do.j2
index 83a749afcfe1f09be2ec14e8cc76ff35f3a04cdb..ab7d95220b9ff80216bcdf223e6e2ed6712152f3 100644
--- a/tests/templates/inventory-do.j2
+++ b/tests/templates/inventory-do.j2
@@ -29,6 +29,12 @@
 [vault]
 {{droplets.results[1].droplet.name}}
 {{droplets.results[2].droplet.name}}
+
+[broken_kube-master]
+{{droplets.results[1].droplet.name}}
+
+[broken_etcd]
+{{droplets.results[2].droplet.name}}
 {% else %}
 [kube-master]
 {{droplets.results[0].droplet.name}}
diff --git a/tests/templates/inventory-gce.j2 b/tests/templates/inventory-gce.j2
index 503bb40914f95e87cf3c84faac093e608cc49c53..55f67deecccd581fe77a3b0a7a3e18fed8b486e4 100644
--- a/tests/templates/inventory-gce.j2
+++ b/tests/templates/inventory-gce.j2
@@ -37,6 +37,13 @@
 {{node1}}
 {{node2}}
 {{node3}}
+
+[broken_kube-master]
+{{node2}}
+
+[broken_etcd]
+{{node2}}
+{{node3}}
 {% elif mode == "default" %}
 [kube-master]
 {{node1}}
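Putting the pieces together, the new CI job effectively does the following two steps. The inventory path is hypothetical; the group pattern is the `ha-recover` value from `.gitlab-ci.yml` above, and the extra `-e` flags the CI passes (test vars, python interpreter, fake-host exclusions) are omitted for brevity.

```bash
RECOVER_CONTROL_PLANE_TEST_GROUPS="etcd[2:],kube-master[1:]"

# 1. Break the selected control plane nodes.
ansible-playbook -i inventory.ini \
  --limit "${RECOVER_CONTROL_PLANE_TEST_GROUPS}:!fake_hosts" \
  -e reset_confirmation=yes reset.yml

# 2. Recover the control plane with a generous retry budget.
ansible-playbook -i inventory.ini \
  --limit "etcd,kube-master:!fake_hosts" \
  -e etcd_retries=10 recover-control-plane.yml
```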