From 3311e0a296ce544816a3b1ba1f5bd7706f9871b0 Mon Sep 17 00:00:00 2001
From: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>
Date: Mon, 26 Jun 2023 09:57:08 +0200
Subject: [PATCH] tests: cleanup stale packet namespace automatically (#10245)

* tests: cleanup stale packet namespace automatically

Cancelled jobs on GitLab can leave stale VMs behind, as the delete
playbook will never be executed. This commit allows removing old VMs by
getting all the namespaces created from the same branch with an older
pipeline id.

Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>

* tests: cleanup stale packet namespace after 2 hours

This ensures that no packet namespace remains for more than 2 hours.
All the jobs usually complete within 30 minutes to 1 hour, so 2 hours
is enough to detect a stale namespace.

Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>

* tests: ignore vm cleanup failure

Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>

* tests: use pipeline_id var instead of fetching namespace for cleanup packet vm

Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>

---------

Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@proton.ch>
---
 .gitlab-ci/packet.yml                           |  8 ++++++++
 tests/Makefile                                  |  8 ++++++++
 tests/cloud_playbooks/cleanup-packet.yml        |  7 +++++++
 .../roles/cleanup-packet-ci/tasks/main.yml      | 16 ++++++++++++++++
 .../roles/packet-ci/tasks/cleanup-old-vms.yml   | 17 +++++++++++++++++
 .../roles/packet-ci/tasks/create-vms.yml        |  4 +++-
 .../roles/packet-ci/tasks/main.yml              |  2 ++
 7 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 tests/cloud_playbooks/cleanup-packet.yml
 create mode 100644 tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml
 create mode 100644 tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml

diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml
index edf8ebcdb..b6246b6fa 100644
--- a/.gitlab-ci/packet.yml
+++ b/.gitlab-ci/packet.yml
@@ -23,6 +23,14 @@
   allow_failure: true
   extends: .packet
 
+packet_cleanup_old:
+  stage: deploy-part1
+  extends: .packet_periodic
+  script:
+    - cd tests
+    - make cleanup-packet
+  after_script: []
+
 # The ubuntu20-calico-aio jobs are meant as early stages to prevent running the full CI if something is horribly broken
 packet_ubuntu20-calico-aio:
   stage: deploy-part1
diff --git a/tests/Makefile b/tests/Makefile
index 787449e5b..c9f561eee 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -64,6 +64,8 @@ create-packet: init-packet
 	$(ANSIBLE_LOG_LEVEL) \
 	-e @"files/${CI_JOB_NAME}.yml" \
 	-e test_id=$(TEST_ID) \
+	-e branch="$(CI_COMMIT_BRANCH)" \
+	-e pipeline_id="$(CI_PIPELINE_ID)" \
 	-e inventory_path=$(INVENTORY)
 
 delete-packet:
@@ -71,8 +73,14 @@ delete-packet:
 	$(ANSIBLE_LOG_LEVEL) \
 	-e @"files/${CI_JOB_NAME}.yml" \
 	-e test_id=$(TEST_ID) \
+	-e branch="$(CI_COMMIT_BRANCH)" \
+	-e pipeline_id="$(CI_PIPELINE_ID)" \
 	-e inventory_path=$(INVENTORY)
 
+cleanup-packet:
+	ansible-playbook cloud_playbooks/cleanup-packet.yml -c local \
+	$(ANSIBLE_LOG_LEVEL)
+
 create-vagrant:
 	vagrant up
 	find / -name vagrant_ansible_inventory
diff --git a/tests/cloud_playbooks/cleanup-packet.yml b/tests/cloud_playbooks/cleanup-packet.yml
new file mode 100644
index 000000000..b709d6d0d
--- /dev/null
+++ b/tests/cloud_playbooks/cleanup-packet.yml
@@ -0,0 +1,7 @@
+---
+
+- hosts: localhost
+  gather_facts: no
+  become: true
+  roles:
+    - { role: cleanup-packet-ci }
diff --git a/tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml b/tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml
new file mode 100644
index 000000000..9256b2d54
--- /dev/null
+++ b/tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml
@@ -0,0 +1,16 @@
+---
+
+- name: Fetch a list of namespaces
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Namespace
+    label_selectors:
+      - cijobs = true
+  register: namespaces
+
+# creationTimestamp ends in "Z" (UTC) and is parsed as a naive datetime, so compare against the UTC clock, not the runner's local time.
+- name: Delete stale namespaces for more than 2 hours
+  command: "kubectl delete namespace {{ item.metadata.name }}"
+  failed_when: false
+  loop: "{{ namespaces.resources }}"
+  when: (now(utc=true) - (item.metadata.creationTimestamp | to_datetime("%Y-%m-%dT%H:%M:%SZ"))).total_seconds() >= 7200
diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml
new file mode 100644
index 000000000..cf81e81b5
--- /dev/null
+++ b/tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml
@@ -0,0 +1,17 @@
+---
+
+- name: Fetch a list of namespaces
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Namespace
+    label_selectors:
+      - cijobs = true
+      - branch = {{ branch }}
+  register: namespaces
+
+- name: Delete older namespaces
+  command: "kubectl delete namespace {{ item.metadata.name }}"
+  failed_when: false
+  loop: "{{ namespaces.resources }}"
+  when:
+    - (item.metadata.labels.pipeline_id | int) < (pipeline_id | int)
diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml
index 4f0a66844..8ccf5adc5 100644
--- a/tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml
+++ b/tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml
@@ -1,7 +1,9 @@
 ---
 
 - name: "Create CI namespace {{ test_name }} for test vms"
-  command: "kubectl create namespace {{ test_name }}"
+  shell: |-
+    kubectl create namespace {{ test_name }} &&
+      kubectl label namespace {{ test_name }} cijobs=true branch="{{ branch }}" pipeline_id="{{ pipeline_id }}"
   changed_when: false
 
 - name: "Create temp dir /tmp/{{ test_name }} for CI files"
diff --git a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
index bf4e974e3..9d8e105db 100644
--- a/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
+++ b/tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
@@ -7,6 +7,8 @@
   set_fact:
     vm_count: "{%- if mode in ['separate', 'separate-scale', 'ha', 'ha-scale', 'ha-recover', 'ha-recover-noquorum'] -%}{{ 3|int }}{%- elif mode == 'aio' -%}{{ 1|int }}{%- else -%}{{ 2|int }}{%- endif -%}"
 
+- import_tasks: cleanup-old-vms.yml
+
 - import_tasks: create-vms.yml
   when:
     - not vm_cleanup
-- 
GitLab