From 61d88b8db262fd10f0645601e5e748414d0f1a36 Mon Sep 17 00:00:00 2001
From: Erwan Miran <mirwan@users.noreply.github.com>
Date: Fri, 25 Jan 2019 17:14:22 +0100
Subject: [PATCH] Fix random failure in debug: var=result.content|from_json
 (#4094)

* Fix random failure in debug: var=result.content|from_json

* netchecker agents are deployed on all k8s-cluster group members

* Reducing limits/requests is not enough; switch to n1-standard-2

* gce_centos7 needs more CPU
---
 tests/files/gce_centos7-calico-ha.yml      |  2 +-
 tests/files/gce_centos7-flannel-addons.yml |  4 +-
 tests/files/gce_centos7-kube-router.yml    |  2 +-
 tests/testcases/040_check-network-adv.yml  | 57 ++++++++++++++++++++--
 4 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/tests/files/gce_centos7-calico-ha.yml b/tests/files/gce_centos7-calico-ha.yml
index 8eec638ba..d45d75b8e 100644
--- a/tests/files/gce_centos7-calico-ha.yml
+++ b/tests/files/gce_centos7-calico-ha.yml
@@ -1,7 +1,7 @@
 # Instance settings
 cloud_image_family: centos-7
 cloud_region: us-central1-c
-cloud_machine_type: "n1-standard-1"
+cloud_machine_type: "n1-standard-2"  # bumped from n1-standard-1: the gce_centos7 CI jobs need more CPU
 mode: ha
 
 # Deployment settings
diff --git a/tests/files/gce_centos7-flannel-addons.yml b/tests/files/gce_centos7-flannel-addons.yml
index 5432e5488..98702cf90 100644
--- a/tests/files/gce_centos7-flannel-addons.yml
+++ b/tests/files/gce_centos7-flannel-addons.yml
@@ -1,7 +1,7 @@
 # Instance settings
 cloud_image_family: centos-7
 cloud_region: us-central1-c
-cloud_machine_type: "n1-standard-1"
+cloud_machine_type: "n1-standard-2"  # bumped from n1-standard-1: the gce_centos7 CI jobs need more CPU
 mode: ha
 
 # Deployment settings
@@ -15,7 +15,7 @@ deploy_netchecker: true
 dns_min_replicas: 1
 cloud_provider: gce
 kube_encrypt_secret_data: true
-ingress_nginx_enabled: true
+# ingress_nginx_enabled: true  # NOTE(review): disabled by this change, reason not recorded - presumably CI load; confirm
 cert_manager_enabled: true
 metrics_server_enabled: true
 kube_token_auth: true
diff --git a/tests/files/gce_centos7-kube-router.yml b/tests/files/gce_centos7-kube-router.yml
index b433375a5..a7349ad4c 100644
--- a/tests/files/gce_centos7-kube-router.yml
+++ b/tests/files/gce_centos7-kube-router.yml
@@ -1,7 +1,7 @@
 # Instance settings
 cloud_image_family: centos-7
 cloud_region: us-central1-c
-cloud_machine_type: "n1-standard-1"
+cloud_machine_type: "n1-standard-2"  # bumped from n1-standard-1: the gce_centos7 CI jobs need more CPU
 mode: default
 
 # Deployment settings
diff --git a/tests/testcases/040_check-network-adv.yml b/tests/testcases/040_check-network-adv.yml
index 5dfebe4f1..f95843e63 100644
--- a/tests/testcases/040_check-network-adv.yml
+++ b/tests/testcases/040_check-network-adv.yml
@@ -37,9 +37,23 @@
       run_once: true
       delegate_to: "{{groups['kube-master'][0]}}"
       register: nca_pod
-      until: nca_pod.stdout_lines|length >= groups['kube-node']|intersect(play_hosts)|length * 2
+      until: nca_pod.stdout_lines|length >= groups['k8s-cluster']|intersect(play_hosts)|length * 2
       retries: 3
       delay: 10
+      failed_when: false
+
+    - command: "{{ bin_dir }}/kubectl -n {{netcheck_namespace}} describe pod -l app={{ item }}"
+      run_once: true  # one describe per label; not repeated for every play host
+      delegate_to: "{{groups['kube-master'][0]}}"  # executed on the first kube-master
+      no_log: false  # describe output stays visible in the play log
+      with_items:  # app label values substituted into the -l selector above
+        - netchecker-agent
+        - netchecker-agent-hostnet
+      when: not nca_pod is success  # diagnostics only: skipped when the registered nca_pod result succeeded
+
+    - debug: var=nca_pod.stdout_lines  # print the stdout lines captured in the registered nca_pod result
+      failed_when: not nca_pod is success  # marks this task failed when nca_pod did not succeed
+      run_once: true
 
     - name: Get netchecker agents
       uri: url=http://{{ ansible_default_ipv4.address }}:{{netchecker_port}}/api/v1/agents/ return_content=yes
@@ -50,7 +64,7 @@
       delay: "{{ agent_report_interval }}"
       until: agents.content|length > 0 and
         agents.content[0] == '{' and
-        agents.content|from_json|length >= groups['kube-node']|intersect(play_hosts)|length * 2
+        agents.content|from_json|length >= groups['k8s-cluster']|intersect(play_hosts)|length * 2
       failed_when: false
       no_log: true
 
@@ -65,16 +79,51 @@
       register: result
       retries: 3
       delay: "{{ agent_report_interval }}"
+      until: result.content|length > 0 and
+        result.content[0] == '{'
       no_log: true
       failed_when: false
       when:
         - agents.content != '{}'
 
+    - debug: var=ncs_pod  # NOTE(review): ncs_pod is registered outside this hunk - presumably the netchecker-server pod; confirm
+      run_once: true
+      when: not result is success  # diagnostics only: shown when the registered result did not succeed
+
+    - command: "{{ bin_dir }}/kubectl -n kube-system logs -l k8s-app=kube-proxy"
+      run_once: true  # fetch the kube-proxy pod logs once, as failure diagnostics
+      when: not result is success  # only when the registered result did not succeed
+      delegate_to: "{{groups['kube-master'][0]}}"  # executed on the first kube-master
+      no_log: false  # log output stays visible in the play log
+
+    - command: "{{ bin_dir }}/kubectl -n kube-system logs -l k8s-app={{item}} --all-containers"
+      run_once: true  # one kubectl call per label below; not repeated for every play host
+      when: not result is success  # diagnostics only: skipped when the registered result succeeded
+      delegate_to: "{{groups['kube-master'][0]}}"  # executed on the first kube-master
+      no_log: false  # log output stays visible in the play log
+      with_items:  # k8s-app label values substituted into the -l selector above
+        - kube-router
+        - flannel
+        - contiv-ovs
+        - contiv-netplugin
+        - contiv-netmaster
+        - canal-node
+        - calico-node
+        - cilium
+
     - debug: var=result.content|from_json
       failed_when: not result is success
       run_once: true
-      when: not agents.content == '{}'
-      delegate_to: "{{groups['kube-master'][0]}}"
+      when:  # parse the body only when it is non-empty and starts like a JSON object
+        - not agents.content == '{}'
+        - result.content|default('')|length > 0 and result.content[0] == '{'  # guard: ''[0] is undefined
+
+    - debug: var=result  # dump the raw registered result when the body is not a JSON object
+      failed_when: not result is success
+      run_once: true
+      when:
+        - not agents.content == '{}'
+        - result.content|default('')|length == 0 or result.content[0] != '{'  # empty body counts as non-JSON
 
     - debug: msg="Cannot get reports from agents, consider as PASSING"
       run_once: true
-- 
GitLab