From b7692fad09e0855c0c02619c6b47bab9ffda1b08 Mon Sep 17 00:00:00 2001
From: Bogdan Dobrelya <bdobrelia@mirantis.com>
Date: Fri, 30 Sep 2016 17:23:47 +0200
Subject: [PATCH] Add advanced net check for DNS K8s app

* Add an option to deploy a K8s app that tests e2e network connectivity
  and cluster DNS resolution via Kubedns for hostnet/simple pods
  (defaults to false).
* Parametrize the existing K8s app templates with kube_namespace and
  kube_config_dir instead of hardcoded values.
* For CoreOS, ensure the nameservers from the inventory are placed
  first, so that hostnet pods can connect via short names or FQDNs
  and hostnet agents pass their checks as well, if the netchecker
  is deployed.

Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
---
 docs/netcheck.md                              | 41 +++++++++++++++++++
 inventory/group_vars/all.yml                  |  2 +
 roles/dnsmasq/tasks/resolvconf.yml            | 17 +++++++-
 roles/download/defaults/main.yml              | 21 ++++++++++
 .../kubernetes-apps/ansible/defaults/main.yml | 12 ++++++
 .../tasks/calico-policy-controller.yml        |  6 +--
 roles/kubernetes-apps/ansible/tasks/main.yaml | 10 +++--
 .../ansible/tasks/netchecker.yml              | 20 +++++++++
 .../templates/calico-policy-controller.yml.j2 |  2 +-
 .../ansible/templates/kubedns-rc.yml          |  2 +-
 .../ansible/templates/kubedns-svc.yml         |  2 +-
 .../ansible/templates/netchecker-agent-ds.yml | 25 +++++++++++
 .../templates/netchecker-agent-hostnet-ds.yml | 26 ++++++++++++
 .../templates/netchecker-server-pod.yml       | 21 ++++++++++
 .../templates/netchecker-server-svc.yml       | 15 +++++++
 roles/kubernetes/node/meta/main.yml           |  9 ++++
 16 files changed, 220 insertions(+), 11 deletions(-)
 create mode 100644 docs/netcheck.md
 create mode 100644 roles/kubernetes-apps/ansible/tasks/netchecker.yml
 create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml
 create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml
 create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml
 create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml

diff --git a/docs/netcheck.md b/docs/netcheck.md
new file mode 100644
index 000000000..408b0fd8c
--- /dev/null
+++ b/docs/netcheck.md
@@ -0,0 +1,41 @@
+Network Checker Application
+===========================
+
+With the ``deploy_netchecker`` var enabled (it defaults to false), Kargo deploys
+the Network Checker Application built from the third-party `l23network/mcp-netchecker`
+docker images. It consists of a server and agents that try to reach the server
+over the usual network paths available to Kubernetes applications. This
+automatically verifies pod-to-pod connectivity via the cluster IP and checks
+that DNS resolution works as well.
+
+The agents run the checks periodically and cover both standard and host-network
+pods. The history of performed checks can be found in the agents' application
+logs.
+
+To get the most recent cluster-wide network connectivity report, run the
+following from any of the cluster nodes:
+```
+curl http://localhost:31081/api/v1/connectivity_check
+```
+Note that Kargo only deploys the application, if requested; it does not invoke
+the check itself.
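+
+For instance, a minimal sketch of polling the report from a cluster node (the
+60-second interval is an arbitrary choice, not something Kargo configures):
+```
+watch -n 60 curl -s http://localhost:31081/api/v1/connectivity_check
+```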
+
+The related application-specific variables are:
+```
+netchecker_port: 31081
+agent_report_interval: 15
+netcheck_namespace: default
+agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1"
+server_img: "quay.io/l23network/mcp-netchecker-server:v0.1"
+```
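+
+A minimal sketch of enabling the checker and overriding one of these defaults
+in ``inventory/group_vars/all.yml`` (the values below are illustrative only):
+```
+deploy_netchecker: true
+agent_report_interval: 30
+```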
+
+Note that the application verifies DNS resolution only for FQDNs built from the
+combination of the ``netcheck_namespace`` and ``dns_domain`` vars, for example
+``netchecker-service.default.cluster.local``. If you want to deploy the application
+to a non-default namespace, also adjust the ``searchdomains`` var so that the
+resulting search domain records contain that namespace, for example:
+
+```
+search foospace.cluster.local default.cluster.local ...
+nameserver ...
+```
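+
+Assuming the ``searchdomains`` inventory var accepts a list of extra search
+domains, a sketch of such an adjustment (namespace and domain names are
+illustrative):
+```
+netcheck_namespace: foospace
+searchdomains:
+  - foospace.cluster.local
+```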
diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml
index 49abb1d03..daf641335 100644
--- a/inventory/group_vars/all.yml
+++ b/inventory/group_vars/all.yml
@@ -35,6 +35,8 @@ kube_users:
 cluster_name: cluster.local
 # Subdomains of DNS domain to be resolved via /etc/resolv.conf
 ndots: 5
+# Deploy netchecker app to verify DNS resolve as an HTTP service
+deploy_netchecker: false
 
 # For some environments, each node has a publicly accessible
 # address and an address it should bind services to.  These are
diff --git a/roles/dnsmasq/tasks/resolvconf.yml b/roles/dnsmasq/tasks/resolvconf.yml
index 9be70c7a5..ba367ac48 100644
--- a/roles/dnsmasq/tasks/resolvconf.yml
+++ b/roles/dnsmasq/tasks/resolvconf.yml
@@ -48,7 +48,20 @@
   when: resolvconf.rc == 0
   notify: Dnsmasq | update resolvconf
 
-- name: Add search domains to resolv.conf
+- name: Remove search and nameserver options from resolvconf cloud init temporary file
+  lineinfile:
+    dest: "{{resolvconffile}}"
+    state: absent
+    regexp: "^{{ item }}.*$"
+    backup: yes
+    follow: yes
+  with_items:
+    - search
+    - nameserver
+  when: ansible_os_family == "CoreOS"
+  notify: Dnsmasq | update resolvconf for CoreOS
+
+- name: Add search domains to resolvconf file
   lineinfile:
     line: "search {{searchentries}}"
     dest: "{{resolvconffile}}"
@@ -66,7 +79,7 @@
       nameserver {{ item }}
       {% endfor %}
     state: present
-    insertafter: "^search.*$"
+    insertafter: "^search default.svc.*$"
     create: yes
     backup: yes
     follow: yes
diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml
index c66433c6d..966eee709 100644
--- a/roles/download/defaults/main.yml
+++ b/roles/download/defaults/main.yml
@@ -58,6 +58,12 @@ hyperkube_image_repo: "quay.io/coreos/hyperkube"
 hyperkube_image_tag: "{{ kube_version }}_coreos.0"
 pod_infra_image_repo: "gcr.io/google_containers/pause-amd64"
 pod_infra_image_tag: "{{ pod_infra_version }}"
+netcheck_tag: v0.1
+netcheck_kubectl_tag: v0.18.0-120-gaeb4ac55ad12b1-dirty
+netcheck_agent_img_repo: "quay.io/l23network/mcp-netchecker-agent"
+netcheck_server_img_repo: "quay.io/l23network/mcp-netchecker-server"
+netcheck_kubectl_img_repo: "gcr.io/google_containers/kubectl"
+
 nginx_image_repo: nginx
 nginx_image_tag: 1.11.4-alpine
 dnsmasq_version: 2.72
@@ -73,6 +79,21 @@ test_image_repo: busybox
 test_image_tag: latest
 
 downloads:
+  netcheck_server:
+    container: true
+    repo: "{{ netcheck_server_img_repo }}"
+    tag: "{{ netcheck_tag }}"
+    enabled: "{{ deploy_netchecker|bool }}"
+  netcheck_agent:
+    container: true
+    repo: "{{ netcheck_agent_img_repo }}"
+    tag: "{{ netcheck_tag }}"
+    enabled: "{{ deploy_netchecker|bool }}"
+  netcheck_kubectl:
+    container: true
+    repo: "{{ netcheck_kubectl_img_repo }}"
+    tag: "{{ netcheck_kubectl_tag }}"
+    enabled: "{{ deploy_netchecker|bool }}"
   weave:
     dest: weave/bin/weave
     version: "{{weave_version}}"
diff --git a/roles/kubernetes-apps/ansible/defaults/main.yml b/roles/kubernetes-apps/ansible/defaults/main.yml
index e064984c6..02ca7b29d 100644
--- a/roles/kubernetes-apps/ansible/defaults/main.yml
+++ b/roles/kubernetes-apps/ansible/defaults/main.yml
@@ -1,3 +1,6 @@
+kube_config_dir: /etc/kubernetes
+kube_namespace: kube-system
+
 # Versions
 kubedns_version: 1.7
 kubednsmasq_version: 1.3
@@ -13,5 +16,14 @@ exechealthz_image_tag: "{{ exechealthz_version }}"
 calico_policy_image_repo: "calico/kube-policy-controller"
 calico_policy_image_tag: latest
 
+# Netchecker
+deploy_netchecker: false
+netchecker_port: 31081
+agent_report_interval: 15
+netcheck_namespace: default
+agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1"
+server_img: "quay.io/l23network/mcp-netchecker-server:v0.1"
+kubectl_image: "gcr.io/google_containers/kubectl:v0.18.0-120-gaeb4ac55ad12b1-dirty"
+
 # SSL
 etcd_cert_dir: "/etc/ssl/etcd/ssl"
diff --git a/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml b/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml
index 6ad8dd220..02a49f211 100644
--- a/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml
+++ b/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml
@@ -1,5 +1,5 @@
 - name: Write calico-policy-controller yaml
-  template: src=calico-policy-controller.yml.j2 dest=/etc/kubernetes/calico-policy-controller.yml
+  template: src=calico-policy-controller.yml.j2 dest={{kube_config_dir}}/calico-policy-controller.yml
   when: inventory_hostname == groups['kube-master'][0]
 
 
@@ -7,7 +7,7 @@
   kube:
     name: "calico-policy-controller"
     kubectl: "{{bin_dir}}/kubectl"
-    filename: "/etc/kubernetes/calico-policy-controller.yml"
-    namespace: "kube-system"
+    filename: "{{kube_config_dir}}/calico-policy-controller.yml"
+    namespace: "{{kube_namespace}}"
     resource: "rs"
   when: inventory_hostname == groups['kube-master'][0]
diff --git a/roles/kubernetes-apps/ansible/tasks/main.yaml b/roles/kubernetes-apps/ansible/tasks/main.yaml
index 130a17a6f..a65b6b527 100644
--- a/roles/kubernetes-apps/ansible/tasks/main.yaml
+++ b/roles/kubernetes-apps/ansible/tasks/main.yaml
@@ -1,6 +1,6 @@
 ---
 - name: Kubernetes Apps | Lay Down KubeDNS Template
-  template: src={{item.file}} dest=/etc/kubernetes/{{item.file}}
+  template: src={{item.file}} dest={{kube_config_dir}}/{{item.file}}
   with_items:
     - {file: kubedns-rc.yml, type: rc}
     - {file: kubedns-svc.yml, type: svc}
@@ -10,10 +10,10 @@
 - name: Kubernetes Apps | Start Resources
   kube:
     name: kubedns
-    namespace: kube-system
+    namespace: "{{ kube_namespace }}"
     kubectl: "{{bin_dir}}/kubectl"
     resource: "{{item.item.type}}"
-    filename: /etc/kubernetes/{{item.item.file}}
+    filename: "{{kube_config_dir}}/{{item.item.file}}"
     state: "{{item.changed | ternary('latest','present') }}"
   with_items: "{{ manifests.results }}"
   when: inventory_hostname == groups['kube-master'][0]
@@ -21,3 +21,7 @@
 - include: tasks/calico-policy-controller.yml
   when: ( enable_network_policy is defined and enable_network_policy == True ) or
     ( kube_network_plugin == 'canal' )
+
+- name: Kubernetes Apps | Netchecker
+  include: tasks/netchecker.yml
+  when: deploy_netchecker
diff --git a/roles/kubernetes-apps/ansible/tasks/netchecker.yml b/roles/kubernetes-apps/ansible/tasks/netchecker.yml
new file mode 100644
index 000000000..c28d921b6
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/tasks/netchecker.yml
@@ -0,0 +1,20 @@
+- name: Kubernetes Apps | Lay Down Netchecker Template
+  template: src={{item.file}} dest={{kube_config_dir}}/{{item.file}}
+  with_items:
+    - {file: netchecker-agent-ds.yml, type: ds, name: netchecker-agent}
+    - {file: netchecker-agent-hostnet-ds.yml, type: ds, name: netchecker-agent-hostnet}
+    - {file: netchecker-server-pod.yml, type: po, name: netchecker-server}
+    - {file: netchecker-server-svc.yml, type: svc, name: netchecker-service}
+  register: manifests
+  when: inventory_hostname == groups['kube-master'][0]
+
+- name: Kubernetes Apps | Start Netchecker Resources
+  kube:
+    name: "{{item.item.name}}"
+    namespace: "{{netcheck_namespace}}"
+    kubectl: "{{bin_dir}}/kubectl"
+    resource: "{{item.item.type}}"
+    filename: "{{kube_config_dir}}/{{item.item.file}}"
+    state: "{{item.changed | ternary('latest','present') }}"
+  with_items: "{{ manifests.results }}"
+  when: inventory_hostname == groups['kube-master'][0]
diff --git a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2
index 469060278..a522c80ad 100644
--- a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2
+++ b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2
@@ -2,7 +2,7 @@ apiVersion: extensions/v1beta1
 kind: ReplicaSet
 metadata:
   name: calico-policy-controller
-  namespace: kube-system
+  namespace: {{ kube_namespace }}
   labels:
     k8s-app: calico-policy
     kubernetes.io/cluster-service: "true"
diff --git a/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml b/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml
index fc29a0942..84e725cbc 100644
--- a/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml
+++ b/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml
@@ -2,7 +2,7 @@ apiVersion: v1
 kind: ReplicationController
 metadata:
   name: kubedns
-  namespace: kube-system
+  namespace: {{ kube_namespace }}
   labels:
     k8s-app: kubedns
     version: v19
diff --git a/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml b/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml
index 2e21bc9e6..7f88d0666 100644
--- a/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml
+++ b/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml
@@ -2,7 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: kubedns
-  namespace: kube-system
+  namespace: {{ kube_namespace }}
   labels:
     k8s-app: kubedns
     kubernetes.io/cluster-service: "true"
diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml
new file mode 100644
index 000000000..a52329e50
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml
@@ -0,0 +1,25 @@
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  labels:
+    app: netchecker-agent
+  name: netchecker-agent
+  namespace: {{ netcheck_namespace }}
+spec:
+  template:
+    metadata:
+      name: netchecker-agent
+      labels:
+        app: netchecker-agent
+    spec:
+      containers:
+        - name: netchecker-agent
+          image: "{{ agent_img }}"
+          env:
+            - name: MY_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: REPORT_INTERVAL
+              value: '{{ agent_report_interval }}'
+          imagePullPolicy: {{ k8s_image_pull_policy }}
diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml
new file mode 100644
index 000000000..4fd03e80a
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml
@@ -0,0 +1,26 @@
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  labels:
+    app: netchecker-agent-hostnet
+  name: netchecker-agent-hostnet
+  namespace: {{ netcheck_namespace }}
+spec:
+  template:
+    metadata:
+      name: netchecker-agent-hostnet
+      labels:
+        app: netchecker-agent-hostnet
+    spec:
+      hostNetwork: True
+      containers:
+        - name: netchecker-agent
+          image: "{{ agent_img }}"
+          env:
+            - name: MY_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: REPORT_INTERVAL
+              value: '{{ agent_report_interval }}'
+          imagePullPolicy: {{ k8s_image_pull_policy }}
diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml
new file mode 100644
index 000000000..6f242bc51
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml
@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: netchecker-server
+  labels:
+    app: netchecker-server
+  namespace: {{ netcheck_namespace }}
+spec:
+  containers:
+    - name: netchecker-server
+      image: "{{ server_img }}"
+      env:
+      imagePullPolicy: {{ k8s_image_pull_policy }}
+      ports:
+        - containerPort: 8081
+          hostPort: 8081
+    - name: kubectl-proxy
+      image: "{{ kubectl_image }}"
+      imagePullPolicy: {{ k8s_image_pull_policy }}
+      args:
+        - proxy
diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml b/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml
new file mode 100644
index 000000000..dc3894676
--- /dev/null
+++ b/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: netchecker-service
+  namespace: {{ netcheck_namespace }}
+spec:
+  selector:
+    app: netchecker-server
+  ports:
+    -
+      protocol: TCP
+      port: 8081
+      targetPort: 8081
+      nodePort: {{ netchecker_port }}
+  type: NodePort
diff --git a/roles/kubernetes/node/meta/main.yml b/roles/kubernetes/node/meta/main.yml
index 3e1dd5b3e..a65501113 100644
--- a/roles/kubernetes/node/meta/main.yml
+++ b/roles/kubernetes/node/meta/main.yml
@@ -9,6 +9,15 @@ dependencies:
     file: "{{ downloads.nginx }}"
   - role: download
     file: "{{ downloads.testbox }}"
+  - role: download
+    file: "{{ downloads.netcheck_server }}"
+    when: deploy_netchecker
+  - role: download
+    file: "{{ downloads.netcheck_agent }}"
+    when: deploy_netchecker
+  - role: download
+    file: "{{ downloads.netcheck_kubectl }}"
+    when: deploy_netchecker
   - role: download
     file: "{{ downloads.kubednsmasq }}"
     when: not skip_dnsmasq_k8s|default(false)
-- 
GitLab