diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml
index 2c80ffae55a2bff4fb6ba2c8c0e15119cabf0dca..8eee9fd2f46edf8e4bb23fcb970102678275818e 100644
--- a/roles/download/defaults/main.yml
+++ b/roles/download/defaults/main.yml
@@ -132,14 +132,14 @@ kubednsautoscaler_image_repo: "gcr.io/google_containers/cluster-proportional-aut
 kubednsautoscaler_image_tag: "{{ kubednsautoscaler_version }}"
 test_image_repo: busybox
 test_image_tag: latest
-elasticsearch_version: "v2.4.1"
-elasticsearch_image_repo: "gcr.io/google_containers/elasticsearch"
+elasticsearch_version: "v5.6.4"
+elasticsearch_image_repo: "k8s.gcr.io/elasticsearch"
 elasticsearch_image_tag: "{{ elasticsearch_version }}"
-fluentd_version: "1.22"
-fluentd_image_repo: "gcr.io/google_containers/fluentd-elasticsearch"
+fluentd_version: "v2.0.4"
+fluentd_image_repo: "k8s.gcr.io/fluentd-elasticsearch"
 fluentd_image_tag: "{{ fluentd_version }}"
-kibana_version: "v4.6.1"
-kibana_image_repo: "gcr.io/google_containers/kibana"
+kibana_version: "5.6.4"
+kibana_image_repo: "docker.elastic.co/kibana/kibana"
 kibana_image_tag: "{{ kibana_version }}"
 helm_version: "v2.9.1"
 helm_image_repo: "lachlanevenson/k8s-helm"
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
index dd5b9b630f9d4c6561b6f84452e42516da591134..4b9ab006737bc7278c4031c373bb670469e21219 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-clusterrolebinding.yml
@@ -1,9 +1,12 @@
 ---
 kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1beta1
+apiVersion: rbac.authorization.k8s.io/v1
 metadata:
   name: efk
   namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
 subjects:
   - kind: ServiceAccount
     name: efk
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
index 75d75f6508c7badb071657bc007cf52cfe90db8f..01e774e966626edf6a104f59b6adf33db0538fa4 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/efk-sa.yml
@@ -6,3 +6,4 @@ metadata:
   namespace: kube-system
   labels:
     kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
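Note (not part of the patch): the version bumps above are the pivot of this change. Elasticsearch and Fluentd move to the k8s.gcr.io registry, and Kibana now pulls the upstream Elastic image. As a minimal sketch of how a cluster could pin or mirror these images after the upgrade, assuming the usual Kubespray pattern of overriding role defaults from inventory group_vars (the variable names come from the hunk above; the file path is hypothetical):

    # inventory/group_vars/k8s-cluster.yml (hypothetical path)
    elasticsearch_image_repo: "registry.example.com/elasticsearch"
    elasticsearch_version: "v5.6.4"
    kibana_image_repo: "docker.elastic.co/kibana/kibana"
    kibana_image_tag: "5.6.4"  # kibana_image_tag tracks kibana_version by default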
diff --git a/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2 b/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
index 4cdcf33ad0dbe515cfba3eb085b08242ecea91ea..51666c1f21f896e2cb01a269efa8938757e2cc1a 100644
--- a/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
+++ b/roles/kubernetes-apps/efk/elasticsearch/templates/elasticsearch-deployment.yml.j2
@@ -1,15 +1,17 @@
 ---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-elasticsearch/es-controller.yaml
-apiVersion: extensions/v1beta1
-kind: Deployment
+# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.10.2/cluster/addons/fluentd-elasticsearch/es-statefulset.yaml
+apiVersion: apps/v1
+kind: StatefulSet
 metadata:
-  name: elasticsearch-logging-v1
+  name: elasticsearch-logging
   namespace: kube-system
   labels:
     k8s-app: elasticsearch-logging
     version: "{{ elasticsearch_image_tag }}"
     kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
 spec:
+  serviceName: elasticsearch-logging
   replicas: 2
   selector:
     matchLabels:
@@ -53,4 +55,10 @@ spec:
 {% if rbac_enabled %}
       serviceAccountName: efk
 {% endif %}
+      initContainers:
+      - image: alpine:3.6
+        command: ["/sbin/sysctl", "-w", "vm.max_map_count=262144"]
+        name: elasticsearch-logging-init
+        securityContext:
+          privileged: true
 
diff --git a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
index e8d93732c575ba0b92ef7dde303eb365fade2f63..0305a5f7a7c7b9883d364c6c6989eeecf0c8c68a 100644
--- a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
+++ b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
@@ -1,7 +1,7 @@
 ---
 fluentd_cpu_limit: 0m
-fluentd_mem_limit: 200Mi
+fluentd_mem_limit: 500Mi
 fluentd_cpu_requests: 100m
 fluentd_mem_requests: 200Mi
-fluentd_config_dir: /etc/kubernetes/fluentd
-fluentd_config_file: fluentd.conf
+fluentd_config_dir: /etc/fluent/config.d
+# fluentd_config_file: fluentd.conf
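Note (not part of the patch): Elasticsearch 5.x refuses to start when vm.max_map_count is below 262144, which is why the StatefulSet above gains a privileged init container that runs sysctl on every node the pod lands on. An equivalent alternative, sketched here as an assumption rather than part of this change, is to set the value persistently on the nodes themselves so the init container becomes a no-op:

    # /etc/sysctl.d/99-elasticsearch.conf (hypothetical file name)
    # Persistent node-level setting; equivalent to the init container's
    # "/sbin/sysctl -w vm.max_map_count=262144" above.
    vm.max_map_count = 262144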
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
index b7de44dc03340534374d2c6a0adcd1ec5c7732de..0b0229f69fddde55b9bb72d8bdc1b0a74d6b4ac3 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
@@ -1,10 +1,19 @@
+---
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/fluentd-es-configmap.yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
   name: fluentd-config
   namespace: "kube-system"
+  labels:
+    addonmanager.kubernetes.io/mode: Reconcile
 data:
-  {{ fluentd_config_file }}: |
+  system.conf: |-
+    <system>
+      root_dir /tmp/fluentd-buffers/
+    </system>
+
+  containers.input.conf: |-
     # This configuration file for Fluentd / td-agent is used
     # to watch changes to Docker log files. The kubelet creates symlinks that
     # capture the pod name, namespace, container name & Docker container ID
@@ -18,7 +27,6 @@ data:
     # See https://github.com/uken/fluent-plugin-elasticsearch &
     # https://github.com/fabric8io/fluent-plugin-kubernetes_metadata_filter for
     # more information about the plugins.
-    # Maintainer: Jimmi Dyson <jimmidyson@gmail.com>
     #
     # Example
     # =======
@@ -99,63 +107,87 @@ data:
     # This makes it easier for users to search for logs by pod name or by
     # the name of the Kubernetes container regardless of how many times the
     # Kubernetes pod has been restarted (resulting in a several Docker container IDs).
-    #
-    # TODO: Propagate the labels associated with a container along with its logs
-    # so users can query logs using labels as well as or instead of the pod name
-    # and container name. This is simply done via configuration of the Kubernetes
-    # fluentd plugin but requires secrets to be enabled in the fluent pod. This is a
-    # problem yet to be solved as secrets are not usable in static pods which the fluentd
-    # pod must be until a per-node controller is available in Kubernetes.
-    # Prevent fluentd from handling records containing its own logs. Otherwise
-    # it can lead to an infinite loop, when error in sending one message generates
-    # another message which also fails to be sent and so on.
-    <match fluent.**>
-      type null
-    </match>
-    # Example:
+
+    # Json Log Example:
     # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
+    # CRI Log Example:
+    # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
     <source>
-      type tail
+      @id fluentd-containers.log
+      @type tail
       path /var/log/containers/*.log
       pos_file /var/log/es-containers.log.pos
       time_format %Y-%m-%dT%H:%M:%S.%NZ
-      tag kubernetes.*
-      format json
+      tag raw.kubernetes.*
       read_from_head true
+      <parse>
+        @type multi_format
+        <pattern>
+          format json
+          time_key time
+          time_format %Y-%m-%dT%H:%M:%S.%NZ
+        </pattern>
+        <pattern>
+          format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
+          time_format %Y-%m-%dT%H:%M:%S.%N%:z
+        </pattern>
+      </parse>
     </source>
+
+    # Detect exceptions in the log output and forward them as one log entry.
+    <match raw.kubernetes.**>
+      @id raw.kubernetes
+      @type detect_exceptions
+      remove_tag_prefix raw
+      message log
+      stream stream
+      multiline_flush_interval 5
+      max_bytes 500000
+      max_lines 1000
+    </match>
+
+  system.input.conf: |-
     # Example:
     # 2015-12-21 23:17:22,066 [salt.state       ][INFO    ] Completed state [net.ipv4.ip_forward] at time 23:17:22.066081
     <source>
-      type tail
+      @id minion
+      @type tail
       format /^(?<time>[^ ]* [^ ,]*)[^\[]*\[[^\]]*\]\[(?<severity>[^ \]]*) *\] (?<message>.*)$/
       time_format %Y-%m-%d %H:%M:%S
       path /var/log/salt/minion
-      pos_file /var/log/es-salt.pos
+      pos_file /var/log/salt.pos
       tag salt
     </source>
+
     # Example:
     # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
     <source>
-      type tail
+      @id startupscript.log
+      @type tail
       format syslog
       path /var/log/startupscript.log
       pos_file /var/log/es-startupscript.log.pos
       tag startupscript
     </source>
+
     # Examples:
     # time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
+    # TODO(random-liu): Remove this after cri container runtime rolls out.
     <source>
-      type tail
+      @id docker.log
+      @type tail
       format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
       path /var/log/docker.log
       pos_file /var/log/es-docker.log.pos
       tag docker
     </source>
+
     # Example:
     # 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
     <source>
-      type tail
+      @id etcd.log
+      @type tail
       # Not parsing this, because it doesn't have anything particularly useful to
       # parse out of it (like severities).
       format none
@@ -163,13 +195,16 @@ data:
       pos_file /var/log/es-etcd.log.pos
       tag etcd
     </source>
+
     # Multi-line parsing is required for all the kube logs because very large log
     # statements, such as those that include entire object bodies, get split into
     # multiple lines by glog.
+
     # Example:
     # I0204 07:32:30.020537    3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
     <source>
-      type tail
+      @id kubelet.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -179,10 +214,12 @@ data:
       pos_file /var/log/es-kubelet.log.pos
       tag kubelet
     </source>
+
     # Example:
     # I1118 21:26:53.975789       6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
     <source>
-      type tail
+      @id kube-proxy.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -192,10 +229,12 @@ data:
       pos_file /var/log/es-kube-proxy.log.pos
       tag kube-proxy
     </source>
+
     # Example:
     # I0204 07:00:19.604280       5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
     <source>
-      type tail
+      @id kube-apiserver.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -205,10 +244,12 @@ data:
       pos_file /var/log/es-kube-apiserver.log.pos
       tag kube-apiserver
     </source>
+
     # Example:
     # I0204 06:55:31.872680       5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
     <source>
-      type tail
+      @id kube-controller-manager.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -218,10 +259,12 @@ data:
       pos_file /var/log/es-kube-controller-manager.log.pos
       tag kube-controller-manager
     </source>
+
     # Example:
     # W0204 06:49:18.239674       7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
     <source>
-      type tail
+      @id kube-scheduler.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -231,10 +274,12 @@ data:
       pos_file /var/log/es-kube-scheduler.log.pos
       tag kube-scheduler
     </source>
+
     # Example:
     # I1104 10:36:20.242766       5 rescheduler.go:73] Running Rescheduler
     <source>
-      type tail
+      @id rescheduler.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -244,10 +289,12 @@ data:
       pos_file /var/log/es-rescheduler.log.pos
       tag rescheduler
     </source>
+
     # Example:
     # I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
     <source>
-      type tail
+      @id glbc.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -257,10 +304,12 @@ data:
       pos_file /var/log/es-glbc.log.pos
       tag glbc
     </source>
+
     # Example:
     # I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
     <source>
-      type tail
+      @id cluster-autoscaler.log
+      @type tail
       format multiline
       multiline_flush_interval 5s
       format_firstline /^\w\d{4}/
@@ -270,59 +319,123 @@ data:
       pos_file /var/log/es-cluster-autoscaler.log.pos
       tag cluster-autoscaler
     </source>
+
+    # Logs from systemd-journal for interesting services.
+    # TODO(random-liu): Remove this after cri container runtime rolls out.
+    <source>
+      @id journald-docker
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag docker
+    </source>
+
+    # <source>
+    #   @id journald-container-runtime
+    #   @type systemd
+    #   filters [{ "_SYSTEMD_UNIT": "{% raw %}{{ container_runtime }}{% endraw %}.service" }]
+    #   <storage>
+    #     @type local
+    #     persistent true
+    #   </storage>
+    #   read_from_head true
+    #   tag container-runtime
+    # </source>
+
+    <source>
+      @id journald-kubelet
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag kubelet
+    </source>
+
+    <source>
+      @id journald-node-problem-detector
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag node-problem-detector
+    </source>
+
+  forward.input.conf: |-
+    # Takes the messages sent over TCP
+    <source>
+      @type forward
+    </source>
+
+  monitoring.conf: |-
+    # Prometheus Exporter Plugin
+    # input plugin that exports metrics
+    <source>
+      @type prometheus
+    </source>
+
+    <source>
+      @type monitor_agent
+    </source>
+
+    # input plugin that collects metrics from MonitorAgent
+    <source>
+      @type prometheus_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+    # input plugin that collects metrics for output plugin
+    <source>
+      @type prometheus_output_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+    # input plugin that collects metrics for in_tail plugin
+    <source>
+      @type prometheus_tail_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+  output.conf: |-
+    # Enriches records with Kubernetes metadata
     <filter kubernetes.**>
-      type kubernetes_metadata
+      @type kubernetes_metadata
     </filter>
-    ## Prometheus Exporter Plugin
-    ## input plugin that exports metrics
-    #<source>
-    #  type prometheus
-    #</source>
-    #<source>
-    #  type monitor_agent
-    #</source>
-    #<source>
-    #  type forward
-    #</source>
-    ## input plugin that collects metrics from MonitorAgent
-    #<source>
-    #  @type prometheus_monitor
-    #  <labels>
-    #    host ${hostname}
-    #  </labels>
-    #</source>
-    ## input plugin that collects metrics for output plugin
-    #<source>
-    #  @type prometheus_output_monitor
-    #  <labels>
-    #    host ${hostname}
-    #  </labels>
-    #</source>
-    ## input plugin that collects metrics for in_tail plugin
-    #<source>
-    #  @type prometheus_tail_monitor
-    #  <labels>
-    #    host ${hostname}
-    #  </labels>
-    #</source>
+
     <match **>
-      type elasticsearch
-      user "#{ENV['FLUENT_ELASTICSEARCH_USER']}"
-      password "#{ENV['FLUENT_ELASTICSEARCH_PASSWORD']}"
-      log_level info
-      include_tag_key true
-      host elasticsearch-logging
-      port 9200
-      logstash_format true
-      # Set the chunk limit the same as for fluentd-gcp.
-      buffer_chunk_limit 2M
-      # Cap buffer memory usage to 2MiB/chunk * 32 chunks = 64 MiB
-      buffer_queue_limit 32
-      flush_interval 5s
-      # Never wait longer than 5 minutes between retries.
-      max_retry_wait 30
-      # Disable the limit on the number of retries (retry forever).
-      disable_retry_limit
-      # Use multiple threads for processing.
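Note (not part of the patch): the rewritten ConfigMap tails container logs under the raw.kubernetes.* tag, parses them with multi_format (the first pattern matches Docker JSON logs, the second the CRI text format), and then pipes them through detect_exceptions, a plugin the k8s.gcr.io/fluentd-elasticsearch image is expected to bundle. A sketch, under those assumptions, of what the raw.kubernetes.** match does to a stack trace split across Docker log lines:

    # Two incoming events, tag raw.kubernetes.var.log.containers.app-1.log:
    #   {"log":"java.lang.NullPointerException\n","stream":"stderr",...}
    #   {"log":"\tat com.example.Foo.bar(Foo.java:42)\n","stream":"stderr",...}
    # detect_exceptions re-emits them as one event with the "raw" prefix
    # stripped (tag kubernetes.var.log.containers.app-1.log), so the whole
    # trace reaches the elasticsearch output in <match **> as a single record.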
-      num_threads 8
-    </match>
+      @id elasticsearch
+      @type elasticsearch
+      @log_level info
+      include_tag_key true
+      host elasticsearch-logging
+      port 9200
+      logstash_format true
+      <buffer>
+        @type file
+        path /var/log/fluentd-buffers/kubernetes.system.buffer
+        flush_mode interval
+        retry_type exponential_backoff
+        flush_thread_count 2
+        flush_interval 5s
+        retry_forever
+        retry_max_interval 30
+        chunk_limit_size 2M
+        queue_limit_length 8
+        overflow_action block
+      </buffer>
+    </match>
\ No newline at end of file
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
index f23a8851c502cadb6e1ec1e1c174af2c41b3603b..3a911cf3894029fc0d6dad839a4e236e8c2a7a7f 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
@@ -1,32 +1,42 @@
 ---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-elasticsearch/es-controller.yaml
-apiVersion: extensions/v1beta1
+# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.10.2/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml
+apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: "fluentd-es-v{{ fluentd_version }}"
+  name: "fluentd-es-{{ fluentd_version }}"
   namespace: "kube-system"
   labels:
     k8s-app: fluentd-es
+    version: "{{ fluentd_version }}"
     kubernetes.io/cluster-service: "true"
-    version: "v{{ fluentd_version }}"
+    addonmanager.kubernetes.io/mode: Reconcile
 spec:
+  selector:
+    matchLabels:
+      k8s-app: fluentd-es
+      version: "{{ fluentd_version }}"
   template:
     metadata:
       labels:
         k8s-app: fluentd-es
         kubernetes.io/cluster-service: "true"
-        version: "v{{ fluentd_version }}"
+        version: "{{ fluentd_version }}"
+      # This annotation ensures that fluentd does not get evicted if the node
+      # supports critical pod annotation based priority scheme.
+      # Note that this does not guarantee admission on the nodes (#40573).
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
     spec:
-      tolerations:
-      - effect: NoSchedule
-        operator: Exists
+      priorityClassName: system-node-critical
+{% if rbac_enabled %}
+      serviceAccountName: efk
+{% endif %}
       containers:
       - name: fluentd-es
         image: "{{ fluentd_image_repo }}:{{ fluentd_image_tag }}"
-        command:
-          - '/bin/sh'
-          - '-c'
-          - '/usr/sbin/td-agent -c {{ fluentd_config_dir }}/{{ fluentd_config_file}} 2>&1 >> /var/log/fluentd.log'
+        env:
+        - name: FLUENTD_ARGS
+          value: "--no-supervisor -q"
         resources:
           limits:
 {% if fluentd_cpu_limit is defined and fluentd_cpu_limit != "0m" %}
@@ -34,27 +44,26 @@ spec:
 {% endif %}
             memory: {{ fluentd_mem_limit }}
           requests:
-            cpu: {{ fluentd_cpu_requests }} 
+            cpu: {{ fluentd_cpu_requests }}
             memory: {{ fluentd_mem_requests }}
         volumeMounts:
         - name: varlog
           mountPath: /var/log
-        - name: dockercontainers
+        - name: varlibdockercontainers
           mountPath: "{{ docker_daemon_graph }}/containers"
           readOnly: true
-        - name: config
+        - name: config-volume
           mountPath: "{{ fluentd_config_dir }}"
+      nodeSelector:
+        beta.kubernetes.io/fluentd-ds-ready: "true"
       terminationGracePeriodSeconds: 30
       volumes:
       - name: varlog
         hostPath:
           path: /var/log
-      - name: dockercontainers
+      - name: varlibdockercontainers
         hostPath:
           path: {{ docker_daemon_graph }}/containers
-      - name: config
-        configMap:
-          name: fluentd-config
-{% if rbac_enabled %}
-      serviceAccountName: efk
-{% endif %}
+      - name: config-volume
+        configMap:
+          name: fluentd-config
\ No newline at end of file
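Note (not part of the patch): with the nodeSelector added above, the DaemonSet only schedules onto nodes labeled beta.kubernetes.io/fluentd-ds-ready=true, matching the upstream 1.10 addon manifest. Unless another task in the role applies that label, nodes have to be labeled before any fluentd pods appear, for example with:

    kubectl label node <node-name> beta.kubernetes.io/fluentd-ds-ready=true

Also worth noting: because the object name changes (the fluentd_version default now carries the v prefix, so "fluentd-es-v1.22" becomes "fluentd-es-v2.0.4"), the old DaemonSet is presumably not updated in place and has to be removed by the addon manager or the operator.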
diff --git a/roles/kubernetes-apps/efk/kibana/defaults/main.yml b/roles/kubernetes-apps/efk/kibana/defaults/main.yml
index 0651a032d062c40ddf5a7c65247bdd721b96465a..c76e3e71009461a722968e70d20f8e0113a11752 100644
--- a/roles/kubernetes-apps/efk/kibana/defaults/main.yml
+++ b/roles/kubernetes-apps/efk/kibana/defaults/main.yml
@@ -4,3 +4,4 @@ kibana_mem_limit: 0M
 kibana_cpu_requests: 100m
 kibana_mem_requests: 0M
 kibana_service_port: 5601
+kibana_base_url: "/api/v1/namespaces/kube-system/services/kibana-logging/proxy"
diff --git a/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2 b/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
index c5603d389a86b86d3904b9bacd42889c6bc14a94..880482d4de064139b5b541d1c8d52f6b57bf9126 100644
--- a/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
+++ b/roles/kubernetes-apps/efk/kibana/templates/kibana-deployment.yml.j2
@@ -1,6 +1,6 @@
 ---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-kibana/kibana-controller.yaml
-apiVersion: extensions/v1beta1
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/kibana-deployment.yaml
+apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: kibana-logging
@@ -36,10 +36,12 @@ spec:
         env:
           - name: "ELASTICSEARCH_URL"
             value: "http://elasticsearch-logging:{{ elasticsearch_service_port }}"
-{% if kibana_base_url is defined and kibana_base_url != "" %}
-          - name: "KIBANA_BASE_URL"
+          - name: "SERVER_BASEPATH"
             value: "{{ kibana_base_url }}"
-{% endif %}
+          - name: XPACK_MONITORING_ENABLED
+            value: "false"
+          - name: XPACK_SECURITY_ENABLED
+            value: "false"
         ports:
           - containerPort: 5601
             name: ui
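Note (not part of the patch): the official Kibana 5 image reads its settings from environment variables, so the old KIBANA_BASE_URL plumbing is replaced by SERVER_BASEPATH (mapped to server.basePath), and the two XPACK_* variables switch off the X-Pack monitoring and security features bundled in the docker.elastic.co image, which would otherwise gate the UI behind authentication. With kibana_base_url now defaulting to the apiserver proxy path added above, the UI should be reachable through the API server proxy, for example:

    # after running `kubectl proxy` (path taken from the new default):
    # http://localhost:8001/api/v1/namespaces/kube-system/services/kibana-logging/proxy/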