diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..04bd205aa29a48b5c272f6af89cf5052959cd869
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,3 @@
+generate:
+	@echo ">> Compiling assets and generating Kubernetes manifests"
+	@hack/scripts/generate-manifests.sh
diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules
index 36ea482cf5dd63ef1847656498213c92c07c4ef0..540853927f26f99d005bfdb93687bbb8039ce6d0 100644
--- a/assets/prometheus/rules/node.rules
+++ b/assets/prometheus/rules/node.rules
@@ -8,3 +8,35 @@ ALERT NodeExporterDown
     summary = "node-exporter cannot be scraped",
     description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
+ALERT K8SNodeOutOfDisk
+  IF kube_node_status_out_of_disk{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Node ran out of disk space.",
+    description = "{{ $labels.node }} has run out of disk space.",
+  }
+
+ALERT K8SNodeMemoryPressure
+  IF kube_node_status_memory_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under memory pressure.",
+    description = "{{ $labels.node }} is under memory pressure.",
+  }
+
+ALERT K8SNodeDiskPressure
+  IF kube_node_status_disk_pressure{condition="true"} == 1
+  LABELS {
+    service = "k8s",
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Node is under disk pressure.",
+    description = "{{ $labels.node }} is under disk pressure.",
+  }
diff --git a/docs/developing-alerts-and-dashboards.md b/docs/developing-alerts-and-dashboards.md
new file mode 100644
index 0000000000000000000000000000000000000000..80630940a1fa2fb8bfb19cf66150871c5a5b9ae9
--- /dev/null
+++ b/docs/developing-alerts-and-dashboards.md
@@ -0,0 +1,27 @@
+# Developing Alerts and Dashboards
+
+`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point you may want to extend them. This document explains the workflow for adding further alerting rules and dashboards.
+
+Both the Prometheus alerting rules and the Grafana dashboards live in Kubernetes `ConfigMap`s that are generated from content in the `assets/` directory.
+
+The files in the `assets/` directory are the source of truth for the alerts and dashboards. Make changes there, then run the `make generate` target to re-generate the Kubernetes manifests.
+
+## Alerts
+
+The generated `ConfigMap` holding the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
+
+It is generated from all the `*.rules` files in the `assets/prometheus/rules/` directory.
+
+To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests. To modify existing rules, edit the respective `.rules` file and re-generate the manifest.
+
+The generated manifest can then be applied to a Kubernetes cluster.
+
+## Dashboards
+
+The generated `ConfigMap` holding the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
+
+As Grafana's support for applying dashboards from files is limited, a sidecar called "grafana-watcher" was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database stays in sync with them.
+
+To create or edit a dashboard, log in to Grafana, modify the dashboard, and save it. Then download the dashboard definition through `Share` -> `Export` -> `Save to file`, move the file to `assets/grafana/`, and re-generate the manifests.
+
+The generated manifest can then be applied to a Kubernetes cluster.
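The generation step described in the new doc boils down to bundling everything under `assets/` into the corresponding `ConfigMap` manifests. The actual logic lives in `hack/scripts/generate-manifests.sh`, which is not part of this diff; a minimal sketch of the rules half, assuming the `ConfigMap` is named `prometheus-k8s-rules` and a kubectl recent enough for `--dry-run=client` (older releases take a bare `--dry-run`):

```sh
# Hypothetical sketch only -- the real logic is in hack/scripts/generate-manifests.sh,
# which is not shown here. Bundle every rule file into a ConfigMap manifest
# without touching the cluster:
kubectl create configmap prometheus-k8s-rules \
  --from-file=assets/prometheus/rules/ \
  --dry-run=client -o yaml > manifests/prometheus/prometheus-k8s-rules.yaml
```

Rule files written in the Prometheus 1.x `ALERT` syntax used here can additionally be validated with Prometheus 1.x's `promtool check-rules <file>` before re-generating.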
diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy
index c565d442ddc6a2676641039525f231be74ac1020..9176b956b417953ad24ccf3051117b315cbde572 100755
--- a/hack/cluster-monitoring/deploy
+++ b/hack/cluster-monitoring/deploy
@@ -27,6 +27,8 @@
 kctl apply -f manifests/node-exporter
 kctl apply -f manifests/kube-state-metrics
 kctl apply -f manifests/grafana/grafana-credentials.yaml
 kctl apply -f manifests/grafana
-kctl apply -f manifests/prometheus/
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
+kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl apply -f manifests/alertmanager/
diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown
index 9fcc451391f440debc54665a65116b7bffdf2f1d..ac4d222d005a83b284e1133d09ce779e69a494cc 100755
--- a/hack/cluster-monitoring/teardown
+++ b/hack/cluster-monitoring/teardown
@@ -15,7 +15,9 @@ kctl() {
 kctl delete -f manifests/node-exporter
 kctl delete -f manifests/kube-state-metrics
 kctl delete -f manifests/grafana
-kctl delete -f manifests/prometheus
+find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
+kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
+kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
 kctl delete -f manifests/alertmanager

 # Hack: wait a bit to let the controller delete the deployed Prometheus server.
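The split apply/delete above exists because the role and role-binding manifests each contain documents that set their own `metadata.namespace`, so they must not be forced into `$NAMESPACE` by the `kctl` wrapper. Conceptually, the `find`/`-exec` line is equivalent to this loop (a sketch for illustration only):

```sh
# Apply everything under manifests/prometheus into $NAMESPACE, except the
# RBAC files, which carry their own namespaces and are applied separately.
for f in manifests/prometheus/*; do
  case "$(basename "$f")" in
    prometheus-k8s-roles.yaml|prometheus-k8s-role-bindings.yaml) ;;  # skipped here, applied below
    *) kubectl --namespace "$NAMESPACE" apply -f "$f" ;;
  esac
done
```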
diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml
index a45f893e6cb219c0e6dc37ba1d073099afa8068a..569fdfdb1e827060c371758f4155e6daf470d04d 100644
--- a/manifests/grafana/grafana-deployment.yaml
+++ b/manifests/grafana/grafana-deployment.yaml
@@ -11,7 +11,7 @@ spec:
     spec:
       containers:
       - name: grafana
-        image: grafana/grafana:4.1.1
+        image: grafana/grafana:4.4.1
         env:
         - name: GF_AUTH_BASIC_ENABLED
          value: "true"
@@ -41,7 +41,7 @@ spec:
           memory: 200Mi
           cpu: 200m
       - name: grafana-watcher
-        image: quay.io/coreos/grafana-watcher:v0.0.5
+        image: quay.io/coreos/grafana-watcher:v0.0.6
         args:
         - '--watch-dir=/var/grafana-dashboards'
         - '--grafana-url=http://localhost:3000'
@@ -56,9 +56,6 @@ spec:
             secretKeyRef:
               name: grafana-credentials
              key: password
-        volumeMounts:
-        - name: grafana-dashboards
-          mountPath: /var/grafana-dashboards
         resources:
           requests:
             memory: "16Mi"
diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml
index d574b89fc8c26e5edf7c23453259a2fb8048a32c..29bbf7463c27d8865fbcb41a5e7c011804a809f0 100644
--- a/manifests/prometheus-operator/prometheus-operator.yaml
+++ b/manifests/prometheus-operator/prometheus-operator.yaml
@@ -1,9 +1,9 @@
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
-  name: prometheus-operator
   labels:
     k8s-app: prometheus-operator
+  name: prometheus-operator
 spec:
   replicas: 1
   template:
@@ -11,20 +11,20 @@ spec:
       labels:
         k8s-app: prometheus-operator
     spec:
-      serviceAccountName: prometheus-operator
       containers:
-      - name: prometheus-operator
-        image: quay.io/coreos/prometheus-operator:v0.10.1
-        args:
-        - "--kubelet-service=kube-system/kubelet"
-        - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1"
+      - args:
+        - --kubelet-service=kube-system/kubelet
+        - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
+        image: quay.io/coreos/prometheus-operator:v0.11.1
+        name: prometheus-operator
         ports:
-        - name: http
-          containerPort: 8080
+        - containerPort: 8080
+          name: http
         resources:
+          limits:
+            cpu: 200m
+            memory: 100Mi
           requests:
             cpu: 100m
             memory: 50Mi
-          limits:
-            cpu: 200m
-            memory: 300Mi
+      serviceAccountName: prometheus-operator
diff --git a/manifests/prometheus/prometheus-cluster-role-binding.yaml b/manifests/prometheus/prometheus-cluster-role-binding.yaml
deleted file mode 100644
index 3600490f589e56fb953d449c3195a61103cd8881..0000000000000000000000000000000000000000
--- a/manifests/prometheus/prometheus-cluster-role-binding.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
-- kind: ServiceAccount
-  name: prometheus-k8s
-  namespace: monitoring
diff --git a/manifests/prometheus/prometheus-cluster-role.yaml b/manifests/prometheus/prometheus-cluster-role.yaml
deleted file mode 100644
index a85422ecfa0f8914f02468e7cdf013178f221057..0000000000000000000000000000000000000000
--- a/manifests/prometheus/prometheus-cluster-role.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1beta1
-kind: ClusterRole
-metadata:
-  name: prometheus
-rules:
-- apiGroups: [""]
-  resources:
-  - nodes
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
-- apiGroups: [""]
-  resources:
-  - configmaps
-  verbs: ["get"]
-- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
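The two files above are removed from the tree, but a cluster running an older installation may still hold the live objects. Per the deleted manifests, both were named `prometheus`, so a one-time cleanup could look like this (assumption: the old objects were applied exactly from those manifests):

```sh
# Remove the broad cluster-wide RBAC objects that the namespaced roles below replace.
kubectl delete clusterrolebinding prometheus --ignore-not-found
kubectl delete clusterrole prometheus --ignore-not-found
```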
diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests/prometheus/prometheus-k8s-role-bindings.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5f190e7ab1a21e09673c6a9bef9e683b22e98304
--- /dev/null
+++ b/manifests/prometheus/prometheus-k8s-role-bindings.yaml
@@ -0,0 +1,54 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus-k8s
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: monitoring
diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..14302ea0e59deeef5287ebdbcbdd413d2a73861c
--- /dev/null
+++ b/manifests/prometheus/prometheus-k8s-roles.yaml
@@ -0,0 +1,51 @@
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: monitoring
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: kube-system
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: default
+rules:
+- apiGroups: [""]
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: prometheus-k8s
+rules:
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
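The net effect of the two files above is that the `prometheus-k8s` service account keeps read access only in the `monitoring`, `kube-system`, and `default` namespaces, plus the non-resource `/metrics` URL, instead of cluster-wide access. One way to spot-check the result after applying (a sketch; requires impersonation rights, and `some-other-namespace` is a placeholder):

```sh
# Expect "yes": pods are listable in a namespace covered by a Role above.
kubectl auth can-i list pods -n kube-system \
  --as=system:serviceaccount:monitoring:prometheus-k8s
# Expect "no": no Role grants access in other namespaces.
kubectl auth can-i list pods -n some-other-namespace \
  --as=system:serviceaccount:monitoring:prometheus-k8s
```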
"k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under memory pressure.", + description = "{{ $labels.node }} is under memory pressure.", + } + + ALERT K8SNodeDiskPressure + IF kube_node_status_disk_pressure{condition="true"} == 1 + LABELS { + service = "k8s", + severity = "warning" + } + ANNOTATIONS { + summary = "Node is under disk pressure.", + description = "{{ $labels.node }} is under disk pressure.", + } prometheus.rules: |+ ALERT FailedReload IF prometheus_config_last_reload_successful == 0 diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index cdc3ffb66d6b056425beb34124b6fab70621ab17..0eac9630b41b68a2c66e4eeeb36cdfcc27d5fa87 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -9,6 +9,8 @@ spec: endpoints: - port: http-metrics interval: 30s + - port: cadvisor + interval: 30s honorLabels: true selector: matchLabels: