From 8b6ee5c18b9323888cc1c146beed819984656f92 Mon Sep 17 00:00:00 2001
From: Alexander Holte-Davidsen <alexander@kit.no>
Date: Mon, 5 Mar 2018 09:52:51 +0100
Subject: [PATCH] Add summary to Alertmanager rules where missing - updated
 according to guidelines

---
 assets/prometheus/rules/alertmanager.rules.yaml |  3 +++
 assets/prometheus/rules/kubelet.rules.yaml      |  1 +
 assets/prometheus/rules/kubernetes.rules.yaml   |  6 ++++++
 assets/prometheus/rules/node.rules.yaml         |  2 ++
 assets/prometheus/rules/prometheus.rules.yaml   |  5 +++++
 manifests/prometheus/prometheus-k8s-rules.yaml  | 17 +++++++++++++++++
 6 files changed, 34 insertions(+)

diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml
index fdfdfd0f..5e51f75b 100644
--- a/assets/prometheus/rules/alertmanager.rules.yaml
+++ b/assets/prometheus/rules/alertmanager.rules.yaml
@@ -11,6 +11,7 @@ groups:
     annotations:
       description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
+      summary: Configuration out of sync
   - alert: AlertmanagerDownOrMissing
     expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -20,6 +21,7 @@ groups:
     annotations:
       description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
+      summary: Alertmanager down or missing
   - alert: AlertmanagerFailedReload
     expr: alertmanager_config_last_reload_successful == 0
     for: 10m
@@ -28,3 +30,4 @@ groups:
     annotations:
       description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
+      summary: Alertmanager's configuration reload failed
diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml
index 0edd7878..85547dd6 100644
--- a/assets/prometheus/rules/kubelet.rules.yaml
+++ b/assets/prometheus/rules/kubelet.rules.yaml
@@ -26,6 +26,7 @@ groups:
       severity: warning
     annotations:
       description: Prometheus failed to scrape {{ $value }}% of kubelets.
+      summary: Prometheus failed to scrape kubelets
   - alert: K8SKubeletDown
     expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml
index f961ce6b..288841b7 100644
--- a/assets/prometheus/rules/kubernetes.rules.yaml
+++ b/assets/prometheus/rules/kubernetes.rules.yaml
@@ -51,6 +51,7 @@ groups:
     annotations:
       description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
+      summary: API server high latency
   - alert: APIServerLatencyHigh
     expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
@@ -60,6 +61,7 @@ groups:
     annotations:
       description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
+      summary: API server high latency
   - alert: APIServerErrorsHigh
     expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
@@ -68,6 +70,7 @@ groups:
       severity: warning
     annotations:
       description: API server returns errors for {{ $value }}% of requests
+      summary: API server request errors
   - alert: APIServerErrorsHigh
     expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
@@ -84,12 +87,14 @@ groups:
     annotations:
       description: No API servers are reachable or all have disappeared from service discovery
+      summary: No API servers are reachable
   - alert: K8sCertificateExpirationNotice
     labels:
       severity: warning
     annotations:
       description: Kubernetes API Certificate is expiring soon (less than 7 days)
+      summary: Kubernetes API Certificate is expiring soon
     expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
   - alert: K8sCertificateExpirationNotice
@@ -97,4 +102,5 @@ groups:
       severity: critical
     annotations:
       description: Kubernetes API Certificate is expiring in less than 1 day
+      summary: Kubernetes API Certificate is expiring
     expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml
index 0e7e1bbd..d14f0870 100644
--- a/assets/prometheus/rules/node.rules.yaml
+++ b/assets/prometheus/rules/node.rules.yaml
@@ -26,6 +26,7 @@ groups:
     annotations:
       description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
+      summary: Prometheus could not scrape a node-exporter
   - alert: NodeDiskRunningFull
     expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
     for: 30m
@@ -42,3 +43,4 @@ groups:
     annotations:
       description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
+      summary: Node disk is running full
diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml
index e006ba9b..43f2808c 100644
--- a/assets/prometheus/rules/prometheus.rules.yaml
+++ b/assets/prometheus/rules/prometheus.rules.yaml
@@ -8,6 +8,7 @@ groups:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+      summary: Reloading Prometheus' configuration failed
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
@@ -17,6 +18,7 @@ groups:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
+      summary: Prometheus' alert notification queue is running full
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -27,6 +29,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+      summary: Errors while sending alerts from Prometheus
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -37,6 +40,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+      summary: Errors while sending alerts from Prometheus
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
@@ -46,6 +50,7 @@ groups:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
+      summary: Prometheus is not connected to any Alertmanagers
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index c7cb14ac..05368dc1 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -20,6 +20,7 @@ data:
         annotations:
           description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
+          summary: Configuration out of sync
       - alert: AlertmanagerDownOrMissing
         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -29,6 +30,7 @@ data:
         annotations:
           description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
+          summary: Alertmanager down or missing
       - alert: AlertmanagerFailedReload
         expr: alertmanager_config_last_reload_successful == 0
         for: 10m
@@ -37,6 +39,7 @@ data:
         annotations:
           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
+          summary: Alertmanager's configuration reload failed
   etcd3.rules.yaml: |+
     groups:
     - name: ./etcd3.rules
@@ -363,6 +366,7 @@ data:
           severity: warning
         annotations:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
+          summary: Prometheus failed to scrape kubelets
       - alert: K8SKubeletDown
         expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
@@ -436,6 +440,7 @@ data:
         annotations:
           description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
+          summary: API server high latency
       - alert: APIServerLatencyHigh
         expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
@@ -445,6 +450,7 @@ data:
         annotations:
           description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
+          summary: API server high latency
       - alert: APIServerErrorsHigh
         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
@@ -453,6 +459,7 @@ data:
           severity: warning
         annotations:
           description: API server returns errors for {{ $value }}% of requests
+          summary: API server request errors
       - alert: APIServerErrorsHigh
         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
@@ -469,12 +476,14 @@ data:
         annotations:
           description: No API servers are reachable or all have disappeared from service discovery
+          summary: No API servers are reachable
       - alert: K8sCertificateExpirationNotice
         labels:
           severity: warning
         annotations:
           description: Kubernetes API Certificate is expiring soon (less than 7 days)
+          summary: Kubernetes API Certificate is expiring soon
         expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
       - alert: K8sCertificateExpirationNotice
@@ -482,6 +491,7 @@ data:
           severity: critical
         annotations:
           description: Kubernetes API Certificate is expiring in less than 1 day
+          summary: Kubernetes API Certificate is expiring
         expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
   node.rules.yaml: |+
     groups:
@@ -512,6 +522,7 @@ data:
         annotations:
           description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
+          summary: Prometheus could not scrape a node-exporter
       - alert: NodeDiskRunningFull
         expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
         for: 30m
@@ -528,6 +539,7 @@ data:
         annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})
+          summary: Node disk is running full
   prometheus.rules.yaml: |+
     groups:
     - name: prometheus.rules
@@ -539,6 +551,7 @@ data:
           severity: warning
         annotations:
           description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+          summary: Reloading Prometheus' configuration failed
       - alert: PrometheusNotificationQueueRunningFull
         expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
@@ -548,6 +561,7 @@ data:
         annotations:
           description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
+          summary: Prometheus' alert notification queue is running full
       - alert: PrometheusErrorSendingAlerts
         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -558,6 +572,7 @@ data:
         annotations:
           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+          summary: Errors while sending alerts from Prometheus
       - alert: PrometheusErrorSendingAlerts
         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
@@ -568,6 +583,7 @@ data:
         annotations:
           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+          summary: Errors while sending alerts from Prometheus
       - alert: PrometheusNotConnectedToAlertmanagers
         expr: prometheus_notifications_alertmanagers_discovered < 1
@@ -577,6 +593,7 @@ data:
         annotations:
           description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
+          summary: Prometheus is not connected to any Alertmanagers
       - alert: PrometheusTSDBReloadsFailing
         expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
--
GitLab
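
For reference, a minimal sketch of the rule shape this patch produces, using the K8sCertificateExpirationNotice alert whose fields appear in the hunks above; the `groups`/`name`/`rules` wrapper is assumed from the usual layout of these rule files and is not part of the diff:

```yaml
groups:
- name: kubernetes.rules   # group name assumed, not shown in the patch
  rules:
  - alert: K8sCertificateExpirationNotice
    labels:
      severity: warning
    annotations:
      # Existing long-form detail, unchanged by the patch.
      description: Kubernetes API Certificate is expiring soon (less than 7 days)
      # Short one-line headline added by the patch for notification templates.
      summary: Kubernetes API Certificate is expiring soon
    expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
```

A rule file in this format can be checked locally with `promtool check rules <file>` (assuming a Prometheus 2.x `promtool`) before the change is submitted.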