From 1b7c8cdf2199b4f6f070a280b6aa7248772dd291 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk <fbranczyk@gmail.com> Date: Mon, 16 Oct 2017 15:11:53 +0200 Subject: [PATCH] *: bump Prometheus to v2.0.0-rc.1 --- assets/prometheus/rules/alertmanager.rules | 36 - .../prometheus/rules/alertmanager.rules.yaml | 33 + assets/prometheus/rules/etcd3.rules | 177 --- assets/prometheus/rules/etcd3.rules.yaml | 123 ++ assets/prometheus/rules/general.rules | 63 - assets/prometheus/rules/general.rules.yaml | 48 + assets/prometheus/rules/kube-apiserver.rules | 28 - .../rules/kube-apiserver.rules.yaml | 22 + .../rules/kube-controller-manager.rules | 11 - .../rules/kube-controller-manager.rules.yaml | 13 + assets/prometheus/rules/kube-scheduler.rules | 11 - .../rules/kube-scheduler.rules.yaml | 13 + assets/prometheus/rules/kubelet.rules | 60 - assets/prometheus/rules/kubelet.rules.yaml | 49 + assets/prometheus/rules/kubernetes.rules | 171 --- assets/prometheus/rules/kubernetes.rules.yaml | 115 ++ assets/prometheus/rules/node.rules | 43 - assets/prometheus/rules/node.rules.yaml | 37 + assets/prometheus/rules/prometheus.rules | 10 - assets/prometheus/rules/prometheus.rules.yaml | 12 + hack/scripts/generate-rules-configmap.sh | 2 +- .../prometheus/prometheus-k8s-rules.yaml | 1095 +++++++---------- manifests/prometheus/prometheus-k8s.yaml | 2 +- 23 files changed, 942 insertions(+), 1232 deletions(-) delete mode 100644 assets/prometheus/rules/alertmanager.rules create mode 100644 assets/prometheus/rules/alertmanager.rules.yaml delete mode 100644 assets/prometheus/rules/etcd3.rules create mode 100644 assets/prometheus/rules/etcd3.rules.yaml delete mode 100644 assets/prometheus/rules/general.rules create mode 100644 assets/prometheus/rules/general.rules.yaml delete mode 100644 assets/prometheus/rules/kube-apiserver.rules create mode 100644 assets/prometheus/rules/kube-apiserver.rules.yaml delete mode 100644 assets/prometheus/rules/kube-controller-manager.rules create mode 100644 assets/prometheus/rules/kube-controller-manager.rules.yaml delete mode 100644 assets/prometheus/rules/kube-scheduler.rules create mode 100644 assets/prometheus/rules/kube-scheduler.rules.yaml delete mode 100644 assets/prometheus/rules/kubelet.rules create mode 100644 assets/prometheus/rules/kubelet.rules.yaml delete mode 100644 assets/prometheus/rules/kubernetes.rules create mode 100644 assets/prometheus/rules/kubernetes.rules.yaml delete mode 100644 assets/prometheus/rules/node.rules create mode 100644 assets/prometheus/rules/node.rules.yaml delete mode 100644 assets/prometheus/rules/prometheus.rules create mode 100644 assets/prometheus/rules/prometheus.rules.yaml diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules deleted file mode 100644 index 71bdc687..00000000 --- a/assets/prometheus/rules/alertmanager.rules +++ /dev/null @@ -1,36 +0,0 @@ -ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." 
- } - -ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - -ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml new file mode 100644 index 00000000..8f65c5da --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -0,0 +1,33 @@ +groups: +- name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules deleted file mode 100644 index 1b1621e4..00000000 --- a/assets/prometheus/rules/etcd3.rules +++ /dev/null @@ -1,177 +0,0 @@ -# general cluster availability - -# alert if another failed member will result in an unavailable cluster -ALERT InsufficientMembers -IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) -FOR 3m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", -} - -# etcd leader alerts -# ================== - -# alert if any etcd instance has no leader -ALERT NoLeader -IF etcd_server_has_leader{job="etcd"} == 0 -FOR 1m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", -} - -# alert if there are lots of leader changes -ALERT HighNumberOfLeaderChanges -IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", -} - -# gRPC request alerts -# =================== - -# alert if more than 1% of gRPC method calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of gRPC method calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of gRPC method calls take more than 150ms -ALERT GRPCRequestsSlow -IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", -} - -# HTTP requests alerts -# ==================== - -# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT 
HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow -IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", -} - -# etcd member communication alerts -# ================================ - -# alert if 99th percentile of round trips take 150ms -ALERT EtcdMemberCommunicationSlow -IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", -} - -# etcd proposal alerts -# ==================== - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals -IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", -} - -# etcd disk io latency alerts -# =========================== - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations -IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", -} - -# alert if 99th percentile of commit durations is higher than 250ms -ALERT HighCommitDurations -IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", -} diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml new file mode 100644 index 00000000..ade2ed62 --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules.yaml @@ -0,0 +1,123 @@ +groups: +- name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value 
}} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, 
rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules deleted file mode 100644 index 3500d689..00000000 --- a/assets/prometheus/rules/general.rules +++ /dev/null @@ -1,63 +0,0 @@ -### Up Alerting ### - -Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." - } - -### Dead man's switch ### - -ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - -### File descriptor alerts ### - -ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml new file mode 100644 index 00000000..355e12f3 --- /dev/null +++ b/assets/prometheus/rules/general.rules.yaml @@ -0,0 +1,48 @@ +groups: +- name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
+ summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules deleted file mode 100644 index 04b4a6de..00000000 --- a/assets/prometheus/rules/kube-apiserver.rules +++ /dev/null @@ -1,28 +0,0 @@ -ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. -# -# apiserver_request_latencies' unit is microseconds -ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml new file mode 100644 index 00000000..55ebe025 --- /dev/null +++ b/assets/prometheus/rules/kube-apiserver.rules.yaml @@ -0,0 +1,22 @@ +groups: +- name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules deleted file mode 100644 index 3157cd12..00000000 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml new file mode 100644 index 00000000..f23bbde3 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules deleted file mode 100644 index ee86017a..00000000 --- a/assets/prometheus/rules/kube-scheduler.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml new file mode 100644 index 00000000..0383b3b1 --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules deleted file mode 100644 index 0d47d9d7..00000000 --- a/assets/prometheus/rules/kubelet.rules +++ /dev/null @@ -1,60 +0,0 @@ -ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - -ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml new file mode 100644 index 00000000..1aa5f84c --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -0,0 +1,49 @@ +groups: +- name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules deleted file mode 100644 index 084d11e5..00000000 --- a/assets/prometheus/rules/kubernetes.rules +++ /dev/null @@ -1,171 +0,0 @@ -# NOTE: These rules were kindly contributed by the SoundCloud engineering team. - -### Container resources ### - -cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_oom:rate = - sum by 
(cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -### Cluster resources ### - -cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - -cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - -### API latency ### - -# Raw metrics are in microseconds. Convert to seconds. -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - -### Scheduling latency ### - -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml new file mode 100644 index 00000000..ab5ccf06 --- /dev/null +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -0,0 +1,115 @@ +groups: +- name: ./kubernetes.rules + rules: + 
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, 
sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules deleted file mode 100644 index 4f768671..00000000 --- a/assets/prometheus/rules/node.rules +++ /dev/null @@ -1,43 +0,0 @@ -ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - -ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - -ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = 
"{{ $labels.node }} is under memory pressure.", - } - -ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml new file mode 100644 index 00000000..9c1641ca --- /dev/null +++ b/assets/prometheus/rules/node.rules.yaml @@ -0,0 +1,37 @@ +groups: +- name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. + - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' + summary: Node is under disk pressure. diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules deleted file mode 100644 index 05c278f1..00000000 --- a/assets/prometheus/rules/prometheus.rules +++ /dev/null @@ -1,10 +0,0 @@ -ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml new file mode 100644 index 00000000..6ed0cd68 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -0,0 +1,12 @@ +groups: +- name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Prometheus configuration reload has failed diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index b8e00fef..9eb2efc8 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -11,7 +11,7 @@ metadata: data: EOF -for f in assets/prometheus/rules/*.rules +for f in assets/prometheus/rules/*.rules.yaml do echo " $(basename $f): |+" cat $f | sed "s/^/ /g" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index de3d7787..041c127b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,623 +6,478 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - alertmanager.rules: |+ - ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." - } - - ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - - ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } - etcd3.rules: |+ - # general cluster availability - - # alert if another failed member will result in an unavailable cluster - ALERT InsufficientMembers - IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", - } - - # etcd leader alerts - # ================== - - # alert if any etcd instance has no leader - ALERT NoLeader - IF etcd_server_has_leader{job="etcd"} == 0 - FOR 1m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", - } - - # alert if there are lots of leader changes - ALERT HighNumberOfLeaderChanges - IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", - } - - # gRPC request alerts - # =================== - - # alert if more than 1% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of gRPC method calls take more than 150ms - ALERT GRPCRequestsSlow - IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", - } - - # HTTP requests alerts - # ==================== - - # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 - FOR 5m - 
LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", - } - - # etcd member communication alerts - # ================================ - - # alert if 99th percentile of round trips take 150ms - ALERT EtcdMemberCommunicationSlow - IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", - } - - # etcd proposal alerts - # ==================== - - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - - # etcd disk io latency alerts - # =========================== - - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", - } - - # alert if 99th percentile of commit durations is higher than 250ms - ALERT HighCommitDurations - IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", - } - general.rules: |+ - ### Up Alerting ### - - Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." 
- } - - ### Dead man's switch ### - - ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - - ### File descriptor alerts ### - - ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - - instance:fd_utilization = process_open_fds / process_max_fds - - # alert if file descriptors are likely to exhaust within the next 4 hours - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - - # alert if file descriptors are likely to exhaust within the next hour - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - kube-apiserver.rules: |+ - ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - # - # apiserver_request_latencies' unit is microseconds - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - kube-controller-manager.rules: |+ - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } - kubelet.rules: |+ - ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - - ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - kubernetes.rules: |+ - # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
- - ### Container resources ### - - cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_oom:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - ### Cluster resources ### - - cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - - cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - - ### API latency ### - - # Raw metrics are in microseconds. Convert to seconds. 
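As the comment above says, apiserver_request_latencies_bucket and the scheduler latency histograms recorded below are exposed in microseconds, so each quantile rule divides by 1e6 to publish seconds; a 99th-percentile value of 250000 microseconds, for instance, becomes 0.25 s. The K8SApiServerLatency alert earlier in this patch applies the same conversion before comparing against its 1 s threshold. A minimal sketch of that conversion, with the expression copied from the alert:

    # 99th percentile request latency, converted from microseconds to seconds,
    # compared against a 1 s threshold
    histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0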
- cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - - ### Scheduling latency ### - - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - kube-scheduler.rules: |+ - ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } - node.rules: |+ - ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - - ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - - ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = "{{ $labels.node }} is under memory pressure.", - } - - ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } - prometheus.rules: |+ - ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } + alertmanager.rules.yaml: |+ + groups: + - name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed + etcd3.rules.yaml: |+ + groups: + - name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ 
$labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + general.rules.yaml: |+ + groups: + - name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' + summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + kube-apiserver.rules.yaml: |+ + groups: + - name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. 
+ summary: Kubernetes apiserver latency is high + kube-controller-manager.rules.yaml: |+ + groups: + - name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kube-scheduler.rules.yaml: |+ + groups: + - name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down + kubelet.rules.yaml: |+ + groups: + - name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes.rules.yaml: |+ + groups: + - name: ./kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY 
(cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + node.rules.yaml: |+ + groups: + - name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. 
+ - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' + summary: Node is under disk pressure. + prometheus.rules.yaml: |+ + groups: + - name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Prometheus configuration reload has failed diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index e936de46..168daa34 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.7.2 + version: v2.0.0-rc.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: -- GitLab
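The version bump above moves the prometheus-k8s custom resource to v2.0.0-rc.1, which reads only the new YAML rule-group format added in this patch; the removed *.rules files in the 1.x syntax would no longer be accepted. After rolling this out, one way to confirm the converted groups loaded and evaluate cleanly is to query Prometheus' own health metrics. A minimal sketch, assuming the rule-evaluation failure counter keeps this name in 2.0.0-rc.1 (the reload metric is the one already alerted on in prometheus.rules.yaml):

    # 1 while the last configuration/rule reload succeeded
    prometheus_config_last_reload_successful == 1

    # should report no failures once the converted groups evaluate cleanly
    # (metric name assumed for this release; verify on the running server)
    rate(prometheus_rule_evaluation_failures_total[5m]) == 0

The converted files can also be linted offline with the promtool binary shipped alongside this release, assuming its check rules subcommand is already available in the release candidate.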