diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules
index 9e26ab9a1500e975d41a4039cf75eb9778e12def..3500d6899015395e684f60c73ae88080563a237e 100644
--- a/assets/prometheus/rules/general.rules
+++ b/assets/prometheus/rules/general.rules
@@ -1,14 +1,14 @@
 ### Up Alerting ###
 
 Alert TargetDown
-  IF 100 * (count(up == 0) / count(up)) > 3
+  IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
     summary = "Targets are down",
-    description = "More than {{ $value }}% of targets are down."
+    description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
   }
 
 ### Dead man's switch ###
diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules
index c041881abb2c398ae342ea875cae02ee19e1d497..a7fdfddc045a00a9a2fa92caf8cf4fad5b9aaf57 100644
--- a/assets/prometheus/rules/kube-apiserver.rules
+++ b/assets/prometheus/rules/kube-apiserver.rules
@@ -1,5 +1,5 @@
 ALERT K8SApiserverDown
-  IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+  IF absent(up{job="apiserver"} == 1)
   FOR 5m
   LABELS {
     severity = "critical"
diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules
index f75e27680d8cc543582b73e72f831cdbd5e25764..3157cd12bbc76fe266aecdbd5df754cd70b990e2 100644
--- a/assets/prometheus/rules/kube-controller-manager.rules
+++ b/assets/prometheus/rules/kube-controller-manager.rules
@@ -1,5 +1,5 @@
 ALERT K8SControllerManagerDown
-  IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
+  IF absent(up{job="kube-controller-manager"} == 1)
   FOR 5m
   LABELS {
     severity = "critical",
@@ -7,4 +7,5 @@ ALERT K8SControllerManagerDown
   ANNOTATIONS {
     summary = "Controller manager is down",
     description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
+    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
   }
diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules
index 6eff4bcdc010b6d7682d7d97e38af7b674a9384e..ee86017abb0442f23a29cc96a0cc91dbe009bb5c 100644
--- a/assets/prometheus/rules/kube-scheduler.rules
+++ b/assets/prometheus/rules/kube-scheduler.rules
@@ -1,5 +1,5 @@
 ALERT K8SSchedulerDown
-  IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
+  IF absent(up{job="kube-scheduler"} == 1)
   FOR 5m
   LABELS {
     severity = "critical",
@@ -7,4 +7,5 @@ ALERT K8SSchedulerDown
   ANNOTATIONS {
     summary = "Scheduler is down",
     description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
   }
diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules
index cbcd576c4685e069be4a62fa2e21e226a02122f8..8c0843ce23a7daba95f4ccc9330a0cf0023078d6 100644
--- a/assets/prometheus/rules/kubelet.rules
+++ b/assets/prometheus/rules/kubelet.rules
@@ -11,24 +11,24 @@ ALERT K8SNodeNotReady
 
 ALERT K8SManyNodesNotReady
   IF
-    count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
+    count(kube_node_status_ready{condition="true"} == 0) > 1
     AND
       (
-        count by (cluster) (kube_node_status_ready{condition="true"} == 0)
+        count(kube_node_status_ready{condition="true"} == 0)
       /
-        count by (cluster) (kube_node_status_ready{condition="true"})
+        count(kube_node_status_ready{condition="true"})
       ) > 0.2
   FOR 1m
   LABELS {
     severity = "critical",
   }
   ANNOTATIONS {
-    summary = "Many K8s nodes are Not Ready",
-    description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
+    summary = "Many Kubernetes nodes are Not Ready",
+    description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
   }
 
 ALERT K8SKubeletDown
-  IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
+  IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
   FOR 1h
   LABELS {
     severity = "warning",
@@ -39,7 +39,7 @@ ALERT K8SKubeletDown
   }
 
 ALERT K8SKubeletDown
-  IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+  IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
   FOR 1h
   LABELS {
     severity = "critical",
diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules
index 8fd5b7d0363f9b2b3be9b5f038333db9ff61a872..36ea482cf5dd63ef1847656498213c92c07c4ef0 100644
--- a/assets/prometheus/rules/node.rules
+++ b/assets/prometheus/rules/node.rules
@@ -1,10 +1,10 @@
 ALERT NodeExporterDown
-  IF up{job="node-exporter"} == 0
+  IF absent(up{job="node-exporter"} == 1)
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
     summary = "node-exporter cannot be scraped",
-    description = "Prometheus could not scrape a node-exporter for more than 10m.",
+    description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
   }
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index f27cf33bee7f55c85f5a03c4e1afcf406acf92bb..181a70c788610186aff459ad42c98559f6b7719c 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -225,14 +225,14 @@ data:
     ### Up Alerting ###
 
     Alert TargetDown
-      IF 100 * (count(up == 0) / count(up)) > 3
+      IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
       FOR 10m
       LABELS {
         severity = "warning"
       }
       ANNOTATIONS {
         summary = "Targets are down",
-        description = "More than {{ $value }}% of targets are down."
+        description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
       }
 
     ### Dead man's switch ###
@@ -287,7 +287,7 @@ data:
       }
   kube-apiserver.rules: |+
     ALERT K8SApiserverDown
-      IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
+      IF absent(up{job="apiserver"} == 1)
       FOR 5m
       LABELS {
         severity = "critical"
@@ -316,7 +316,7 @@ data:
       }
   kube-controller-manager.rules: |+
     ALERT K8SControllerManagerDown
-      IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
+      IF absent(up{job="kube-controller-manager"} == 1)
       FOR 5m
       LABELS {
         severity = "critical",
@@ -324,6 +324,7 @@ data:
       ANNOTATIONS {
         summary = "Controller manager is down",
         description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
+        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
       }
   kubelet.rules: |+
     ALERT K8SNodeNotReady
@@ -339,24 +340,24 @@ data:
 
     ALERT K8SManyNodesNotReady
       IF
-        count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
+        count(kube_node_status_ready{condition="true"} == 0) > 1
         AND
           (
-            count by (cluster) (kube_node_status_ready{condition="true"} == 0)
+            count(kube_node_status_ready{condition="true"} == 0)
           /
-            count by (cluster) (kube_node_status_ready{condition="true"})
+            count(kube_node_status_ready{condition="true"})
           ) > 0.2
       FOR 1m
       LABELS {
         severity = "critical",
       }
       ANNOTATIONS {
-        summary = "Many K8s nodes are Not Ready",
-        description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
+        summary = "Many Kubernetes nodes are Not Ready",
+        description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
       }
 
     ALERT K8SKubeletDown
-      IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03
+      IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
       FOR 1h
       LABELS {
         severity = "warning",
@@ -367,7 +368,7 @@ data:
       }
 
     ALERT K8SKubeletDown
-      IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
+      IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
       FOR 1h
       LABELS {
         severity = "critical",
@@ -560,7 +561,7 @@ data:
       histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
  kube-scheduler.rules: |+
     ALERT K8SSchedulerDown
-      IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
+      IF absent(up{job="kube-scheduler"} == 1)
       FOR 5m
       LABELS {
         severity = "critical",
@@ -568,17 +569,18 @@ data:
       ANNOTATIONS {
         summary = "Scheduler is down",
         description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+        runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
       }
   node.rules: |+
     ALERT NodeExporterDown
-      IF up{job="node-exporter"} == 0
+      IF absent(up{job="node-exporter"} == 1)
       FOR 10m
       LABELS {
         severity = "warning"
       }
       ANNOTATIONS {
         summary = "node-exporter cannot be scraped",
-        description = "Prometheus could not scrape a node-exporter for more than 10m.",
+        description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
       }
   prometheus.rules: |+
     ALERT FailedReload
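Note on the absent(up{job="..."} == 1) form used throughout this change: absent() returns 1 only when the vector handed to it is empty. Because the == 1 filter is applied first, the inner vector is empty both when the job has vanished from service discovery and when every remaining target reports up == 0, so one expression covers the two failure modes that previously required an absent() combined with a count(). A minimal sketch of the behavior, using a hypothetical series value purely for illustration:

    up{job="apiserver", instance="10.0.0.1"}  = 0       target is scraped but down
    absent(up{job="apiserver"})               -> empty   the series exists, so the old form would not fire
    absent(up{job="apiserver"} == 1)          -> 1       no series equals 1, so the alert fires after FOR 5m

The TargetDown change is independent of this idiom: aggregating with count by(job) raises one alert per job once 10% of that job's targets are down, instead of a single alert across all targets at a 3% threshold.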