diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules index 71bdc687645cdf3e1589f53a31009136162b970f..30a70ee395bfef538a90a01299c11b4596cb6b4b 100644 --- a/assets/prometheus/rules/alertmanager.rules +++ b/assets/prometheus/rules/alertmanager.rules @@ -4,7 +4,7 @@ ALERT AlertmanagerConfigInconsistent label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 FOR 5m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "Alertmanager configurations are inconsistent", @@ -17,7 +17,7 @@ ALERT AlertmanagerDownOrMissing sum by(job) (up) != 1 FOR 5m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Alertmanager down or not discovered", @@ -28,7 +28,7 @@ ALERT FailedReload IF alertmanager_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Alertmanager configuration reload has failed", diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules index 9e26ab9a1500e975d41a4039cf75eb9778e12def..7b406f0783af5a217e5ac539ad7449f56dd787a6 100644 --- a/assets/prometheus/rules/general.rules +++ b/assets/prometheus/rules/general.rules @@ -4,7 +4,7 @@ Alert TargetDown IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Targets are down", @@ -15,9 +15,6 @@ Alert TargetDown ALERT DeadMansSwitch IF vector(1) - LABELS { - severity = "none", - } ANNOTATIONS { summary = "Alerting DeadMansSwitch", description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", @@ -29,7 +26,7 @@ ALERT TooManyOpenFileDescriptors IF 100 * (process_open_fds / process_max_fds) > 95 FOR 10m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "too many open file descriptors", @@ -43,7 +40,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "file descriptors soon exhausted", @@ -55,7 +52,7 @@ ALERT FdExhaustionClose IF predict_linear(instance:fd_utilization[10m], 3600) > 1 FOR 10m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "file descriptors soon exhausted", diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules index c041881abb2c398ae342ea875cae02ee19e1d497..be6dc97ffc5f7890fdae79a372987b04b6537fcf 100644 --- a/assets/prometheus/rules/kube-apiserver.rules +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -2,7 +2,7 @@ ALERT K8SApiserverDown IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) FOR 5m LABELS { - severity = "critical" + severity = "page" } ANNOTATIONS { summary = "API server unreachable", @@ -20,7 +20,7 @@ ALERT K8SApiServerLatency ) / 1e6 > 1.0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Kubernetes apiserver latency is high", diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules index f75e27680d8cc543582b73e72f831cdbd5e25764..905462736e0f487681d731ed61072f471afcf204 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -2,7 +2,7 @@ ALERT K8SControllerManagerDown IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) FOR 5m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Controller manager is down", diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules index 6eff4bcdc010b6d7682d7d97e38af7b674a9384e..80e954ddc97779418a756e8a24288e76a0ff49ab 100644 --- a/assets/prometheus/rules/kube-scheduler.rules +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -2,7 +2,7 @@ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Scheduler is down", diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules index cbcd576c4685e069be4a62fa2e21e226a02122f8..124d8dd0d3684704d6d5fedc0b7f0530a789cf72 100644 --- a/assets/prometheus/rules/kubelet.rules +++ b/assets/prometheus/rules/kubelet.rules @@ -2,7 +2,7 @@ ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Node status is NotReady", @@ -20,7 +20,7 @@ ALERT K8SManyNodesNotReady ) > 0.2 FOR 1m LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Many K8s nodes are Not Ready", @@ -31,7 +31,7 @@ ALERT K8SKubeletDown IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 FOR 1h LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -42,7 +42,7 @@ ALERT K8SKubeletDown IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { - severity = "critical", + severity = "page", } ANNOTATIONS { summary = "Many Kubelets cannot be scraped", @@ -52,7 +52,7 @@ ALERT K8SKubeletDown ALERT K8SKubeletTooManyPods IF kubelet_running_pod_count > 100 LABELS { - severity = "warning", + severity = "ticket", } ANNOTATIONS { summary = "Kubelet is close to pod limit", diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules index 8fd5b7d0363f9b2b3be9b5f038333db9ff61a872..9844947a38473608418195bf846e15e8952e8472 100644 --- a/assets/prometheus/rules/node.rules +++ b/assets/prometheus/rules/node.rules @@ -2,7 +2,7 @@ ALERT NodeExporterDown IF up{job="node-exporter"} == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "node-exporter cannot be scraped", diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules index 05c278f1c5606238c8b8edbd24d1df820972ade9..c29ed6ca3b444d2a771254fbe046aed3b125a814 100644 --- a/assets/prometheus/rules/prometheus.rules +++ b/assets/prometheus/rules/prometheus.rules @@ -2,7 +2,7 @@ ALERT FailedReload IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - severity = "warning" + severity = "ticket" } ANNOTATIONS { summary = "Prometheus configuration reload has failed",