diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules
index ac78229775b2faa23455af3c70527452348ca629..9e26ab9a1500e975d41a4039cf75eb9778e12def 100644
--- a/assets/prometheus/rules/general.rules
+++ b/assets/prometheus/rules/general.rules
@@ -1,14 +1,14 @@
 ### Up Alerting ###
 
 Alert TargetDown
-  IF up == 0
+  IF 100 * (count(up == 0) / count(up)) > 3
   FOR 10m
   LABELS {
     severity = "warning"
   }
   ANNOTATIONS {
-    summary = "target is down",
-    description = "A target of type {{ $labels.job }} is down."
+    summary = "Targets are down",
+    description = "{{ $value }}% of targets are down."
   }
 
 ### Dead man's switch ###
@@ -25,26 +25,15 @@ ALERT DeadMansSwitch
 
 ### File descriptor alerts ###
 
-ALERT TooManyOpenFiles
-  IF 100*process_open_fds / process_max_fds > 50
-  FOR 10m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "too many open file descriptors",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
-  }
-
-ALERT K8STooManyOpenFiles
-  IF 100*process_open_fds / process_max_fds > 80
+ALERT TooManyOpenFileDescriptors
+  IF 100 * (process_open_fds / process_max_fds) > 95
   FOR 10m
   LABELS {
     severity = "critical"
   }
   ANNOTATIONS {
     summary = "too many open file descriptors",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
   }
 
 instance:fd_utilization = process_open_fds / process_max_fds
@@ -58,7 +47,7 @@ ALERT FdExhaustionClose
   }
   ANNOTATIONS {
     summary = "file descriptors soon exhausted",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its file/socket descriptors soon",
   }
 
 # alert if file descriptors are likely to exhaust within the next hour
@@ -70,40 +59,5 @@ ALERT FdExhaustionClose
   }
   ANNOTATIONS {
     summary = "file descriptors soon exhausted",
-    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon",
-  }
-
-### Contrack alerts ###
-
-# To catch the conntrack sysctl de-tuning when it happens
-ALERT ConntrackTuningMissing
-  IF node_nf_conntrack_udp_timeout > 10
-  FOR 10m
-  LABELS {
-    severity = "warning",
-  }
-  ANNOTATIONS {
-    summary = "Node does not have the correct conntrack tunings",
-    description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
-  }
-
-ALERT ConntrackTableFull
-  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
-  FOR 10m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "Number of tracked connections is near the limit",
-    description = "The nf_conntrack table is {{ $value }}% full.",
-  }
-
-ALERT ConntrackTableFull
-  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
-  LABELS {
-    severity = "critical"
-  }
-  ANNOTATIONS {
-    summary = "Number of tracked connections is near the limit",
-    description = "The nf_conntrack table is {{ $value }}% full.",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) will exhaust its file/socket descriptors soon",
   }
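Note on the reworked TargetDown expression above: `count(up == 0) / count(up)` aggregates across all jobs, so a small job that is entirely down can hide behind a large healthy one. If per-job granularity is wanted, a `by (job)` variant would look like the following sketch (the alert name is hypothetical and not part of this change):

# Sketch only: per-job variant of TargetDown, firing when more than 3%
# of any single job's targets are down.
ALERT TargetDownPerJob
  IF 100 * (count by (job) (up == 0) / count by (job) (up)) > 3
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "{{ $value }}% of {{ $labels.job }} targets are down."
  }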
diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules
index 6c58fe52127af76dc52e8b139fab02abb4eff7b3..8d8d13923c0511bc478075907e60c1b79109a517 100644
--- a/assets/prometheus/rules/kube-apiserver.rules
+++ b/assets/prometheus/rules/kube-apiserver.rules
@@ -1,15 +1,3 @@
-ALERT K8SApiserverDown
-  IF up{job="apiserver"} == 0
-  FOR 15m
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "API server unreachable",
-    description = "An API server could not be scraped.",
-  }
-
-# Disable for non HA kubernetes setups.
 ALERT K8SApiserverDown
   IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"}))
   FOR 5m
@@ -18,7 +6,7 @@ ALERT K8SApiserverDown
   }
   ANNOTATIONS {
     summary = "API server unreachable",
-    description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
+    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
   }
 
 # Some verbs excluded because they are expected to be long-lasting:
diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules
index c3cc8e9b30b262150ed501f6d3ecda3f3aa7abc6..cbcd576c4685e069be4a62fa2e21e226a02122f8 100644
--- a/assets/prometheus/rules/kubelet.rules
+++ b/assets/prometheus/rules/kubelet.rules
@@ -1,14 +1,3 @@
-ALERT K8SNodeDown
-  IF up{job="kubelet"} == 0
-  FOR 1h
-  LABELS {
-    severity = "warning"
-  }
-  ANNOTATIONS {
-    summary = "Kubelet cannot be scraped",
-    description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
-  }
-
 ALERT K8SNodeNotReady
   IF kube_node_status_ready{condition="true"} == 0
   FOR 1h
@@ -38,16 +27,26 @@ ALERT K8SManyNodesNotReady
     description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
   }
 
+ALERT K8SKubeletDown
+  IF 100 * count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 3
+  FOR 1h
+  LABELS {
+    severity = "warning",
+  }
+  ANNOTATIONS {
+    summary = "Many Kubelets cannot be scraped",
+    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
+  }
+
 ALERT K8SKubeletDown
   IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
   FOR 1h
   LABELS {
-    service = "k8s",
-    severity = "critical"
+    severity = "critical",
   }
   ANNOTATIONS {
     summary = "Many Kubelets cannot be scraped",
     description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
   }
 
 ALERT K8SKubeletTooManyPods
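Note on the K8SKubeletDown pair above: the critical tier keeps the `absent(...)` guard because a pure ratio can never fire once every kubelet disappears from service discovery; `up{job="kubelet"}` then has no series at all and `count(...) / count(...)` evaluates to an empty result. A sketch of the two branches evaluated in isolation (expressions copied from the critical rule above):

# No kubelets discovered at all: the ratio branch returns no data,
# while absent() returns a single sample of value 1 and keeps the alert live.
absent(up{job="kubelet"})

# Kubelets discovered but some down: absent() returns nothing, and the
# ratio branch yields the down fraction compared against 0.1 (10%).
count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1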
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index 23638f354ea2f0b8400097eca7a1edb4d133642d..14284560ced4093f7ce3f28b5d4d4630f24222f7 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -625,6 +625,7 @@ data:
       ANNOTATIONS {
         summary = "Scheduler is down",
         description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
+        runbook = "https://github.com/coreos/tectonic-installer/blob/master/Documentation/troubleshooting/controller-recovery.md#disaster-recovery-of-scheduler-and-controller-manager-pods"
       }
   node.rules: |+
     ALERT NodeExporterDown
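The new `runbook` annotation only pays off if the notification pipeline surfaces it. A minimal Alertmanager receiver sketch that does so (the receiver and channel names are illustrative assumptions, not part of this change):

receivers:
- name: 'k8s-oncall'
  slack_configs:
  - channel: '#k8s-alerts'
    # Append the runbook link from the alert's annotations to the message.
    text: '{{ .CommonAnnotations.description }} Runbook: {{ .CommonAnnotations.runbook }}'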