diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules new file mode 100644 index 0000000000000000000000000000000000000000..71bdc687645cdf3e1589f53a31009136162b970f --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules @@ -0,0 +1,36 @@ +ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." + } + +ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." + } + +ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." + } diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules deleted file mode 100644 index 10fa5e8d7e3fdb4f03d8138f192f038f2508df42..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/etcd2.rules +++ /dev/null @@ -1,121 +0,0 @@ -### General cluster availability ### - -# alert if another failed peer will result in an unavailable cluster -ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - -### HTTP requests alerts ### - -# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if 50% of requests get a 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) 
(rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", - } - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", - } - -### File descriptor alerts ### - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -### etcd proposal alerts ### - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - -### etcd disk io latency alerts ### - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", - } diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules new file mode 100644 index 0000000000000000000000000000000000000000..a3b2cddd30393d4268e090000bcbfe624f1d5c79 --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules @@ -0,0 +1,177 @@ +# general cluster availability + +# alert if another failed member will result in an unavailable cluster +ALERT InsufficientMembers +IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) +FOR 3m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", +} + +# etcd leader alerts +# ================== + +# alert if any etcd instance has no leader +ALERT NoLeader +IF etcd_server_has_leader{job="etcd"} == 0 +FOR 1m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", +} + +# alert if there are lots of leader changes +ALERT HighNumberOfLeaderChanges +IF 
increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of leader changes within the etcd cluster are happening",
+  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
+}
+
+# gRPC request alerts
+# ===================
+
+# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
+ALERT HighNumberOfFailedGRPCRequests
+IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
+ALERT HighNumberOfFailedGRPCRequests
+IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if the 99th percentile of gRPC method calls take more than 150ms
+ALERT GRPCRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "slow gRPC requests",
+  description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
+}
+
+# HTTP requests alerts
+# ====================
+
+# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
+ALERT HighNumberOfFailedHTTPRequests
+IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+}
+
+# alert if the 99th percentile of HTTP requests take more than 150ms
+ALERT HTTPRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "slow HTTP requests",
+  description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+}
+
+# etcd member communication alerts
+# ================================
+
+# alert if 99th percentile of round trips take 150ms
+ALERT EtcdMemberCommunicationSlow
+IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "etcd member communication is slow",
+  description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
+}
+
+# etcd proposal alerts
+# ====================
+
+# alert if there are several failed proposals within an hour
+ALERT HighNumberOfFailedProposals
+IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of proposals within the etcd cluster are failing",
+  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+}
+
+# etcd disk io latency alerts
+# ===========================
+
+# alert if 99th percentile of fsync durations is higher than 500ms
+ALERT HighFsyncDurations
+IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high fsync durations",
+  description = "etcd instance {{ $labels.instance }} fsync durations are high",
+}
+
+# alert if 99th percentile of commit durations is higher than 250ms
+ALERT HighCommitDurations
+IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high commit durations",
+  description = "etcd instance {{ $labels.instance }} commit durations are high",
+}
diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules
new file mode 100644
index 0000000000000000000000000000000000000000..9a8f931f6880c7c9e3c3334c0ef85c5eba13942c
--- /dev/null
+++ b/assets/prometheus/rules/general.rules
@@ -0,0 +1,97 @@
+### Up Alerting ###
+
+ALERT TargetDown
+  IF up == 0
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "target is down",
+    description = "A target of type {{ $labels.job }} is down."
+  }
+
+### File descriptor alerts ###
+
+ALERT TooManyOpenFiles
+  IF 100*process_open_fds / process_max_fds > 50
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "too many open file descriptors",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
+  }
+
+ALERT K8STooManyOpenFiles
+  IF 100*process_open_fds / process_max_fds > 80
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "too many open file descriptors",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.",
+  }
+
+instance:fd_utilization = process_open_fds / process_max_fds
+
+# alert if file descriptors are likely to exhaust within the next 4 hours
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} will exhaust its file descriptors soon",
+  }
+
+# alert if file descriptors are likely to exhaust within the next hour
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} will exhaust its file descriptors soon",
+  }
+
+### Conntrack alerts ###
+
+# To catch the conntrack sysctl de-tuning when it happens
+ALERT ConntrackTuningMissing
+  IF node_nf_conntrack_udp_timeout > 10
+  FOR 10m
+  LABELS {
+    severity = "warning",
+  }
+  ANNOTATIONS {
+    summary = "Node does not have the correct conntrack tunings",
+    description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
+  }
+
+ALERT ConntrackTableFull
+  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "Number of tracked connections is near the limit",
+    description = "The nf_conntrack table is {{ $value }}% full.",
+  }
+
+ALERT ConntrackTableFull
+  IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Number of tracked connections is near the limit",
+    description = "The nf_conntrack table is {{ $value }}% full.",
+  }
diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules
new file mode 100644
index 0000000000000000000000000000000000000000..fadaf5b405cb5611ac1a091bbb79b2a71e1c8b95
--- /dev/null
+++ b/assets/prometheus/rules/kube-apiserver.rules
@@ -0,0 +1,38 @@
+ALERT K8SApiserverDown
+  IF up{job="apiserver"} == 0
+  FOR 15m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "API server unreachable",
+    description = "An API server could not be scraped.",
+  }
+
+# Disable for non HA kubernetes setups.
+ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`. +ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules new file mode 100644 index 0000000000000000000000000000000000000000..f75e27680d8cc543582b73e72f831cdbd5e25764 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -0,0 +1,10 @@ +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules new file mode 100644 index 0000000000000000000000000000000000000000..6eff4bcdc010b6d7682d7d97e38af7b674a9384e --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -0,0 +1,10 @@ +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", + } diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules new file mode 100644 index 0000000000000000000000000000000000000000..c3cc8e9b30b262150ed501f6d3ecda3f3aa7abc6 --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules @@ -0,0 +1,61 @@ +ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index 157eb3fa8cb61b79ac7340a12b5d89542f3ff28b..084d11e53f4b768b38a3e43708a5936289aed31d 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - -ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - -ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) 
(kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - -ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - -# Disable for non HA kubernetes setups. -ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", - } - -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -# To catch the conntrack sysctl de-tuning when it happens -ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
-ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - -ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules new file mode 100644 index 0000000000000000000000000000000000000000..8fd5b7d0363f9b2b3be9b5f038333db9ff61a872 --- /dev/null +++ b/assets/prometheus/rules/node.rules @@ -0,0 +1,10 @@ +ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", + } diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules new file mode 100644 index 0000000000000000000000000000000000000000..05c278f1c5606238c8b8edbd24d1df820972ade9 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules @@ -0,0 +1,10 @@ +ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." + } diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 71f72da12436dd027dde8b158e97774db119ef0a..f57b678559077b52b7cd01f477b804fa40fd7aa2 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,75 +6,258 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - etcd2.rules: |+ - ### General cluster availability ### - - # alert if another failed peer will result in an unavailable cluster - ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m + alertmanager.rules: |+ + ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m LABELS { severity = "critical" } ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." 
} - ### HTTP requests alerts ### - - # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m + ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m LABELS { severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." } - # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m + ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if 50% of requests get a 4xx response + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } + etcd3.rules: |+ + # general cluster availability + + # alert if another failed member will result in an unavailable cluster + ALERT InsufficientMembers + IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", + } + + # etcd leader alerts + # ================== + + # alert if any etcd instance has no leader + ALERT NoLeader + IF etcd_server_has_leader{job="etcd"} == 0 + FOR 1m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", + } + + # alert if there are lots of leader changes + ALERT HighNumberOfLeaderChanges + IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", + } + + # gRPC request alerts + # =================== + + # alert if more than 1% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of gRPC method calls take more than 150ms + ALERT GRPCRequestsSlow + IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "slow gRPC requests", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $label.grpc_method }} are slow", + } + + # HTTP requests alerts + # ==================== + + # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) 
(rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of HTTP requests take more than 150ms + ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + } + + # etcd member communication alerts + # ================================ + + # alert if 99th percentile of round trips take 150ms + ALERT EtcdMemberCommunicationSlow + IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "etcd member communication is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $label.To }} is slow", + } + + # etcd proposal alerts + # ==================== + + # alert if there are several failed proposals within an hour + ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of proposals within the etcd cluster are failing", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + + # etcd disk io latency alerts + # =========================== + + # alert if 99th percentile of fsync durations is higher than 500ms + ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fync durations are high", + } + + # alert if 99th percentile of commit durations is higher than 250ms + ALERT HighCommitDurations + IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high commit durations", + description = "etcd instance {{ $labels.instance }} commit durations are high", + } + general.rules: |+ + ### Up Alerting ### + + Alert TargetDown + IF up == 0 FOR 10m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + summary = "target is down", + description = "A target of type {{ $labels.job }} is down." 
} - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + ### File descriptor alerts ### + + ALERT TooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 50 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", } - ### File descriptor alerts ### + ALERT K8STooManyOpenFiles + IF 100*process_open_fds / process_max_fds > 80 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} is using {{ $value }}% of the available file/socket descriptors.", + } instance:fd_utilization = process_open_fds / process_max_fds @@ -87,7 +270,7 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -99,34 +282,154 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance {{ $labels.namespace }}/{{ $labels.pod }} will exhaust in file descriptors soon", } - ### etcd proposal alerts ### + ### Contrack alerts ### - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + # To catch the conntrack sysctl de-tuning when it happens + ALERT ConntrackTuningMissing + IF node_nf_conntrack_udp_timeout > 10 + FOR 10m + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node does not have the correct conntrack tunings", + description = "Nodes keep un-setting the correct tunings, investigate when it happens.", + } + + ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + + ALERT ConntrackTableFull + IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Number of tracked connections is near the limit", + description = "The nf_conntrack table is {{ $value }}% full.", + } + kube-apiserver.rules: |+ + ALERT K8SApiserverDown + IF up{job="apiserver"} == 0 + FOR 15m LABELS { severity = "warning" } ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + summary = "API server unreachable", + description = "An API server could not be scraped.", } - ### etcd disk io latency 
alerts ### + # Disable for non HA kubernetes setups. + ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", + } - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. + ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) + ) / 1e6 > 1.0 FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + kube-controller-manager.rules: |+ + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } + kubelet.rules: |+ + ALERT K8SNodeDown + IF up{job="kubelet"} == 0 + FOR 1h + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubelet cannot be scraped", + description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", + } + + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + service = "k8s", + severity = "critical" + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud 
engineering team. @@ -300,220 +603,36 @@ data: histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - - ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - - ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - - ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - - # Disable for non HA kubernetes setups. - ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - + kube-scheduler.rules: |+ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - service = "k8s", severity = "critical", } ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - # To catch the conntrack sysctl de-tuning when it happens - ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + node.rules: |+ + ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 FOR 10m LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 + prometheus.rules: |+ + ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - - ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." } -