diff --git a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml index f08a210600ce3ec7447c9b8d3c1df81c8c1e2f57..6b5789b57eb914414024bc837a97340808fd7adf 100644 --- a/assets/alertmanager/alertmanager.yaml +++ b/assets/alertmanager/alertmanager.yaml @@ -5,8 +5,10 @@ route: group_wait: 30s group_interval: 5m repeat_interval: 12h - receiver: 'webhook' + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' receivers: -- name: 'webhook' - webhook_configs: - - url: 'http://alertmanagerwh:30500/' +- name: 'null' diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules new file mode 100644 index 0000000000000000000000000000000000000000..71bdc687645cdf3e1589f53a31009136162b970f --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules @@ -0,0 +1,36 @@ +ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." + } + +ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." + } + +ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules deleted file mode 100644 index 10fa5e8d7e3fdb4f03d8138f192f038f2508df42..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/etcd2.rules +++ /dev/null @@ -1,121 +0,0 @@ -### General cluster availability ### - -# alert if another failed peer will result in an unavailable cluster -ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", - } - -### HTTP requests alerts ### - -# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - -# alert if 50% of requests get a 4xx response -ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", - } - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", - } - -### File descriptor alerts ### - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file 
descriptors soon", - } - -### etcd proposal alerts ### - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - -### etcd disk io latency alerts ### - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", - } diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules new file mode 100644 index 0000000000000000000000000000000000000000..a3b2cddd30393d4268e090000bcbfe624f1d5c79 --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules @@ -0,0 +1,177 @@ +# general cluster availability + +# alert if another failed member will result in an unavailable cluster +ALERT InsufficientMembers +IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) +FOR 3m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", +} + +# etcd leader alerts +# ================== + +# alert if any etcd instance has no leader +ALERT NoLeader +IF etcd_server_has_leader{job="etcd"} == 0 +FOR 1m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", +} + +# alert if there are lots of leader changes +ALERT HighNumberOfLeaderChanges +IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", +} + +# gRPC request alerts +# =================== + +# alert if more than 1% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if more than 5% of gRPC method calls have failed within the last 5 minutes +ALERT HighNumberOfFailedGRPCRequests +IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 +FOR 5m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if the 99th percentile of gRPC method calls take more than 150ms +ALERT GRPCRequestsSlow +IF histogram_quantile(0.99, 
rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "slow gRPC requests", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", +} + +# HTTP requests alerts +# ==================== + +# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes +ALERT HighNumberOfFailedHTTPRequests +IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes +ALERT HighNumberOfFailedHTTPRequests +IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 +FOR 5m +LABELS { + severity = "critical" +} +ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", +} + +# alert if the 99th percentile of HTTP requests take more than 150ms +ALERT HTTPRequestsSlow +IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", +} + +# etcd member communication alerts +# ================================ + +# alert if 99th percentile of round trips take more than 150ms +ALERT EtcdMemberCommunicationSlow +IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "etcd member communication is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", +} + +# etcd proposal alerts +# ==================== + +# alert if there are several failed proposals within an hour +ALERT HighNumberOfFailedProposals +IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "a high number of proposals within the etcd cluster are failing", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", +} + +# etcd disk io latency alerts +# =========================== + +# alert if 99th percentile of fsync durations is higher than 500ms +ALERT HighFsyncDurations +IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", +} + +# alert if 99th percentile of commit durations is higher than 250ms +ALERT HighCommitDurations +IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 +FOR 10m +LABELS { + severity = "warning" +} +ANNOTATIONS { + summary = "high commit durations", + description = "etcd instance {{ $labels.instance }} commit durations are high", +} diff --git a/assets/prometheus/rules/general.rules
b/assets/prometheus/rules/general.rules new file mode 100644 index 0000000000000000000000000000000000000000..9e26ab9a1500e975d41a4039cf75eb9778e12def --- /dev/null +++ b/assets/prometheus/rules/general.rules @@ -0,0 +1,63 @@ +### Up Alerting ### + +ALERT TargetDown + IF 100 * (count(up == 0) / count(up)) > 3 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Targets are down", + description = "More than {{ $value }}% of targets are down." + } + +### Dead man's switch ### + +ALERT DeadMansSwitch + IF vector(1) + LABELS { + severity = "none", + } + ANNOTATIONS { + summary = "Alerting DeadMansSwitch", + description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", + } + +### File descriptor alerts ### + +ALERT TooManyOpenFileDescriptors + IF 100 * (process_open_fds / process_max_fds) > 95 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", + } + +instance:fd_utilization = process_open_fds / process_max_fds + +# alert if file descriptors are likely to exhaust within the next 4 hours +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", + } + +# alert if file descriptors are likely to exhaust within the next hour +ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", + } diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules new file mode 100644 index 0000000000000000000000000000000000000000..c041881abb2c398ae342ea875cae02ee19e1d497 --- /dev/null +++ b/assets/prometheus/rules/kube-apiserver.rules @@ -0,0 +1,28 @@ +ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", + } + +# Some verbs excluded because they are expected to be long-lasting: +# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
+# +# apiserver_request_latencies' unit is microseconds +ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + ) / 1e6 > 1.0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules new file mode 100644 index 0000000000000000000000000000000000000000..f75e27680d8cc543582b73e72f831cdbd5e25764 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules @@ -0,0 +1,10 @@ +ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules new file mode 100644 index 0000000000000000000000000000000000000000..6eff4bcdc010b6d7682d7d97e38af7b674a9384e --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules @@ -0,0 +1,10 @@ +ALERT K8SSchedulerDown + IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Scheduler is down", + description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", + } diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules new file mode 100644 index 0000000000000000000000000000000000000000..cbcd576c4685e069be4a62fa2e21e226a02122f8 --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules @@ -0,0 +1,60 @@ +ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", + } + +ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } + +ALERT K8SKubeletDown + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + +ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all 
Kubelets have disappeared from service discovery.", + } + +ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + } diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index 157eb3fa8cb61b79ac7340a12b5d89542f3ff28b..084d11e53f4b768b38a3e43708a5936289aed31d 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -169,220 +169,3 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - -ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - -ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - -ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - -# Disable for non HA kubernetes setups. 
-ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", - } - -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - -# To catch the conntrack sysctl de-tuning when it happens -ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
-ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - -ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules new file mode 100644 index 0000000000000000000000000000000000000000..8fd5b7d0363f9b2b3be9b5f038333db9ff61a872 --- /dev/null +++ b/assets/prometheus/rules/node.rules @@ -0,0 +1,10 @@ +ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", + } diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules new file mode 100644 index 0000000000000000000000000000000000000000..05c278f1c5606238c8b8edbd24d1df820972ade9 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules @@ -0,0 +1,10 @@ +ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index eee36b33fae8b956c5dd7c5ef9c95dd7e3432c4d..62d39016214739e8f04a9158aad07df4a488013b 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -3,4 +3,4 @@ kind: Secret metadata: name: alertmanager-main data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 84a3238a4e6cfe2a6464d26ee1d999031fbe9d4f..c092d8e2ffc1303d846391bb841754a97042fcbf 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.6.3 + version: v1.7.0 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/prometheus-operator/prometheus-operator-service.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8882d4a793ed26bc1e4fb3c85134ca16061a25f4 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + type: ClusterIP + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + selector: + k8s-app: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml index 573aaf2925c114777d8f3ef8d64e9ba007516f9c..97b1cafbf0e4d990f73a0529cc8624ccdf59e4b3 100644 --- a/manifests/prometheus-operator/prometheus-operator.yaml +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -3,25 +3,28 @@ kind: Deployment metadata: name: prometheus-operator labels: - operator: prometheus + k8s-app: prometheus-operator spec: replicas: 1 template: metadata: labels: - operator: prometheus + k8s-app: prometheus-operator spec: serviceAccountName: prometheus-operator containers: - - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.9.1 - args: - - "--kubelet-service=kube-system/kubelet" - - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" - resources: - requests: - cpu: 100m - memory: 50Mi - limits: - cpu: 200m - memory: 300Mi + - name: prometheus-operator + image: quay.io/coreos/prometheus-operator:v0.9.1 + args: + - "--kubelet-service=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1" + ports: + - name: http + containerPort: 8080 + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 300Mi diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 
71f72da12436dd027dde8b158e97774db119ef0a..cb062db18e6fd87199ffe409df1a493e8023becc 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,76 +6,260 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - etcd2.rules: |+ - ### General cluster availability ### - - # alert if another failed peer will result in an unavailable cluster - ALERT InsufficientPeers - IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) - FOR 3m + alertmanager.rules: |+ + ALERT AlertmanagerConfigInconsistent + IF count_values by (service) ("config_hash", alertmanager_config_hash) + / on(service) group_left + label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + FOR 5m LABELS { severity = "critical" } ANNOTATIONS { - summary = "Etcd cluster small", - description = "If one more etcd peer goes down the cluster will be unavailable", + summary = "Alertmanager configurations are inconsistent", + description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." } - ### HTTP requests alerts ### - - # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 - FOR 10m + ALERT AlertmanagerDownOrMissing + IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") + / on(job) group_right + sum by(job) (up) != 1 + FOR 5m LABELS { severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + summary = "Alertmanager down or not discovered", + description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." } - # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 - FOR 5m + ALERT FailedReload + IF alertmanager_config_last_reload_successful == 0 + FOR 10m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if 50% of requests get a 4xx response + summary = "Alertmanager configuration reload has failed", + description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
+ } + etcd3.rules: |+ + # general cluster availability + + # alert if another failed member will result in an unavailable cluster + ALERT InsufficientMembers + IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd cluster insufficient members", + description = "If one more etcd member goes down the cluster will be unavailable", + } + + # etcd leader alerts + # ================== + + # alert if any etcd instance has no leader + ALERT NoLeader + IF etcd_server_has_leader{job="etcd"} == 0 + FOR 1m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "etcd member has no leader", + description = "etcd member {{ $labels.instance }} has no leader", + } + + # alert if there are lots of leader changes + ALERT HighNumberOfLeaderChanges + IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of leader changes within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", + } + + # gRPC request alerts + # =================== + + # alert if more than 1% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of gRPC method calls have failed within the last 5 minutes + ALERT HighNumberOfFailedGRPCRequests + IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) + / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of gRPC requests are failing", + description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of gRPC method calls take more than 150ms + ALERT GRPCRequestsSlow + IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "slow gRPC requests", + description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", + } + + # HTTP requests alerts + # ==================== + + # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes + ALERT
HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of HTTP requests take more than 150ms + ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", + } + + # etcd member communication alerts + # ================================ + + # alert if 99th percentile of round trips take more than 150ms + ALERT EtcdMemberCommunicationSlow + IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "etcd member communication is slow", + description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", + } + + # etcd proposal alerts + # ==================== + + # alert if there are several failed proposals within an hour + ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of proposals within the etcd cluster are failing", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + + # etcd disk io latency alerts + # =========================== + + # alert if 99th percentile of fsync durations is higher than 500ms + ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", + } + + # alert if 99th percentile of commit durations is higher than 250ms + ALERT HighCommitDurations + IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high commit durations", + description = "etcd instance {{ $labels.instance }} commit durations are high", + } + general.rules: |+ + ### Up Alerting ### + + ALERT TargetDown + IF 100 * (count(up == 0) / count(up)) > 3 FOR 10m LABELS { - severity = "critical" + severity = "warning" } ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + summary = "Targets are down", + description = "More than {{ $value }}% of targets are down."
} - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 - FOR 10m + ### Dead man's switch ### + + ALERT DeadMansSwitch + IF vector(1) LABELS { - severity = "warning" + severity = "none", } ANNOTATIONS { - summary = "slow HTTP requests", - description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow", + summary = "Alerting DeadMansSwitch", + description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", } ### File descriptor alerts ### + ALERT TooManyOpenFileDescriptors + IF 100 * (process_open_fds / process_max_fds) > 95 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "too many open file descriptors", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", + } + instance:fd_utilization = process_open_fds / process_max_fds # alert if file descriptors are likely to exhaust within the next 4 hours @@ -87,7 +271,7 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", } # alert if file descriptors are likely to exhaust within the next hour @@ -99,34 +283,108 @@ data: } ANNOTATIONS { summary = "file descriptors soon exhausted", - description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon", + description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", + } + kube-apiserver.rules: |+ + ALERT K8SApiserverDown + IF absent({job="apiserver"}) or (count by(cluster) (up{job="apiserver"} == 1) < count by(cluster) (up{job="apiserver"})) + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "API server unreachable", + description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", } - ### etcd proposal alerts ### - - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + # Some verbs excluded because they are expected to be long-lasting: + # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
+ # + # apiserver_request_latencies' unit is microseconds + ALERT K8SApiServerLatency + IF histogram_quantile( + 0.99, + sum without (instance,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) + ) / 1e6 > 1.0 + FOR 10m LABELS { severity = "warning" } ANNOTATIONS { - summary = "a high number of failed proposals within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + summary = "Kubernetes apiserver latency is high", + description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", + } + kube-controller-manager.rules: |+ + ALERT K8SControllerManagerDown + IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) + FOR 5m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Controller manager is down", + description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", + } + kubelet.rules: |+ + ALERT K8SNodeNotReady + IF kube_node_status_ready{condition="true"} == 0 + FOR 1h + LABELS { + severity = "warning", + } + ANNOTATIONS { + summary = "Node status is NotReady", + description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", } - ### etcd disk io latency alerts ### + ALERT K8SManyNodesNotReady + IF + count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 + AND + ( + count by (cluster) (kube_node_status_ready{condition="true"} == 0) + / + count by (cluster) (kube_node_status_ready{condition="true"}) + ) > 0.2 + FOR 1m + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many K8s nodes are Not Ready", + description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", + } - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 - FOR 10m + ALERT K8SKubeletDown + IF count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.03 + FOR 1h LABELS { - severity = "warning" + severity = "warning", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets.", + } + + ALERT K8SKubeletDown + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 + FOR 1h + LABELS { + severity = "critical", + } + ANNOTATIONS { + summary = "Many Kubelets cannot be scraped", + description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", + } + + ALERT K8SKubeletTooManyPods + IF kubelet_running_pod_count > 100 + LABELS { + severity = "warning", } ANNOTATIONS { - summary = "high fsync durations", - description = "ectd instance {{ $labels.instance }} fync durations are high", + summary = "Kubelet is close to pod limit", + description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
@@ -300,220 +558,36 @@ data: histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - - ALERT K8SNodeDown - IF up{job="kubelet"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", - } - - ALERT K8SNodeNotReady - IF kube_node_status_ready{condition="true"} == 0 - FOR 1h - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 - AND - ( - count by (cluster) (kube_node_status_ready{condition="true"} == 0) - / - count by (cluster) (kube_node_status_ready{condition="true"}) - ) > 0.2 - FOR 1m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Many K8s nodes are Not Ready", - description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", - } - - ALERT K8SKubeletNodeExporterDown - IF up{job="node-exporter"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubelet node_exporter cannot be scraped", - description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SApiserverDown - IF up{job="kubernetes"} == 0 - FOR 15m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "An API server could not be scraped.", - } - - # Disable for non HA kubernetes setups. - ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) - FOR 5m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", - } - + kube-scheduler.rules: |+ ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m LABELS { - service = "k8s", severity = "critical", } ANNOTATIONS { summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) - FOR 5m - LABELS { - service = "k8s", - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 - FOR 10m - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - ALERT K8SConntrackTableFull - IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Number of tracked connections is near the limit", - description = "The nf_conntrack table is {{ $value }}% full.", - } - - # To catch the conntrack sysctl de-tuning when it happens - ALERT K8SConntrackTuningMissing - IF node_nf_conntrack_udp_timeout > 10 - FOR 10m - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Node does not have the correct conntrack tunings", - description = "Nodes keep un-setting the correct tunings, investigate when it happens.", - } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 + node.rules: |+ + ALERT NodeExporterDown + IF up{job="node-exporter"} == 0 FOR 10m LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", + summary = "node-exporter cannot be scraped", + description = "Prometheus could not scrape a node-exporter for more than 10m.", } - - ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 - FOR 10m - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "{{ $labels.job }} has too many open file descriptors", - description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) - ) / 1e6 > 1.0 + prometheus.rules: |+ + ALERT FailedReload + IF prometheus_config_last_reload_successful == 0 FOR 10m LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - - ALERT K8SApiServerEtcdAccessLatency - IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 - FOR 15m - LABELS { - service = "k8s", severity = "warning" } ANNOTATIONS { - summary = "Access to etcd is slow", - description = "99th percentile latency for apiserver to access etcd is higher than 1s.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - service = "k8s", - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", + summary = "Prometheus configuration reload has failed", + description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
} - diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml index d193b6769caaa587672e91eacbae7c40d6eef47d..29d68c821de15a7fb99b7e0d88ef605079d6cb9f 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -1,12 +1,16 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: - labels: - alertmanager: main name: alertmanager + labels: + app: alertmanager spec: + selector: + matchLabels: + alertmanager: main + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: alertmanager, operator: In, values: [main]} + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml index 1fd793e511f658c52c5b6da5a772f9f0b08fb39c..09a87c2e0466eca07111bd1e525e00e914ee407a 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -3,9 +3,9 @@ kind: ServiceMonitor metadata: name: kube-apiserver labels: - k8s-apps: https + k8s-app: apiserver spec: - jobLabel: provider + jobLabel: component selector: matchLabels: component: apiserver @@ -15,7 +15,7 @@ spec: - default endpoints: - port: https - interval: 15s + interval: 30s scheme: https tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml deleted file mode 100644 index fbfcda97b31b4be1e15eb1e6e789337adee38da0..0000000000000000000000000000000000000000 --- a/manifests/prometheus/prometheus-k8s-service-monitor-k8s-apps-http.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ServiceMonitor -metadata: - name: k8s-apps-http - namespace: monitoring - labels: - k8s-apps: http -spec: - jobLabel: k8s-app - selector: - matchExpressions: - - {key: k8s-app, operator: Exists} - - {key: k8s-app, operator: NotIn, values: [kubelet]} - namespaceSelector: - matchNames: - - kube-system - endpoints: - - port: http-metrics - interval: 15s - - port: http-metrics-dnsmasq - interval: 15s - - port: http-metrics-skydns - interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eef95a84eacace875b6b24577bd6c99ccb00fd87 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-controller-manager + labels: + k8s-app: kube-controller-manager +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-controller-manager + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml new file mode 100644 index 0000000000000000000000000000000000000000..663f8cfb04fedc1057b72dc549a91cae63867421 --- /dev/null +++ 
b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-scheduler + labels: + k8s-app: kube-scheduler +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-scheduler + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml index c4ed1afced6c8852d76167dbc6334b7533582d7e..a276702aaa5f3b0afb61fbf60b96b844fa15b078 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: kube-state-metrics - namespace: monitoring labels: - k8s-apps: http + k8s-app: kube-state-metrics spec: jobLabel: k8s-app selector: @@ -15,5 +14,5 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml index 5729d8f042693eccea5473c6f0c7e37f988cb1f4..cdc3ffb66d6b056425beb34124b6fab70621ab17 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -3,16 +3,16 @@ kind: ServiceMonitor metadata: name: kubelet labels: - k8s-apps: http + k8s-app: kubelet spec: jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + honorLabels: true selector: matchLabels: k8s-app: kubelet namespaceSelector: matchNames: - kube-system - endpoints: - - port: http-metrics - interval: 15s - honorLabels: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml index a7b20301d38756d4da4acaa05f71249a58962566..b68ed89f26cad467f543d5fbba005c5612a5af72 100644 --- a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -2,9 +2,8 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: ServiceMonitor metadata: name: node-exporter - namespace: monitoring labels: - k8s-apps: http + k8s-app: node-exporter spec: jobLabel: k8s-app selector: @@ -15,4 +14,4 @@ spec: - monitoring endpoints: - port: http-metrics - interval: 15s + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23c04073f959199a3e0d684cdfe50ebb67f0b515 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + endpoints: + - port: http + selector: + matchLabels: + k8s-app: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml index 5e5d17be37dbd573c6fa813c59cbf41e912a1a4c..be74cd6d51770d74fbe2d37d0735e3fb9a2444cb 100644 --- 
a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -3,10 +3,14 @@ kind: ServiceMonitor metadata: name: prometheus labels: - prometheus: k8s + app: prometheus spec: + selector: + matchLabels: + prometheus: k8s + namespaceSelector: + matchNames: + - monitoring endpoints: - port: web - selector: - matchExpressions: - - {key: prometheus, operator: In, values: [k8s]} + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index b7060ba6fcf80fb1ee32d3987debe1af6021a47d..63e9c3f7c027b09d5dfbcc1b61e10d68d4895001 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.6.3 + version: v1.7.0 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpression:
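The `alertmanager.yaml` value in `manifests/alertmanager/alertmanager-config.yaml` is stored as opaque base64, so the updated payload is shown decoded below. It mirrors `assets/alertmanager/alertmanager.yaml` (including that file's unchanged `global` and `group_by` head, which sits above the hunk shown earlier), routing all alerts to the `'null'` receiver and giving `DeadMansSwitch` an explicit route of its own:

```yaml
# Decoded contents of the alertmanager.yaml key in the alertmanager-main Secret.
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'
```

If the asset file changes again, the Secret value presumably needs to be regenerated from it (for example with GNU `base64 -w0 assets/alertmanager/alertmanager.yaml`), otherwise the two copies drift apart; the same caveat applies to the rule files that are duplicated verbatim into `manifests/prometheus/prometheus-k8s-rules.yaml`.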