diff --git a/assets/prometheus/rules/etcd2.rules b/assets/prometheus/rules/etcd2.rules index 4a38894e2463e20620895a8ddd6b52c3b2110ad9..10fa5e8d7e3fdb4f03d8138f192f038f2508df42 100644 --- a/assets/prometheus/rules/etcd2.rules +++ b/assets/prometheus/rules/etcd2.rules @@ -29,7 +29,7 @@ ALERT HighNumberOfFailedHTTPRequests # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 FOR 5m LABELS { diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 9ad91eb0ed85afbfc8f65f23bef4125257eea9cf..bcb6a42e429a4da457185dc45e575688053e953e 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml kctl apply -f manifests/alertmanager/alertmanager-config.yaml kctl apply -f manifests/alertmanager/alertmanager-service.yaml +# unfortunately statefulsets cannot be changed except for their replica count +# so we need to make sure that the rule files are created before we create the +# prometheus resource so it can properly discover the rule files when creating +# the statefulset +sleep 5 + # `kubectl apply` is currently not working for third party resources so we are # using `kubectl create` here for the time being. # (https://github.com/kubernetes/kubernetes/issues/29542) diff --git a/hack/scripts/generate-alertmanager-config-secret.sh b/hack/scripts/generate-alertmanager-config-secret.sh new file mode 100755 index 0000000000000000000000000000000000000000..b0b4aaef77eb4467a51b2b979cd916b41ceed798 --- /dev/null +++ b/hack/scripts/generate-alertmanager-config-secret.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-main +data: + alertmanager.yaml: $(cat assets/alertmanager/alertmanager.yaml | base64 --wrap=0) +EOF + diff --git a/hack/scripts/generate-dashboards-configmap.sh b/hack/scripts/generate-dashboards-configmap.sh new file mode 100755 index 0000000000000000000000000000000000000000..6e21600e81b4070e23045c51bbad1aa87b3dc0d6 --- /dev/null +++ b/hack/scripts/generate-dashboards-configmap.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards +data: +EOF + +for f in assets/grafana/* +do + echo " $(basename $f): |+" + cat $f | sed "s/^/ /g" +done diff --git a/hack/scripts/generate-manifests.sh b/hack/scripts/generate-manifests.sh index d031b3a4d65a182f11f52fe368570a39b77c273e..bf5f42fa6be1e7f86108f94cebe13faa654f5d25 100755 --- a/hack/scripts/generate-manifests.sh +++ b/hack/scripts/generate-manifests.sh @@ -1,11 +1,11 @@ #!/bin/bash # Generate Alert Rules ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml +hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap -kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml +hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml # Generate Secret for Alertmanager config -kubectl create secret generic 
alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml +hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh new file mode 100755 index 0000000000000000000000000000000000000000..b8e00fef88bb9f258b38d59ea207846e421d2b4e --- /dev/null +++ b/hack/scripts/generate-rules-configmap.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cat <<-EOF +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + labels: + role: prometheus-rulefiles + prometheus: k8s +data: +EOF + +for f in assets/prometheus/rules/*.rules +do + echo " $(basename $f): |+" + cat $f | sed "s/^/ /g" +done diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml index 49f8c3c49fd99142bb4beed0628b9726b8d0d6ed..eee36b33fae8b956c5dd7c5ef9c95dd7e3432c4d 100644 --- a/manifests/alertmanager/alertmanager-config.yaml +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -1,7 +1,6 @@ apiVersion: v1 -data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== kind: Secret metadata: - creationTimestamp: null name: alertmanager-main +data: + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg== diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml index cba92d4972cf3f0fccefa915ad7de41d9c1bc033..15244d6153107e01a261c1b930802a5db3ce9f23 100644 --- a/manifests/grafana/grafana-dashboards.yaml +++ b/manifests/grafana/grafana-dashboards.yaml @@ -1,6 +1,9 @@ apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards data: - all-nodes-dashboard.json: | + all-nodes-dashboard.json: |+ { "dashboard": { @@ -861,7 +864,7 @@ data: ], "overwrite": true } - deployment-dashboard.json: |- + deployment-dashboard.json: |+ { "dashboard": { "__inputs": [ @@ -1678,8 +1681,7 @@ data: } ], "overwrite": true - } - kubernetes-pods-dashboard.json: | + } kubernetes-pods-dashboard.json: |+ { "dashboard": { "__inputs": [ @@ -2089,7 +2091,7 @@ data: ], "overwrite": true } - node-dashboard.json: | + node-dashboard.json: |+ { "dashboard": { @@ -2970,7 +2972,7 @@ data: ], "overwrite": true } - prometheus-datasource.json: | + prometheus-datasource.json: |+ { "access": "proxy", "basicAuth": false, @@ -2978,7 +2980,7 @@ data: "type": "prometheus", "url": "http://prometheus-k8s.monitoring.svc:9090" } - resource-requests-dashboard.json: |- + resource-requests-dashboard.json: |+ { "__inputs": [ { @@ -3402,8 +3404,4 @@ data: "timezone": "browser", "title": "Resource Requests", "version": 1 - } -kind: ConfigMap -metadata: - creationTimestamp: null - name: grafana-dashboards + } \ No newline at end of file diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 08f6dddceaeb8e450c766a12f072e537397acf47..7327b0db5a516a78447eba49d602b99af142c4d7 100644 --- 
a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -1,62 +1,138 @@ apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + labels: + role: prometheus-rulefiles + prometheus: k8s data: - etcd2.rules: "### General cluster availability ###\n\n# alert if another failed - peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"} - == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity - = \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n - \ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n - \ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to - an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n - \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n - \ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) > - 0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n - \ summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance - {{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP - endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n - \ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m])) - \n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) - > 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS - {\n summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance - {{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT - HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", - code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) - > 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS - {\n summary = \"a high number of HTTP requests are failing\",\n description - = \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses - on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile - of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99, - rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS - {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP - requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP - requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts - ###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert - if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n - \ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS - {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors - soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance - }} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors - are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m], - 3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n 
ANNOTATIONS - {\n summary = \"file descriptors soon exhausted\",\n description = \"{{ - $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors - soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed - proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h]) - > 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary - = \"a high number of failed proposals within the etcd cluster are happening\",\n - \ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }} - proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts - ###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT - HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) - > 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n - \ summary = \"high fsync durations\",\n description = \"ectd instance {{ - $labels.instance }} fync durations are high\",\n }\n" + etcd2.rules: |+ + ### General cluster availability ### + + # alert if another failed peer will result in an unavailable cluster + ALERT InsufficientPeers + IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1) + FOR 3m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "Etcd cluster small", + description = "If one more etcd peer goes down the cluster will be unavailable", + } + + ### HTTP requests alerts ### + + # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05 + FOR 5m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + } + + # alert if 50% of requests get a 4xx response + ALERT HighNumberOfFailedHTTPRequests + IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m])) + / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "a high number of HTTP requests are failing", + description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}", + } + + # alert if the 99th percentile of HTTP requests take more than 150ms + ALERT HTTPRequestsSlow + IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "slow HTTP requests", + description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", + } + + 
### File descriptor alerts ### + + instance:fd_utilization = process_open_fds / process_max_fds + + # alert if file descriptors are likely to exhaust within the next 4 hours + ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", + } + + # alert if file descriptors are likely to exhaust within the next hour + ALERT FdExhaustionClose + IF predict_linear(instance:fd_utilization[10m], 3600) > 1 + FOR 10m + LABELS { + severity = "critical" + } + ANNOTATIONS { + summary = "file descriptors soon exhausted", + description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon", + } + + ### etcd proposal alerts ### + + # alert if there are several failed proposals within an hour + ALERT HighNumberOfFailedProposals + IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "a high number of failed proposals within the etcd cluster are happening", + description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", + } + + ### etcd disk io latency alerts ### + + # alert if 99th percentile of fsync durations is higher than 500ms + ALERT HighFsyncDurations + IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5 + FOR 10m + LABELS { + severity = "warning" + } + ANNOTATIONS { + summary = "high fsync durations", + description = "etcd instance {{ $labels.instance }} fsync durations are high", + } kubernetes.rules: |+ # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
- + ### Container resources ### - + cluster_namespace_controller_pod_container:spec_memory_limit_bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -65,7 +141,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:spec_cpu_shares = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -74,7 +150,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:cpu_usage:rate = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -85,7 +161,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_usage:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -94,7 +170,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_working_set:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -103,7 +179,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_rss:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -112,7 +188,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_cache:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -121,7 +197,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:disk_usage:bytes = sum by (cluster,namespace,controller,pod_name,container_name) ( label_replace( @@ -130,7 +206,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_pagefaults:rate = sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( label_replace( @@ -141,7 +217,7 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + cluster_namespace_controller_pod_container:memory_oom:rate = sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( label_replace( @@ -152,39 +228,39 @@ data: "pod_name", "^(.*)-[a-z0-9]+" ) ) - + ### Cluster resources ### - + cluster:memory_allocation:percent = 100 * sum by (cluster) ( container_spec_memory_limit_bytes{pod_name!=""} ) / sum by (cluster) ( machine_memory_bytes ) - + cluster:memory_used:percent = 100 * sum by (cluster) ( container_memory_usage_bytes{pod_name!=""} ) / sum by (cluster) ( machine_memory_bytes ) - + cluster:cpu_allocation:percent = 100 * sum by (cluster) ( container_spec_cpu_shares{pod_name!=""} ) / sum by (cluster) ( container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores ) - + cluster:node_cpu_use:percent = 100 * sum by (cluster) ( rate(node_cpu{mode!="idle"}[5m]) ) / sum by (cluster) ( machine_cpu_cores ) - + ### API latency ### - + # Raw metrics are in microseconds. Convert to seconds. 
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = histogram_quantile( @@ -201,30 +277,30 @@ data: 0.5, sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) ) / 1e6 - + ### Scheduling latency ### - + cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - + cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - + cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - + ALERT K8SNodeDown IF up{job="kubelet"} == 0 FOR 1h @@ -236,7 +312,7 @@ data: summary = "Kubelet cannot be scraped", description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour", } - + ALERT K8SNodeNotReady IF kube_node_status_ready{condition="true"} == 0 FOR 1h @@ -248,7 +324,7 @@ data: summary = "Node status is NotReady", description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", } - + ALERT K8SManyNodesNotReady IF count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1 @@ -267,7 +343,7 @@ data: summary = "Many K8s nodes are Not Ready", description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.", } - + ALERT K8SKubeletNodeExporterDown IF up{job="node-exporter"} == 0 FOR 15m @@ -279,7 +355,7 @@ data: summary = "Kubelet node_exporter cannot be scraped", description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.", } - + ALERT K8SKubeletDown IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h @@ -291,7 +367,7 @@ data: summary = "Many Kubelets cannot be scraped", description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.", } - + ALERT K8SApiserverDown IF up{job="kubernetes"} == 0 FOR 15m @@ -303,7 +379,7 @@ data: summary = "API server unreachable", description = "An API server could not be scraped.", } - + # Disable for non HA kubernetes setups. 
ALERT K8SApiserverDown IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) @@ -316,7 +392,7 @@ data: summary = "API server unreachable", description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.", } - + ALERT K8SSchedulerDown IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0) FOR 5m @@ -328,7 +404,7 @@ data: summary = "Scheduler is down", description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", } - + ALERT K8SControllerManagerDown IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0) FOR 5m @@ -340,7 +416,7 @@ data: summary = "Controller manager is down", description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.", } - + ALERT K8SConntrackTableFull IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50 FOR 10m @@ -352,7 +428,7 @@ data: summary = "Number of tracked connections is near the limit", description = "The nf_conntrack table is {{ $value }}% full.", } - + ALERT K8SConntrackTableFull IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90 LABELS { @@ -363,7 +439,7 @@ data: summary = "Number of tracked connections is near the limit", description = "The nf_conntrack table is {{ $value }}% full.", } - + # To catch the conntrack sysctl de-tuning when it happens ALERT K8SConntrackTuningMissing IF node_nf_conntrack_udp_timeout > 10 @@ -376,7 +452,7 @@ data: summary = "Node does not have the correct conntrack tunings", description = "Nodes keep un-setting the correct tunings, investigate when it happens.", } - + ALERT K8STooManyOpenFiles IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m @@ -388,7 +464,7 @@ data: summary = "{{ $labels.job }} has too many open file descriptors", description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", } - + ALERT K8STooManyOpenFiles IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m @@ -400,7 +476,7 @@ data: summary = "{{ $labels.job }} has too many open file descriptors", description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.", } - + # Some verbs excluded because they are expected to be long-lasting: # WATCHLIST is long-poll, CONNECT is `kubectl exec`. 
ALERT K8SApiServerLatency @@ -417,7 +493,7 @@ data: summary = "Kubernetes apiserver latency is high", description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", } - + ALERT K8SApiServerEtcdAccessLatency IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0 FOR 15m @@ -429,7 +505,7 @@ data: summary = "Access to etcd is slow", description = "99th percentile latency for apiserver to access etcd is higher than 1s.", } - + ALERT K8SKubeletTooManyPods IF kubelet_running_pod_count > 100 LABELS { @@ -440,8 +516,4 @@ data: summary = "Kubelet is close to pod limit", description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", } - -kind: ConfigMap -metadata: - creationTimestamp: null - name: prometheus-k8s-rules + diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 9054ea5868dbe938faeca867548ad8dda6a8ee11..23156650eb1c3fc5176003e0770f1895b6de640f 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -10,6 +10,10 @@ spec: serviceMonitorSelector: matchExpression: - {key: k8s-apps, operator: Exists} + ruleSelector: + matchLabels: + role: prometheus-rulefiles + prometheus: k8s resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi
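
Note on how the pieces above fit together: the rule files and dashboards are now rendered by the generator scripts instead of `kubectl create --dry-run`, and the Prometheus resource selects the rules ConfigMap through the new ruleSelector. A quick sanity check, as a sketch only (the "monitoring" namespace is an assumption; the namespace is not shown in this diff):

    # Regenerate the checked-in manifests from assets/ using the new helper scripts
    hack/scripts/generate-manifests.sh

    # The generated ConfigMap should declare the labels matched by spec.ruleSelector
    # in manifests/prometheus/prometheus-k8s.yaml (role=prometheus-rulefiles, prometheus=k8s)
    grep -A 3 'labels:' manifests/prometheus/prometheus-k8s-rules.yaml

    # After deploying, the same labels should be visible on the cluster object
    kubectl --namespace monitoring get configmap prometheus-k8s-rules --show-labels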
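
On the ordering comment in hack/cluster-monitoring/deploy: the fixed `sleep 5` only approximates "the rule ConfigMap exists before the Prometheus resource is created". An alternative guard, not part of this change and assuming `kctl` targets the same namespace as the other calls in that script, would be to poll for the ConfigMap instead of sleeping:

    # wait until the rule files are visible before creating the Prometheus resource,
    # so the operator can discover them when it builds the statefulset
    for i in $(seq 1 30); do
        kctl get configmap prometheus-k8s-rules > /dev/null 2>&1 && break
        sleep 1
    done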