diff --git a/assets/alerts/etcd2.rules b/assets/alerts/etcd2.rules
new file mode 100644
index 0000000000000000000000000000000000000000..4a38894e2463e20620895a8ddd6b52c3b2110ad9
--- /dev/null
+++ b/assets/alerts/etcd2.rules
@@ -0,0 +1,126 @@
+### General cluster availability ###
+
+# alert if another failed peer will result in an unavailable cluster
+ALERT InsufficientPeers
+  IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
+  FOR 3m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "Etcd cluster small",
+    description = "If one more etcd peer goes down the cluster will be unavailable",
+  }
+
+### HTTP requests alerts ###
+
+# NOTE(review): the failure-ratio expressions below aggregate by method only and
+# yield a fraction (0.01 == 1%), so in the descriptions {{ $value }} is a ratio,
+# not a percentage, and {{ $labels.instance }} is empty.
+
+# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
+  FOR 5m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if more than 50% of requests get a 4xx response
+ALERT HighNumberOfFailedHTTPRequests
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
+    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "a high number of HTTP requests are failing",
+    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
+  }
+
+# alert if the 99th percentile of HTTP requests take more than 150ms
+ALERT HTTPRequestsSlow
+  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "slow HTTP requests",
+    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+  }
+
+### File descriptor alerts ###
+
+# recording rule: share of the file descriptor limit currently in use (1 == exhausted)
+instance:fd_utilization = process_open_fds / process_max_fds
+
+# alert if file descriptors are likely to exhaust within the next 4 hours
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
+  }
+
+# alert if file descriptors are likely to exhaust within the next hour
+ALERT FdExhaustionClose
+  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+  FOR 10m
+  LABELS {
+    severity = "critical"
+  }
+  ANNOTATIONS {
+    summary = "file descriptors soon exhausted",
+    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
+  }
+
+### etcd proposal alerts ###
+
+# alert if there are several failed proposals within an hour
+ALERT HighNumberOfFailedProposals
+  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "a high number of failed proposals within the etcd cluster are happening",
+    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+  }
+
+### etcd disk io latency alerts ###
+
+# alert if 99th percentile of fsync durations is higher than 500ms
+ALERT HighFsyncDurations
+  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
+  FOR 10m
+  LABELS {
+    severity = "warning"
+  }
+  ANNOTATIONS {
+    summary = "high fsync durations",
+    description = "etcd instance {{ $labels.instance }} fsync durations are high",
+  }
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index da08cb455cb893be1101d03898c31c87c7f33672..db867d731fc424d5c1014088f163bcca605c5417 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -1,5 +1,132 @@
 apiVersion: v1
 data:
+  etcd2.rules: |
+    ### General cluster availability ###
+
+    # alert if another failed peer will result in an unavailable cluster
+    ALERT InsufficientPeers
+      IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
+      FOR 3m
+      LABELS {
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "Etcd cluster small",
+        description = "If one more etcd peer goes down the cluster will be unavailable",
+      }
+
+    ### HTTP requests alerts ###
+
+    # NOTE(review): the failure-ratio expressions below aggregate by method only and
+    # yield a fraction (0.01 == 1%), so in the descriptions {{ $value }} is a ratio,
+    # not a percentage, and {{ $labels.instance }} is empty.
+
+    # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
+    ALERT HighNumberOfFailedHTTPRequests
+      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
+        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
+      FOR 10m
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "a high number of HTTP requests are failing",
+        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+      }
+
+    # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
+    ALERT HighNumberOfFailedHTTPRequests
+      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
+        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
+      FOR 5m
+      LABELS {
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "a high number of HTTP requests are failing",
+        description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
+      }
+
+    # alert if more than 50% of requests get a 4xx response
+    ALERT HighNumberOfFailedHTTPRequests
+      IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
+        / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
+      FOR 10m
+      LABELS {
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "a high number of HTTP requests are failing",
+        description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
+      }
+
+    # alert if the 99th percentile of HTTP requests take more than 150ms
+    ALERT HTTPRequestsSlow
+      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
+      FOR 10m
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "slow HTTP requests",
+        description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+      }
+
+    ### File descriptor alerts ###
+
+    # recording rule: share of the file descriptor limit currently in use (1 == exhausted)
+    instance:fd_utilization = process_open_fds / process_max_fds
+
+    # alert if file descriptors are likely to exhaust within the next 4 hours
+    ALERT FdExhaustionClose
+      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+      FOR 10m
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "file descriptors soon exhausted",
+        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
+      }
+
+    # alert if file descriptors are likely to exhaust within the next hour
+    ALERT FdExhaustionClose
+      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+      FOR 10m
+      LABELS {
+        severity = "critical"
+      }
+      ANNOTATIONS {
+        summary = "file descriptors soon exhausted",
+        description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
+      }
+
+    ### etcd proposal alerts ###
+
+    # alert if there are several failed proposals within an hour
+    ALERT HighNumberOfFailedProposals
+      IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "a high number of failed proposals within the etcd cluster are happening",
+        description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+      }
+
+    ### etcd disk io latency alerts ###
+
+    # alert if 99th percentile of fsync durations is higher than 500ms
+    ALERT HighFsyncDurations
+      IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
+      FOR 10m
+      LABELS {
+        severity = "warning"
+      }
+      ANNOTATIONS {
+        summary = "high fsync durations",
+        description = "etcd instance {{ $labels.instance }} fsync durations are high",
+      }
   kubernetes.rules: |+
     ### Container resources ###
 