Add etcd alerting rules

8fa4145b · Fabian Reinartz · d5525072 · 8fa4145b
Commit 8fa4145b authored Oct 18, 2016 by Fabian Reinartz
--- a/alerts/etcd.rules
+++ b/alerts/etcd.rules
+# general cluster availability
+# alert if another failed peer will result in an unavailable cluster
+#
+# A cluster of N members tolerates floor((N - 1) / 2) failed members. The alert
+# fires once the number of down members has reached that tolerance, i.e. the
+# next failure would cost the cluster its quorum. Using floor() keeps the
+# threshold correct for even-sized clusters as well (N=4 fires at 1 down;
+# the previous "count / 2 - 1" threshold only fired at 2 down).
+# count() over an empty vector returns no result, so nothing fires while
+# every member is up.
+ALERT InsufficientPeers
+IF count(up{job="etcd"} == 0) >= floor((count(up{job="etcd"}) - 1) / 2)
+FOR 3m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "Etcd cluster small",
+  description = "If one more etcd peer goes down the cluster will be unavailable",
+}
+# etcd leader alerts
+# ==================
+# alert when etcd_server_has_leader has been 0 on an instance for one minute
+ALERT EtcdNoLeader
+  IF etcd_server_has_leader{job="etcd"} == 0
+  FOR 1m
+  LABELS { severity = "critical" }
+  ANNOTATIONS {
+    description = "etcd node {{ $labels.instance }} has no leader",
+    summary = "etcd node has no leader",
+  }
+# alert when an instance has seen more than 3 leader changes over the last hour
+# (no FOR clause: fires as soon as the expression matches)
+ALERT HighNumberOfLeaderChanges
+  IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+  LABELS { severity = "warning" }
+  ANNOTATIONS {
+    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
+    summary = "a high number of leader changes within the etcd cluster are happening",
+  }
+# gRPC request alerts
+# ===================
+# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
+#
+# The ratio is scaled by 100 so that $value is an actual percentage, matching
+# the "%" printed in the description. The rates are summed per grpc_method
+# across all instances, so the result carries no instance label and the
+# description must not reference one.
+ALERT HighNumberOfFailedGRPCRequests
+IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed across the etcd cluster",
+}
+# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
+#
+# Critical counterpart of the 1% warning (same alert name, higher severity).
+# The ratio is scaled by 100 so that $value is an actual percentage, matching
+# the "%" printed in the description. The rates are summed per grpc_method
+# across all instances, so the result carries no instance label and the
+# description must not reference one.
+ALERT HighNumberOfFailedGRPCRequests
+IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of gRPC requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed across the etcd cluster",
+}
+# alert if the 99th percentile of gRPC method calls take more than 150ms
+#
+# histogram_quantile() is applied to the un-aggregated bucket rates, so every
+# label other than "le" is kept on the result and is available to the
+# annotation templates through $labels.
+ALERT GRPCRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "slow gRPC requests",
+  description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
+}
+# HTTP requests alerts
+# ====================
+# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
+#
+# The ratio is scaled by 100 so that $value is an actual percentage, matching
+# the "%" printed in the description. The rates are summed per HTTP method
+# across all instances, so the result carries no instance label and the
+# description must not reference one.
+ALERT HighNumberOfFailedHTTPRequests
+IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))) > 1
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
+}
+# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
+#
+# Critical counterpart of the 1% warning (same alert name, higher severity).
+# The ratio is scaled by 100 so that $value is an actual percentage, matching
+# the "%" printed in the description. The rates are summed per HTTP method
+# across all instances, so the result carries no instance label and the
+# description must not reference one.
+ALERT HighNumberOfFailedHTTPRequests
+IF 100 * (sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
+  / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m]))) > 5
+FOR 5m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "a high number of HTTP requests are failing",
+  description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
+}
+# alert if the 99th percentile of HTTP requests take more than 150ms
+#
+# histogram_quantile() is applied to the un-aggregated bucket rates, so every
+# label other than "le" is kept on the result and is available to the
+# annotation templates through $labels.
+ALERT HTTPRequestsSlow
+IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "slow HTTP requests",
+  description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
+}
+# file descriptor alerts
+# ======================
+# recording rule: fraction of the allowed file descriptors currently open
+# (open / max); evaluated for every series exposing both metrics, not only etcd
+instance:fd_utilization = process_open_fds / process_max_fds
+# alert if file descriptors are likely to exhaust within the next 4 hours
+#
+# Linearly extrapolates the last hour of instance:fd_utilization 4 hours
+# (3600 * 4 seconds) ahead; a predicted utilization above 1 means more
+# descriptors would be open than process_max_fds allows.
+ALERT FdExhaustionClose
+IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "file descriptors soon exhausted",
+  description = "{{ $labels.job }} instance {{ $labels.instance }} is predicted to exhaust its file descriptors within the next 4 hours",
+}
+# alert if file descriptors are likely to exhaust within the next hour
+#
+# Critical counterpart of the 4-hour warning (same alert name, higher
+# severity). Linearly extrapolates the last 10 minutes of
+# instance:fd_utilization one hour (3600 seconds) ahead; a predicted
+# utilization above 1 means more descriptors would be open than
+# process_max_fds allows.
+ALERT FdExhaustionClose
+IF predict_linear(instance:fd_utilization[10m], 3600) > 1
+FOR 10m
+LABELS {
+  severity = "critical"
+}
+ANNOTATIONS {
+  summary = "file descriptors soon exhausted",
+  description = "{{ $labels.job }} instance {{ $labels.instance }} is predicted to exhaust its file descriptors within the next hour",
+}
+# etcd peer communication alerts
+# ==============================
+# alert if 99th percentile of round trips take 150ms
+#
+# The annotation template reads the remote peer from the "To" label of the
+# round-trip histogram; template label access goes through $labels.
+ALERT EtcdPeerCommunicationSlow
+IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "etcd peer communication is slow",
+  description = "etcd instance {{ $labels.instance }} peer communication with {{ $labels.To }} is slow",
+}
+# etcd proposal alerts
+# ====================
+# alert when an instance has recorded more than 5 failed proposals over the last hour
+# (no FOR clause: fires as soon as the expression matches)
+ALERT HighNumberOfFailedProposals
+  IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+  LABELS { severity = "warning" }
+  ANNOTATIONS {
+    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
+    summary = "a high number of failed proposals within the etcd cluster are happening",
+  }
+# etcd disk io latency alerts
+# ===========================
+# alert if 99th percentile of fsync durations is higher than 500ms
+#
+# Computed from the 5-minute rate of the WAL fsync duration histogram; the
+# condition has to hold for 10 minutes before the alert fires.
+ALERT HighFsyncDurations
+IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high fsync durations",
+  description = "etcd instance {{ $labels.instance }} fsync durations are high",
+}
+# alert if 99th percentile of commit durations is higher than 250ms
+#
+# Computed from the 5-minute rate of the backend commit duration histogram;
+# the condition has to hold for 10 minutes before the alert fires.
+ALERT HighCommitDurations
+IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+FOR 10m
+LABELS {
+  severity = "warning"
+}
+ANNOTATIONS {
+  summary = "high commit durations",
+  description = "etcd instance {{ $labels.instance }} commit durations are high",
+}
\ No newline at end of file