Skip to content
Snippets Groups Projects
Commit 5291bc32 authored by Frederic Branczyk's avatar Frederic Branczyk Committed by GitHub
Browse files

Merge pull request #4 from brancz/etcd2-alerts

add etcd2 alerts
parents bb752d6f 2e5bcc16
No related branches found
No related tags found
No related merge requests found
### General cluster availability ###
# Fires when the number of etcd-k8s targets that are down exceeds
# (total targets / 2 - 1), i.e. losing one more peer would make the
# cluster unavailable. The condition must hold for 3 minutes.
ALERT InsufficientPeers
  IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
  FOR 3m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary     = "Etcd cluster small",
    description = "If one more etcd peer goes down the cluster will be unavailable",
  }
### HTTP requests alerts ###
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
# The failure ratio is multiplied by 100 so that {{ $value }} is an actual
# percentage, matching the "%" printed in the description. The firing
# condition is unchanged (ratio > 0.01 is the same as percentage > 1).
# NOTE(review): `sum by(method)` keeps only the method label, so
# {{ $labels.instance }} in the description expands to an empty string;
# confirm whether per-instance aggregation was intended.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
# The failure ratio is multiplied by 100 so that {{ $value }} is an actual
# percentage, matching the "%" printed in the description. The firing
# condition is unchanged (ratio > 0.05 is the same as percentage > 5).
# NOTE(review): `sum by(method)` keeps only the method label, so
# {{ $labels.instance }} in the description expands to an empty string;
# confirm whether per-instance aggregation was intended.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
# alert if 50% of requests get a 4xx response
# The 4xx ratio is multiplied by 100 so that {{ $value }} is an actual
# percentage, matching the "%" printed in the description. The firing
# condition is unchanged (ratio > 0.5 is the same as percentage > 50).
# NOTE(review): `sum by(method)` keeps only the method label, so
# {{ $labels.instance }} in the description expands to an empty string;
# confirm whether per-instance aggregation was intended.
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
  }
# alert if the 99th percentile of HTTP requests take more than 150ms
# The quantile is computed from the 5m rate of the duration histogram buckets
# and must stay above 0.15 for 10 minutes before the alert fires.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    # Alert templates expose series labels as $labels (plural).
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
### File descriptor alerts ###
# Recording rule: ratio of process_open_fds to process_max_fds, stored under
# the name instance:fd_utilization and consumed by the FdExhaustionClose alerts.
instance:fd_utilization = process_open_fds / process_max_fds
# Warning: a linear extrapolation of the last hour of
# instance:fd_utilization, projected 4 hours (3600 * 4 seconds) ahead,
# exceeds 1. The condition must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
  }
# Critical: a linear extrapolation of the last 10 minutes of
# instance:fd_utilization, projected 1 hour (3600 seconds) ahead,
# exceeds 1. The condition must hold for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary     = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
  }
### etcd proposal alerts ###
# alert if there are several failed proposals within an hour
# Fires as soon as the failed-proposal counter has increased by more than 5
# over the last hour (there is no FOR clause, so there is no hold period).
# The selector uses job="etcd-k8s", the same job label as the other
# job-scoped etcd rules in this file.
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
### etcd disk io latency alerts ###
# alert if 99th percentile of fsync durations is higher than 500ms
# The quantile is computed from the 5m rate of the WAL fsync duration
# histogram buckets and must stay above 0.5 for 10 minutes.
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
apiVersion: v1 apiVersion: v1
data: data:
# etcd2 alerting rules stored as one double-quoted scalar: the physical line
# breaks below are YAML folds, the real newlines are the \n escapes.
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF 100 * sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
1\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF 100 * sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 5\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
HighNumberOfFailedHTTPRequests\n IF 100 * sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 50\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
requests\",\n description = \"on etcd instance {{ $labels.instance }} HTTP
requests to {{ $labels.method }} are slow\",\n }\n\n### File descriptor alerts
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd-k8s\"}[1h])
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
= \"a high number of failed proposals within the etcd cluster are happening\",\n
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"high fsync durations\",\n description = \"etcd instance {{
$labels.instance }} fsync durations are high\",\n }\n"
kubernetes.rules: |+ kubernetes.rules: |+
### Container resources ### ### Container resources ###
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment