prometheus-operator / kube-prometheus (GitHub Mirror)

Commit 5291bc32
Authored 8 years ago by Frederic Branczyk, committed by GitHub 8 years ago
Merge pull request #4 from brancz/etcd2-alerts

add etcd2 alerts

Parents: bb752d6f, 2e5bcc16
Changes: 2 changed files, 173 additions and 0 deletions

  assets/alerts/etcd2.rules                        +121 −0
  manifests/prometheus/prometheus-k8s-rules.yaml    +52 −0
assets/alerts/etcd2.rules  (new file, mode 100644)  +121 −0
### General cluster availability ###
# alert if another failed peer will result in an unavailable cluster
ALERT InsufficientPeers
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Etcd cluster small",
description = "If one more etcd peer goes down the cluster will be unavailable",
}
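The threshold encodes etcd's quorum arithmetic: a cluster of N members tolerates at most floor((N-1)/2) failures, so the alert fires while quorum still holds but one additional failure would break it. A minimal sketch of that arithmetic (plain Python, illustrative only, not part of the committed rules):

# Sketch of the InsufficientPeers condition: fires when one more failed
# peer would cost the cluster its quorum. Illustration only.
def insufficient_peers(total_members: int, down_members: int) -> bool:
    # PromQL: count(up == 0) > count(up) / 2 - 1
    return down_members > total_members / 2 - 1

# 3-member cluster: tolerates 1 failure, so a single down peer already
# means the next failure breaks quorum.
assert insufficient_peers(3, 1) is True       # 1 > 0.5
# 5-member cluster: tolerates 2 failures; one down peer is still safe.
assert insufficient_peers(5, 1) is False      # 1 > 1.5 is false
assert insufficient_peers(5, 2) is True       # 2 > 1.5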
### HTTP requests alerts ###
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
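The expression is a ratio of two rate() aggregations, so $value is the per-method failure fraction over the 5-minute window (a raw number between 0 and 1, e.g. 0.015, not a percentage). A rough sketch of the same calculation with invented counter samples (Python, illustrative only):

# Illustration of the failure-ratio check, using made-up counter values.
# rate() over 5m is approximated here as (last - first) / window_seconds.
def rate(first: float, last: float, window_s: float = 300.0) -> float:
    return (last - first) / window_s

# Hypothetical etcd_http_failed_total / etcd_http_received_total samples
# for a single method over a 5-minute window.
failed_rate = rate(first=120.0, last=123.0)           # 3 failures in 5m
received_rate = rate(first=10_000.0, last=10_200.0)   # 200 requests in 5m
ratio = failed_rate / received_rate
print(f"failure ratio: {ratio:.3f}")  # 0.015 -> above the 0.01 threshold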
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if 50% of requests get a 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of HTTP requests takes more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
}
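histogram_quantile() works on the cumulative le buckets of the duration histogram: it finds the bucket containing the 99th-percentile rank and interpolates linearly inside it. A simplified sketch of that interpolation with made-up bucket rates (Python, illustrative only; Prometheus' real implementation also handles the +Inf bucket and several edge cases):

# Simplified version of histogram_quantile(): cumulative bucket counts
# keyed by upper bound ("le"), linear interpolation inside the bucket
# that contains the requested rank. Bucket values below are invented.
def histogram_quantile(q: float, buckets: list[tuple[float, float]]) -> float:
    buckets = sorted(buckets)            # (upper_bound, cumulative_count)
    total = buckets[-1][1]
    rank = q * total
    lower_bound, lower_count = 0.0, 0.0
    for upper_bound, count in buckets:
        if count >= rank:
            # interpolate the position of `rank` within this bucket
            return lower_bound + (upper_bound - lower_bound) * (
                (rank - lower_count) / (count - lower_count)
            )
        lower_bound, lower_count = upper_bound, count
    return buckets[-1][0]

# Hypothetical per-second bucket rates for etcd HTTP request durations.
buckets = [(0.05, 800.0), (0.1, 950.0), (0.15, 980.0), (0.5, 1000.0)]
print(histogram_quantile(0.99, buckets))  # ~0.325s -> would trip the 150ms alert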
### File descriptor alerts ###
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
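predict_linear() fits a least-squares line through the instance:fd_utilization samples in the lookback window and extrapolates it the requested number of seconds ahead, so the alert fires when the projected utilization would exceed 1 (every available descriptor in use). A rough sketch under that reading, with invented sample data (Python, illustrative only; the real function extrapolates from the query evaluation timestamp):

# Rough sketch of predict_linear(): least-squares fit over recent samples,
# then extrapolate `horizon_s` seconds past the last sample. The sample
# data below is invented; real values come from instance:fd_utilization.
def predict_linear(samples: list[tuple[float, float]], horizon_s: float) -> float:
    # samples: (unix_timestamp, value)
    n = len(samples)
    mean_t = sum(t for t, _ in samples) / n
    mean_v = sum(v for _, v in samples) / n
    slope = sum((t - mean_t) * (v - mean_v) for t, v in samples) / sum(
        (t - mean_t) ** 2 for t, _ in samples
    )
    intercept = mean_v - slope * mean_t
    return intercept + slope * (samples[-1][0] + horizon_s)

# fd utilization climbing from 0.70 to 0.75 over the last hour:
samples = [(t * 600.0, 0.70 + 0.05 * t / 6) for t in range(7)]
print(predict_linear(samples, 3600 * 4))   # ~0.95: still below 1, no warning yet
print(predict_linear(samples, 3600 * 24))  # ~1.95: a 24h horizon projects exhaustion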
### etcd proposal alerts ###
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of failed proposals within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
}
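increase() over the one-hour range is essentially the per-second rate() scaled back up by the window (with counter resets handled by Prometheus), so the alert asks whether more than five proposal failures accumulated in the last hour. A small sketch with invented counter values (Python, illustrative only):

# increase(metric[1h]) ~= rate(metric[1h]) * 3600; values below are invented.
window_s = 3600.0
counter_start, counter_end = 42.0, 49.0  # etcd_server_proposal_failed_total samples
per_second = (counter_end - counter_start) / window_s
increase_1h = per_second * window_s      # 7.0 failed proposals in the last hour
print(increase_1h > 5)                   # True -> HighNumberOfFailedProposals fires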
### etcd disk io latency alerts ###
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "ectd instance {{ $labels.instance }} fync durations are high",
}
manifests/prometheus/prometheus-k8s-rules.yaml  +52 −0
apiVersion: v1
data:
  etcd2.rules: "### General cluster availability ###\n\n# alert if another failed peer will result in an unavailable cluster\nALERT InsufficientPeers\n  IF count(up{job=\"etcd-k8s\"} == 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n  FOR 3m\n  LABELS {\n    severity = \"critical\"\n  }\n  ANNOTATIONS {\n    summary = \"Etcd cluster small\",\n    description = \"If one more etcd peer goes down the cluster will be unavailable\",\n  }\n\n  ... (remaining rules identical to assets/alerts/etcd2.rules above) ...\n"
  kubernetes.rules: |+
    ### Container resources ###
    ...