Unverified commit acea5efd authored by Paweł Krupa, committed by GitHub

Merge pull request #1268 from paulfantom/alerts-best-practices

Alerts best practices
Parents: cd4438ed 46eb1713
@@ -7,7 +7,8 @@
   {
     alert: 'NodeNetworkInterfaceFlapping',
     annotations: {
-      message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
+      summary: "Network interface is often changin it's status",
+      description: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}',
     },
     expr: |||
       changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
...
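The %(nodeExporterSelector)s and %(hostNetworkInterfaceSelector)s placeholders in the expr above are jsonnet format-string fields: the ||| ... ||| block is a multi-line string that is filled in from a configuration object when the mixin is rendered. Below is a minimal, self-contained sketch of that mechanism; the selector values and file name are illustrative assumptions, not taken from this repository.

// sketch.jsonnet -- hypothetical file; evaluate with `jsonnet sketch.jsonnet`
local config = {
  // Example selectors; real deployments substitute their own values.
  nodeExporterSelector: 'job="node-exporter"',
  hostNetworkInterfaceSelector: 'device!~"veth.+"',
};

{
  // The trailing `% config` fills the %(name)s placeholders, producing the
  // PromQL string that ends up in the generated PrometheusRule manifest
  // shown further down in this diff.
  expr: |||
    changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
  ||| % config,
}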
# TODO(metalmatze): This file is temporarily saved here for later reference
# until we find out how to integrate the tests into our jsonnet stack.
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
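As the TODO note at the top of this file says, these rule unit tests are not yet integrated into the jsonnet stack, but they can be run by hand with Prometheus's promtool. A sketch of that, assuming the rendered alerting rules are written to the rules.yaml referenced by rule_files and that this test file is saved next to them as tests.yaml (both file names are assumptions):

# rules.yaml holds the rendered alerting rules; tests.yaml is this test file.
promtool test rules tests.yaml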
@@ -97,10 +97,11 @@
         severity: 'critical',
       },
       annotations: {
-        message: 'Instance {{ $labels.instance }} has less than 10 IPs available.',
+        summary: 'EKS CNI is running low on available IPs',
+        description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs,
       },
       'for': $.values.eks.minimumAvailableIPsTime,
-      alert: 'EksAvailableIPs',
+      alert: 'EksCNILowAvailableIPs',
     },
   ],
 },
...
@@ -39,8 +39,9 @@ spec:
     rules:
     - alert: NodeNetworkInterfaceFlapping
       annotations:
-        message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
+        description: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
         runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
+        summary: Network interface is often changin it's status
       expr: |
         changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
       for: 2m
...