diff --git a/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet index 74cfb4f23c544fdc76eab8543e49f221fe9dff4d..d022c599b0add8bfe3901a8501a0b0efd7efe28b 100644 --- a/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet @@ -7,7 +7,8 @@ { alert: 'NodeNetworkInterfaceFlapping', annotations: { - message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', + summary: 'Network interface is often changing its status', + description: 'Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', }, expr: ||| changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2 diff --git a/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml b/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml deleted file mode 100644 index 532bb895561757e0024a66ef045dbce11f90afc8..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/components/mixin/alerts/tests.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# TODO(metalmatze): This file is temporarily saved here for later reference -# until we find out how to integrate the tests into our jsonnet stack. 
- -rule_files: - - rules.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' 
- - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' 
- - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' 
diff --git a/jsonnet/kube-prometheus/platforms/eks.libsonnet b/jsonnet/kube-prometheus/platforms/eks.libsonnet index 76eeb385526ec02266087cc482d216f24cc43209..b81d1ccef964624ddf1e745597e5bd042486480f 100644 --- a/jsonnet/kube-prometheus/platforms/eks.libsonnet +++ b/jsonnet/kube-prometheus/platforms/eks.libsonnet @@ -97,10 +97,11 @@ severity: 'critical', }, annotations: { - message: 'Instance {{ $labels.instance }} has less than 10 IPs available.', + summary: 'EKS CNI is running low on available IPs', + description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs, }, 'for': $.values.eks.minimumAvailableIPsTime, - alert: 'EksAvailableIPs', + alert: 'EksCNILowAvailableIPs', }, ], }, diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml index e3ee47faec50fb673badaa40cd1afd830fe3104e..d203dd915bb876cae85231402d3f393668ca19ef 100644 --- a/manifests/kube-prometheus-prometheusRule.yaml +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -39,8 +39,9 @@ spec: rules: - alert: NodeNetworkInterfaceFlapping annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping + summary: Network interface is often changing its status expr: | changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m