From 64db049d3a886f39816b2778ed305956c9c424dd Mon Sep 17 00:00:00 2001 From: Frederic Branczyk <fbranczyk@gmail.com> Date: Mon, 28 May 2018 16:54:39 +0200 Subject: [PATCH] kube-prometheus: Migrate kube-prometheus alerts to jsonnet --- .../prometheus/rules/alertmanager.rules.yaml | 33 ---- assets/prometheus/rules/general.rules.yaml | 39 ----- assets/prometheus/rules/node.rules.yaml | 47 ----- assets/prometheus/rules/prometheus.rules.yaml | 101 ----------- .../alerts/alertmanager.libsonnet | 53 ++++++ .../kube-prometheus/alerts/alerts.libsonnet | 4 + .../kube-prometheus/alerts/general.libsonnet | 34 ++++ jsonnet/kube-prometheus/alerts/node.libsonnet | 39 +++++ .../alerts/prometheus.libsonnet | 151 ++++++++++++++++ .../kube-prometheus/kube-prometheus.libsonnet | 28 ++- jsonnet/kube-prometheus/rules/rules.libsonnet | 39 +++++ manifests/grafana-dashboardDefinitions.yaml | 26 +-- manifests/prometheus-rules.yaml | 161 +++++++++++++++--- 13 files changed, 497 insertions(+), 258 deletions(-) delete mode 100644 assets/prometheus/rules/alertmanager.rules.yaml delete mode 100644 assets/prometheus/rules/general.rules.yaml delete mode 100644 assets/prometheus/rules/node.rules.yaml delete mode 100644 assets/prometheus/rules/prometheus.rules.yaml create mode 100644 jsonnet/kube-prometheus/alerts/alertmanager.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/alerts.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/general.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/node.libsonnet create mode 100644 jsonnet/kube-prometheus/alerts/prometheus.libsonnet create mode 100644 jsonnet/kube-prometheus/rules/rules.libsonnet diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml deleted file mode 100644 index 5e51f75b..00000000 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ /dev/null @@ -1,33 +0,0 @@ -groups: -- name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) - GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", - "alertmanager-$1", "alertmanager", "(.*)") != 1 - for: 5m - labels: - severity: critical - annotations: - description: The configuration of the instances of the Alertmanager cluster - `{{$labels.service}}` are out of sync. - summary: Configuration out of sync - - alert: AlertmanagerDownOrMissing - expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", - "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 - for: 5m - labels: - severity: warning - annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. - summary: Alertmanager down or missing - - alert: AlertmanagerFailedReload - expr: alertmanager_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - summary: Alertmanager's configuration reload failed diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml deleted file mode 100644 index 84ce6b47..00000000 --- a/assets/prometheus/rules/general.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groups: -- name: general.rules - rules: - - alert: TargetDown - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' - summary: Targets are down - - alert: DeadMansSwitch - expr: vector(1) - labels: - severity: none - annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting - pipeline is functional. - summary: Alerting DeadMansSwitch - - record: fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[10m], 3600) > 1 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml deleted file mode 100644 index e678ca84..00000000 --- a/assets/prometheus/rules/node.rules.yaml +++ /dev/null @@ -1,47 +0,0 @@ -groups: -- name: node.rules - rules: - - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) - BY (instance) - - record: instance:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) - BY (instance) - - record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) - - record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) - GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) - - alert: NodeExporterDown - expr: absent(up{job="node-exporter"} == 1) - for: 10m - labels: - severity: warning - annotations: - description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery - summary: Prometheus could not scrape a node-exporter - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 - for: 30m - labels: - severity: warning - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 24 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 24 hours - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 - for: 10m - labels: - severity: critical - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 2 hours diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml deleted file mode 100644 index da699c32..00000000 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ /dev/null @@ -1,101 +0,0 @@ -groups: -- name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - expr: prometheus_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} - summary: Reloading Promehteus' configuration failed - - - alert: PrometheusNotificationQueueRunningFull - expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity - for: 10m - labels: - severity: warning - annotations: - description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ - $labels.pod}} - summary: Prometheus' alert notification queue is running full - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alert from Prometheus - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.03 - for: 10m - labels: - severity: critical - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alerts from Prometheus - - - alert: PrometheusNotConnectedToAlertmanagers - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 10m - labels: - severity: warning - annotations: - description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected - to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - - - alert: PrometheusTSDBReloadsFailing - expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - - - alert: PrometheusTSDBCompactionsFailing - expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - compaction failures over the last four hours.' - summary: Prometheus has issues compacting sample blocks - - - alert: PrometheusTSDBWALCorruptions - expr: tsdb_wal_corruptions_total > 0 - for: 4h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead - log (WAL).' - summary: Prometheus write-ahead log is corrupted - - - alert: PrometheusNotIngestingSamples - expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 - for: 10m - labels: - severity: warning - annotations: - description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." - summary: "Prometheus isn't ingesting samples" - - - alert: PrometheusTargetScapesDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 10m - labels: - severity: warning - annotations: - description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" - summary: Prometheus has many samples rejected diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet new file mode 100644 index 00000000..d283cc18 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -0,0 +1,53 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'alertmanager.rules', + rules: [ + { + alert: 'AlertmanagerConfigInconsistent', + annotations: { + description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', + summary: 'Configuration out of sync', + }, + expr: ||| + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'AlertmanagerDownOrMissing', + annotations: { + description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.', + summary: 'Alertmanager down or missing', + }, + expr: ||| + label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'AlertmanagerFailedReload', + annotations: { + description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", + summary: "Alertmanager's configuration reload failed", + }, + expr: ||| + alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet new file mode 100644 index 00000000..19568a24 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -0,0 +1,4 @@ +(import 'alertmanager.libsonnet') + +(import 'general.libsonnet') + +(import 'node.libsonnet') + +(import 'prometheus.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet new file mode 100644 index 00000000..6f3e4534 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -0,0 +1,34 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'general.rules', + rules: [ + { + alert: 'TargetDown', + annotations: { + description: '{{ $value }}% of {{ $labels.job }} targets are down.', + summary: 'Targets are down', + }, + expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10', + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'DeadMansSwitch', + annotations: { + description: 'This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.', + summary: 'Alerting DeadMansSwitch', + }, + expr: 'vector(1)', + labels: { + severity: 'none', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet new file mode 100644 index 00000000..f5387a99 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'kube-prometheus-node-alerting.rules', + rules: [ + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 24 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0 + ||| % $._config, + 'for': '30m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 2 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet new file mode 100644 index 00000000..32d8262b --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -0,0 +1,151 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus.rules', + rules: [ + { + alert: 'PrometheusConfigReloadFailed', + annotations: { + description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}", + summary: "Reloading Promehteus' configuration failed", + }, + expr: ||| + prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotificationQueueRunningFull', + annotations: { + description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}", + summary: "Prometheus' alert notification queue is running full", + }, + expr: ||| + predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s} + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alert from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alerts from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'PrometheusNotConnectedToAlertmanagers', + annotations: { + description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers', + summary: 'Prometheus is not connected to any Alertmanagers', + }, + expr: ||| + prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBReloadsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.', + summary: 'Prometheus has issues reloading data blocks from disk', + }, + expr: ||| + increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBCompactionsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.', + summary: 'Prometheus has issues compacting sample blocks', + }, + expr: ||| + increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBWALCorruptions', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).', + summary: 'Prometheus write-ahead log is corrupted', + }, + expr: ||| + tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + ||| % $._config, + 'for': '4h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotIngestingSamples', + annotations: { + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + summary: "Prometheus isn't ingesting samples", + }, + expr: ||| + rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTargetScapesDuplicate', + annotations: { + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', + summary: 'Prometheus has many samples rejected', + }, + expr: ||| + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index e79b7567..6c1636de 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -6,7 +6,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; (import 'alertmanager/alertmanager.libsonnet') + (import 'prometheus-operator/prometheus-operator.libsonnet') + (import 'prometheus/prometheus.libsonnet') + -(import 'kubernetes-mixin/mixin.libsonnet') + { +(import 'kubernetes-mixin/mixin.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'rules/rules.libsonnet') + { kubePrometheus+:: { namespace: k.core.v1.namespace.new($._config.namespace), }, @@ -14,11 +16,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', - kubeStateMetricsSelector: 'job="kube-state-metrics"', cadvisorSelector: 'job="kubelet"', - nodeExporterSelector: 'job="node-exporter"', kubeletSelector: 'job="kubelet"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', notKubeDnsSelector: 'job!="kube-dns"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + + alertmanagerSelector: 'job="alertmanager-main"', + prometheusSelector: 'job="prometheus-k8s"', + prometheusOperatorSelector: 'job="prometheus-operator"', + + jobs: { + Kubelet: $._config.kubeletSelector, + KubeScheduler: $._config.kubeSchedulerSelector, + KubeControllerManager: $._config.kubeControllerManagerSelector, + KubeAPI: $._config.kubeApiserverSelector, + KubeStateMetrics: $._config.kubeStateMetricsSelector, + NodeExporter: $._config.nodeExporterSelector, + Alertmanager: $._config.alertmanagerSelector, + Prometheus: $._config.prometheusSelector, + PrometheusOperator: $._config.prometheusOperatorSelector, + }, prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/rules/rules.libsonnet new file mode 100644 index 00000000..ec3a331e --- /dev/null +++ b/jsonnet/kube-prometheus/rules/rules.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'kube-prometheus-node-recording.rules', + rules: [ + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + record: 'instance:node_cpu:rate:sum', + }, + { + expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', + record: 'instance:node_filesystem_usage:sum', + }, + { + expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', + record: 'instance:node_network_receive_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', + record: 'instance:node_network_transmit_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', + record: 'instance:node_cpu:ratio', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', + record: 'cluster:node_cpu:sum_rate5m', + }, + { + expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))', + record: 'cluster:node_cpu:ratio', + }, + ], + }, + ], + }, +} diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af7e2749..f4058562 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3868,7 +3868,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)", + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4097,7 +4097,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4228,7 +4228,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4457,7 +4457,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -5206,7 +5206,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -6066,7 +6066,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6145,7 +6145,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6224,7 +6224,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6317,7 +6317,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6397,7 +6397,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6477,7 +6477,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6557,7 +6557,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 35aaa927..d916ff29 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -49,13 +49,13 @@ data: without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) without(instance, + sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": - |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) @@ -122,20 +122,49 @@ data: by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-absent\"\n - \ \"rules\": \n - \"alert\": \"KubeAPIDown\"\n \"annotations\": \n \"message\": - \"KubeAPI has disappeared from Prometheus target discovery.\"\n \"expr\": |\n - \ absent(up{job=\"kube-apiserver\"} == 1)\n \"for\": \"15m\"\n \"labels\": - \n \"severity\": \"critical\"\n - \"alert\": \"KubeControllerManagerDown\"\n - \ \"annotations\": \n \"message\": \"KubeControllerManager has disappeared - from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-controller-manager\"} + \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n + \ \"rules\": \n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m])) + BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\": + \"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"})) + BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\": + \"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n + \ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\": + \"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m])) + WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, + cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\": + \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\": + \"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu) + BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\": + \"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\": + \n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIDown\"\n + \ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus + target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\": + \"KubeControllerManager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeSchedulerDown\"\n + \ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeSchedulerDown\"\n \"annotations\": \n \"message\": - \"KubeScheduler has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": + \ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\": + \"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": \n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n + \ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n + \ \"annotations\": \n \"message\": \"NodeExporter has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus + has disappeared from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\": + \"PrometheusOperator has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container @@ -239,28 +268,116 @@ data: 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) + \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by - (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) - < 86400\n \"labels\": \n \"severity\": \"critical\"" + (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) + < 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n + \ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\": + \n \"description\": \"The configuration of the instances of the Alertmanager + cluster `{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration + out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"}) + BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\": + \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n + \ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers + are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\": + \"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT() + sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\": + \n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n + \ \"annotations\": \n \"description\": \"Reloading Alertmanager's configuration + has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\": + \"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"} + == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- + \"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\": + \n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n + \ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0) + BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n + \ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\": + \n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire + Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n + \ \"expr\": \"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n- + \"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\": + \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device + {{$labels.device}} on node {{$labels.instance}} is running full within the next + 24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk + is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h], + 3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": + \"device {{$labels.device}} on node {{$labels.instance}} is running full within + the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node + disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m], + 3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n- + \"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n + \ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration + has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading + Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"} + == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\": + \n \"description\": \"Prometheus' alert notification queue is running full + for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus' + alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m], + 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\": + \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusErrorSendingAlerts\"\n + \ \"annotations\": \n \"description\": \"Errors while sending alerts from + Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n + \ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\": + |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": + \"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\": + \"Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} + to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while + sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n + \ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is + not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected + to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"} + < 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures + over the last four hours.\"\n \"summary\": \"Prometheus has issues reloading + data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction + failures over the last four hours.\"\n \"summary\": \"Prometheus has issues + compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n + \ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\": + |\n tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\": + \"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n + \ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace + }}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus + isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m]) + <= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\": + \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate + timestamps but different values\"\n \"summary\": \"Prometheus has many samples + rejected\"\n \"expr\": |\n increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m]) + > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"" kind: ConfigMap metadata: labels: -- GitLab