Commit 9764d157 authored by Antoine Legrand, committed by GitHub

Merge pull request #1010 from coreos/no_ingest_alert

Add alert if samples aren't ingested
Parents: eb636277, 0ae6c98a
@@ -8,6 +8,7 @@ groups:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -16,6 +17,7 @@ groups:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -25,6 +27,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -34,6 +37,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -42,6 +46,7 @@ groups:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
+
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -51,6 +56,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
+
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -60,6 +66,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -69,3 +76,12 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
@@ -539,6 +539,7 @@ data:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -547,6 +548,7 @@ data:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -556,6 +558,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -565,6 +568,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -573,6 +577,7 @@ data:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
+
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -582,6 +587,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
+
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -591,6 +597,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -600,3 +607,12 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"