Commit 9764d157 authored by Antoine Legrand, committed by GitHub

Merge pull request #1010 from coreos/no_ingest_alert

Add alert if samples aren't ingested
Parents: eb636277, 0ae6c98a
@@ -8,6 +8,7 @@ groups:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -16,6 +17,7 @@ groups:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -25,6 +27,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -34,6 +37,7 @@ groups:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -42,6 +46,7 @@ groups:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
+
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -51,6 +56,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
+
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -60,6 +66,7 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -69,3 +76,12 @@ groups:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"
@@ -539,6 +539,7 @@ data:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
@@ -547,6 +548,7 @@ data:
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
@@ -556,6 +558,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
@@ -565,6 +568,7 @@ data:
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
@@ -573,6 +577,7 @@ data:
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
+
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
@@ -582,6 +587,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
+
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
@@ -591,6 +597,7 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
+
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
@@ -600,3 +607,12 @@ data:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted
+
+  - alert: PrometheusNotIngestingSamples
+    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+      summary: "Prometheus isn't ingesting samples"