Commit 0ae6c98a authored by Antoine Legrand

Add alert if no samples are ingested

parent 80b2a511
@@ -8,6 +8,7 @@ groups:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -16,6 +17,7 @@ groups:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -25,6 +27,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -34,6 +37,7 @@ groups:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -42,6 +46,7 @@ groups:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -51,6 +56,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -60,6 +66,7 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
       summary: Prometheus has issues compacting sample blocks
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -69,3 +76,12 @@ groups:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
   - alert: PrometheusNotIngestingSamples
     expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
     for: 10m
     labels:
       severity: warning
     annotations:
       description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
       summary: "Prometheus isn't ingesting samples"
@@ -539,6 +539,7 @@ data:
       severity: warning
     annotations:
       description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
   - alert: PrometheusNotificationQueueRunningFull
     expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
     for: 10m
@@ -547,6 +548,7 @@ data:
     annotations:
       description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
         $labels.pod}}
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.01
@@ -556,6 +558,7 @@ data:
     annotations:
       description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
   - alert: PrometheusErrorSendingAlerts
     expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
       > 0.03
@@ -565,6 +568,7 @@ data:
     annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
         $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
   - alert: PrometheusNotConnectedToAlertmanagers
     expr: prometheus_notifications_alertmanagers_discovered < 1
     for: 10m
@@ -573,6 +577,7 @@ data:
     annotations:
       description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
         to any Alertmanagers
   - alert: PrometheusTSDBReloadsFailing
     expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
     for: 12h
@@ -582,6 +587,7 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         reload failures over the last four hours.'
       summary: Prometheus has issues reloading data blocks from disk
   - alert: PrometheusTSDBCompactionsFailing
     expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
     for: 12h
@@ -591,6 +597,7 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
         compaction failures over the last four hours.'
       summary: Prometheus has issues compacting sample blocks
   - alert: PrometheusTSDBWALCorruptions
     expr: tsdb_wal_corruptions_total > 0
     for: 4h
@@ -600,3 +607,12 @@ data:
       description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
         log (WAL).'
       summary: Prometheus write-ahead log is corrupted
   - alert: PrometheusNotIngestingSamples
     expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
     for: 10m
     labels:
       severity: warning
     annotations:
       description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
       summary: "Prometheus isn't ingesting samples"