diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index e27aa281c1f9a02569f91f09a140d451d2eccfe1..e006ba9be56dec9af64ca433e130939cf8151c34 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -8,6 +8,7 @@ groups: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity for: 10m @@ -16,6 +17,7 @@ groups: annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01 @@ -25,6 +27,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03 @@ -34,6 +37,7 @@ groups: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 for: 10m @@ -42,6 +46,7 @@ groups: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 for: 12h @@ -51,6 +56,7 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 for: 12h @@ -60,6 +66,7 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions expr: tsdb_wal_corruptions_total > 0 for: 4h @@ -69,3 +76,12 @@ groups: description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted + + - alert: PrometheusNotIngestingSamples + expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 + for: 10m + labels: + severity: warning + annotations: + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." + summary: "Prometheus isn't ingesting samples" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index d563a57172a41aa2d5b623f3faf787ef0660752d..f6b2b8f8550719f17601dafb76f1ba600e92489d 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -539,6 +539,7 @@ data: severity: warning annotations: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity for: 10m @@ -547,6 +548,7 @@ data: annotations: description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01 @@ -556,6 +558,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03 @@ -565,6 +568,7 @@ data: annotations: description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers expr: prometheus_notifications_alertmanagers_discovered < 1 for: 10m @@ -573,6 +577,7 @@ data: annotations: description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 for: 12h @@ -582,6 +587,7 @@ data: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 for: 12h @@ -591,6 +597,7 @@ data: description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions expr: tsdb_wal_corruptions_total > 0 for: 4h @@ -600,3 +607,12 @@ data: description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted + + - alert: PrometheusNotIngestingSamples + expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 + for: 10m + labels: + severity: warning + annotations: + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." + summary: "Prometheus isn't ingesting samples"