From dcf521d1dc0d0eb51d71a148f670454538c27585 Mon Sep 17 00:00:00 2001
From: Sheogorath <sheogorath@shivering-isles.com>
Date: Fri, 11 Feb 2022 00:29:44 +0100
Subject: [PATCH] feat(flux-system): Add alert for failing reconciliations

This patch adds an alert for failing Flux reconciliations so that
failing Flux deployments do not go unnoticed.
---
 infrastructure/flux-system/alerts.yaml        | 15 +++++++++++++++
 infrastructure/flux-system/kustomization.yaml |  1 +
 2 files changed, 16 insertions(+)
 create mode 100644 infrastructure/flux-system/alerts.yaml

diff --git a/infrastructure/flux-system/alerts.yaml b/infrastructure/flux-system/alerts.yaml
new file mode 100644
index 000000000..1633ef2fb
--- /dev/null
+++ b/infrastructure/flux-system/alerts.yaml
@@ -0,0 +1,15 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # alerting rules declared as a custom resource; presumably consumed by the Prometheus Operator - confirm it is installed
+metadata:
+  name: flux-system-alerts  # no namespace set here; the kustomization.yaml that lists this file declares namespace: flux-system
+spec:
+  groups:
+    - name: GitOpsToolkit
+      rules:
+        - alert: ReconciliationFailure  # expr computes (Ready is False) + 2 * (Deleted) per kind/exported_namespace/name and keeps results equal to 1; assuming 0/1 gauge values, that is "not ready and not deleted"
+          expr: max(gotk_reconcile_condition{status="False",type="Ready"}) by (exported_namespace, name, kind) + on(exported_namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"}) by (exported_namespace, name, kind)) * 2 == 1
+          for: 10m  # the expression must keep matching for 10 minutes before the alert fires
+          labels:
+            severity: critical  # extra label attached to the alert; how it is routed is not visible here
+          annotations:
+            summary: '{{ $labels.kind }} {{ $labels.exported_namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes.'
diff --git a/infrastructure/flux-system/kustomization.yaml b/infrastructure/flux-system/kustomization.yaml
index f06c5e82e..3d68f553b 100644
--- a/infrastructure/flux-system/kustomization.yaml
+++ b/infrastructure/flux-system/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 namespace: flux-system
 resources:
   - monitoring.yaml
+  - alerts.yaml
   - ../../shared/networkpolicies/allow-from-monitoring.yaml
 patchesStrategicMerge:
   - networkpolicy.yaml
-- 
GitLab