From dcf521d1dc0d0eb51d71a148f670454538c27585 Mon Sep 17 00:00:00 2001 From: Sheogorath <sheogorath@shivering-isles.com> Date: Fri, 11 Feb 2022 00:29:44 +0100 Subject: [PATCH] feat(flux-system): Add alert for failing reconciliations This patch adds an alert for failing flux reconciliations that should prevent failing flux deployments from going unnoticed. --- infrastructure/flux-system/alerts.yaml | 15 +++++++++++++++ infrastructure/flux-system/kustomization.yaml | 1 + 2 files changed, 16 insertions(+) create mode 100644 infrastructure/flux-system/alerts.yaml diff --git a/infrastructure/flux-system/alerts.yaml b/infrastructure/flux-system/alerts.yaml new file mode 100644 index 000000000..1633ef2fb --- /dev/null +++ b/infrastructure/flux-system/alerts.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: flux-system-alerts +spec: + groups: + - name: GitOpsToolkit + rules: + - alert: ReconciliationFailure + expr: max(gotk_reconcile_condition{status="False",type="Ready"}) by (exported_namespace, name, kind) + on(exported_namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"}) by (exported_namespace, name, kind)) * 2 == 1 + for: 10m + labels: + severity: critical + annotations: + summary: '{{ $labels.kind }} {{ $labels.exported_namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes.' diff --git a/infrastructure/flux-system/kustomization.yaml b/infrastructure/flux-system/kustomization.yaml index f06c5e82e..3d68f553b 100644 --- a/infrastructure/flux-system/kustomization.yaml +++ b/infrastructure/flux-system/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: flux-system resources: - monitoring.yaml + - alerts.yaml - ../../shared/networkpolicies/allow-from-monitoring.yaml patchesStrategicMerge: - networkpolicy.yaml -- GitLab