diff --git a/infrastructure/flux-system/alerts.yaml b/infrastructure/flux-system/alerts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1633ef2fb3ed26e37637f70090c92acf65f67593
--- /dev/null
+++ b/infrastructure/flux-system/alerts.yaml
@@ -0,0 +1,15 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: flux-system-alerts
+spec:
+  groups:
+    - name: GitOpsToolkit
+      rules:
+        - alert: ReconciliationFailure
+          expr: max(gotk_reconcile_condition{status="False",type="Ready"}) by (exported_namespace, name, kind) + on(exported_namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"}) by (exported_namespace, name, kind)) * 2 == 1
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            summary: '{{ $labels.kind }} {{ $labels.exported_namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes.'
diff --git a/infrastructure/flux-system/kustomization.yaml b/infrastructure/flux-system/kustomization.yaml
index f06c5e82e3780a8cee50cccf45cc34d7e1a5b536..3d68f553b0ec398ca9f9012ab32bccf69681d3e7 100644
--- a/infrastructure/flux-system/kustomization.yaml
+++ b/infrastructure/flux-system/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 namespace: flux-system
 resources:
   - monitoring.yaml
+  - alerts.yaml
   - ../../shared/networkpolicies/allow-from-monitoring.yaml
 patchesStrategicMerge:
   - networkpolicy.yaml