From 6c1949a1a5bd7f4f87d9ce071c3e22ef41e0f602 Mon Sep 17 00:00:00 2001 From: Sheogorath <sheogorath@shivering-isles.com> Date: Sun, 11 Feb 2024 12:45:05 +0100 Subject: [PATCH] chore(longhorn): Extend alert trigger duration This patch extends the trigger duration of the longhorn-share alert, which was hotfixed with the hotfix-7183 cronjob. Since the cronjob runs every 5 minutes, this patch extends the alert's trigger duration from 5 to 10 minutes to catch possible hotfix failures. --- infrastructure/longhorn/monitoring.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/longhorn/monitoring.yaml b/infrastructure/longhorn/monitoring.yaml index 96eed4097..081841c00 100644 --- a/infrastructure/longhorn/monitoring.yaml +++ b/infrastructure/longhorn/monitoring.yaml @@ -108,7 +108,7 @@ spec: description: Longhorn share manager count is off by {{$value}}. This is likely due to a recent bug in Longhorn. https://github.com/longhorn/longhorn/issues/7183#issuecomment-1823715359 summary: Longhorn share manager count is off by {{$value}} for 5m. expr: count(sum by (namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_access_mode{access_mode="ReadWriteMany"}) * on (namespace, persistentvolumeclaim) group_right kube_persistentvolumeclaim_info{storageclass=~"longhorn.*"}) - sum(kube_pod_info{namespace="longhorn-system", pod=~"share-manager-.*"}) > 0 - for: 5m + for: 10m labels: issue: Longhorn share manager count is off by {{$value}} for 5m. severity: critical -- GitLab