From 6c1949a1a5bd7f4f87d9ce071c3e22ef41e0f602 Mon Sep 17 00:00:00 2001
From: Sheogorath <sheogorath@shivering-isles.com>
Date: Sun, 11 Feb 2024 12:45:05 +0100
Subject: [PATCH] chore(longhorn): Extend alert trigger duration

This patch extends the duration for the longhorn-share alert, which was
hotfixed with the hotfix-7183 cronjob. Since the cronjob runs every 5
minutes, this patch extends the alert to 10 minutes to catch possible
hotfix failures.
---
 infrastructure/longhorn/monitoring.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

Note: the alert's summary annotation and issue label are updated together
with `for:` so the emitted text matches the new 10m trigger duration.
NOTE(review): the post-image blob id on the index line below was not
regenerated after this amendment; re-run `git format-patch` if it matters.

diff --git a/infrastructure/longhorn/monitoring.yaml b/infrastructure/longhorn/monitoring.yaml
index 96eed4097..081841c00 100644
--- a/infrastructure/longhorn/monitoring.yaml
+++ b/infrastructure/longhorn/monitoring.yaml
@@ -108,7 +108,7 @@ spec:
         description: Longhorn share manager count is off by {{$value}}. This is likely due to a recent bug in Longhorn. https://github.com/longhorn/longhorn/issues/7183#issuecomment-1823715359
-        summary: Longhorn share manager count is off by {{$value}} for 5m.
+        summary: Longhorn share manager count is off by {{$value}} for 10m.
       expr: count(sum by (namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_access_mode{access_mode="ReadWriteMany"}) * on (namespace, persistentvolumeclaim) group_right kube_persistentvolumeclaim_info{storageclass=~"longhorn.*"}) - sum(kube_pod_info{namespace="longhorn-system", pod=~"share-manager-.*"}) > 0
-      for: 5m
+      for: 10m
       labels:
-        issue: Longhorn share manager count is off by {{$value}} for 5m.
+        issue: Longhorn share manager count is off by {{$value}} for 10m.
         severity: critical
-- 
GitLab