diff --git a/infrastructure/longhorn/monitoring.yaml b/infrastructure/longhorn/monitoring.yaml
index bf5627dc8b5848ba9e38aafb17af588d585b70e9..96eed409716be2019b8dfa5b7395991179dae03b 100644
--- a/infrastructure/longhorn/monitoring.yaml
+++ b/infrastructure/longhorn/monitoring.yaml
@@ -103,4 +103,18 @@ spec:
       labels:
         issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
         severity: warning
-
+    # Fires when the number of ReadWriteMany PVCs on a longhorn* storage class
+    # exceeds the number of share-manager pods in longhorn-system for 5 minutes.
+    - alert: LonghornShareManagerOff
+      annotations:
+        description: Longhorn share manager count is off by {{$value}}. This is likely due to a recent bug in Longhorn. https://github.com/longhorn/longhorn/issues/7183#issuecomment-1823715359
+        summary: Longhorn share manager count is off by {{$value}} for 5m.
+      # "or vector(0)" keeps the subtraction defined when no share-manager pod
+      # exists at all; without it the expression is empty and never alerts.
+      expr: count(sum by (namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_access_mode{access_mode="ReadWriteMany"}) * on (namespace, persistentvolumeclaim) group_right kube_persistentvolumeclaim_info{storageclass=~"longhorn.*"}) - (sum(kube_pod_info{namespace="longhorn-system", pod=~"share-manager-.*"}) or vector(0)) > 0
+      for: 5m
+      labels:
+        # Labels stay static: a label that embeds the alert value changes the
+        # alert's identity whenever the value changes and restarts the "for" timer.
+        issue: Longhorn share manager count is off.
+        severity: critical