diff --git a/infrastructure/base/cert-manager/alerts.yaml b/infrastructure/base/cert-manager/alerts.yaml index 946faddf7629dd4247e335f883bf5bbf15b477c2..9d89efdd1dfcd256407de715f5005b324b77a79b 100644 --- a/infrastructure/base/cert-manager/alerts.yaml +++ b/infrastructure/base/cert-manager/alerts.yaml @@ -12,7 +12,7 @@ spec: annotations: description: New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back. - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagerabsent + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagerabsent/ summary: Cert Manager has dissapeared from Prometheus service discovery. expr: absent(up{job="cert-manager"}) for: 10m @@ -25,7 +25,7 @@ spec: description: The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}. - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagercertexpirysoon + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagercertexpirysoon/ summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago. expr: | @@ -40,7 +40,7 @@ spec: description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead. - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagercertnotready + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagercertnotready/ summary: The cert `{{ $labels.name }}` is not ready to serve traffic. expr: | max by (name, exported_namespace, namespace, condition) ( @@ -53,7 +53,7 @@ spec: annotations: description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week. - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagerhittingratelimits + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/cert-manager/certmanagerhittingratelimits/ summary: Cert manager hitting LetsEncrypt rate limits. expr: | sum by (host) ( diff --git a/infrastructure/base/longhorn/monitoring.yaml b/infrastructure/base/longhorn/monitoring.yaml index 0bea7829b6766d95449966d6b44d322b75d64ef2..fe7f3388951218581faf4658e080563f700bc9b2 100644 --- a/infrastructure/base/longhorn/monitoring.yaml +++ b/infrastructure/base/longhorn/monitoring.yaml @@ -34,7 +34,7 @@ spec: labels: issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. severity: info - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/longhorn/longhornvolumeactualspaceusedinfo + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/longhorn/longhornvolumeactualspaceusedinfo/ - alert: LonghornVolumeStatusCritical annotations: description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for @@ -113,4 +113,4 @@ spec: labels: issue: Longhorn share manager count is off by {{$value}} for 5m. severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/longhorn/longhornsharemanageroff + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/longhorn/longhornsharemanageroff/ diff --git a/infrastructure/base/metallb/release.yaml b/infrastructure/base/metallb/release.yaml index 417fbcfabade74788a366d716113d6f796b499c2..b8f5f768251753396be340111c51afe8157790cc 100644 --- a/infrastructure/base/metallb/release.yaml +++ b/infrastructure/base/metallb/release.yaml @@ -46,32 +46,32 @@ data: addressPoolExhausted: labels: severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolexhausted + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolexhausted/ addressPoolUsage: thresholds: - percent: 75 labels: severity: info - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage/ - percent: 85 labels: severity: warning - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage/ - percent: 95 labels: severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbaddresspoolusage/ bgpSessionDown: labels: severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbbgpsessiondown + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbbgpsessiondown/ configNotLoaded: labels: severity: warning - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbconfignotloaded + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbconfignotloaded/ staleConfig: labels: severity: warning - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbstaleconfig + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/metallb/metallbstaleconfig/ serviceAccount: kube-prometheus-stack-prometheus namespace: monitoring-system diff --git a/infrastructure/base/postgres/prometheusrules.yaml b/infrastructure/base/postgres/prometheusrules.yaml index 880de665bfd834af2fa55ff6a5bd42e1d9a780b6..040ffdbdbf059a05bf9649ca3f95cc6edfea74ad 100644 --- a/infrastructure/base/postgres/prometheusrules.yaml +++ b/infrastructure/base/postgres/prometheusrules.yaml @@ -17,7 +17,7 @@ spec: labels: issue: The WAL size of the postgres cluster exceeded 1GiB for more than 1 hour. severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/postgres/postgreshighwalusage + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/postgres/postgreshighwalusage/ - alert: PostgresNotRunning annotations: description: PostgreSQL instance is not running inside the container @@ -27,4 +27,4 @@ spec: labels: issue: PostgreSQL instance is not running inside the container severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/postgres/postgresnotrunning \ No newline at end of file + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/postgres/postgresnotrunning/ \ No newline at end of file diff --git a/infrastructure/base/system-upgrades/monitoring.yaml b/infrastructure/base/system-upgrades/monitoring.yaml index 008e423a6a60b333fb555ed7860b19c1c40e24d8..f027f2b3ad724668123abf1cd70a0ea0d847685e 100644 --- a/infrastructure/base/system-upgrades/monitoring.yaml +++ b/infrastructure/base/system-upgrades/monitoring.yaml @@ -16,5 +16,5 @@ spec: labels: issue: The node {{$labels.node}} has been marked as unscheduable for more than 24h. severity: critical - runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/system-upgrades/kubenodeunschedulablecritical + runbook_url: https://runbooks.s3.shivering-isles.com/runbooks/system-upgrades/kubenodeunschedulablecritical/