Skip to content
Snippets Groups Projects
Unverified Commit 09135ee9 authored by Jan-Otto Kröpke's avatar Jan-Otto Kröpke Committed by GitHub
Browse files

Enable Multi Cluster alerts by default (#2099)

parent 5ac666d8
No related branches found
No related tags found
No related merge requests found
...@@ -11,6 +11,7 @@ local defaults = { ...@@ -11,6 +11,7 @@ local defaults = {
mixin:: { mixin:: {
ruleLabels: {}, ruleLabels: {},
_config: { _config: {
showMultiCluster: true,
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"', kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
kubeStateMetricsSelector: 'job="kube-state-metrics"', kubeStateMetricsSelector: 'job="kube-state-metrics"',
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
summary: 'One or more targets are unreachable.', summary: 'One or more targets are unreachable.',
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.', description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
}, },
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10', expr: '100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10',
'for': '10m', 'for': '10m',
labels: { labels: {
severity: 'warning', severity: 'warning',
......
...@@ -38,6 +38,7 @@ local defaults = { ...@@ -38,6 +38,7 @@ local defaults = {
prometheus: defaults.name, prometheus: defaults.name,
}, },
_config: { _config: {
groupLabels: 'cluster,controller,namespace',
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"', prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s', runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
}, },
......
This diff is collapsed.
...@@ -83,6 +83,9 @@ spec: ...@@ -83,6 +83,9 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster - mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
name: grafana-dashboard-k8s-resources-cluster name: grafana-dashboard-k8s-resources-cluster
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-multicluster
name: grafana-dashboard-k8s-resources-multicluster
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace - mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace
name: grafana-dashboard-k8s-resources-namespace name: grafana-dashboard-k8s-resources-namespace
readOnly: false readOnly: false
...@@ -180,6 +183,9 @@ spec: ...@@ -180,6 +183,9 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-k8s-resources-cluster name: grafana-dashboard-k8s-resources-cluster
name: grafana-dashboard-k8s-resources-cluster name: grafana-dashboard-k8s-resources-cluster
- configMap:
name: grafana-dashboard-k8s-resources-multicluster
name: grafana-dashboard-k8s-resources-multicluster
- configMap: - configMap:
name: grafana-dashboard-k8s-resources-namespace name: grafana-dashboard-k8s-resources-namespace
name: grafana-dashboard-k8s-resources-namespace name: grafana-dashboard-k8s-resources-namespace
......
...@@ -18,7 +18,7 @@ spec: ...@@ -18,7 +18,7 @@ spec:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable. summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
......
...@@ -247,50 +247,50 @@ spec: ...@@ -247,50 +247,50 @@ spec:
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests. summary: Cluster has overcommitted CPU resource requests.
expr: | expr: |
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0 sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
and and
(sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0 (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
- alert: KubeMemoryOvercommit - alert: KubeMemoryOvercommit
annotations: annotations:
description: Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests. summary: Cluster has overcommitted memory resource requests.
expr: | expr: |
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0 sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0 (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
- alert: KubeCPUQuotaOvercommit - alert: KubeCPUQuotaOvercommit
annotations: annotations:
description: Cluster has overcommitted CPU resource requests for Namespaces. description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests. summary: Cluster has overcommitted CPU resource requests.
expr: | expr: |
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
/ /
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
> 1.5 > 1.5
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
- alert: KubeMemoryQuotaOvercommit - alert: KubeMemoryQuotaOvercommit
annotations: annotations:
description: Cluster has overcommitted memory resource requests for Namespaces. description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests. summary: Cluster has overcommitted memory resource requests.
expr: | expr: |
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
/ /
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
> 1.5 > 1.5
for: 5m for: 5m
labels: labels:
......
...@@ -20,7 +20,7 @@ spec: ...@@ -20,7 +20,7 @@ spec:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller. summary: Errors while performing list operations in controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
...@@ -30,7 +30,7 @@ spec: ...@@ -30,7 +30,7 @@ spec:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller. summary: Errors while performing watch operations in controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4 (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
...@@ -50,7 +50,7 @@ spec: ...@@ -50,7 +50,7 @@ spec:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller. summary: Errors while reconciling controller.
expr: | expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
...@@ -70,7 +70,7 @@ spec: ...@@ -70,7 +70,7 @@ spec:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready summary: Prometheus operator not ready
expr: | expr: |
min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment