diff --git a/assets/kubernetes-autoscaling/alerts.yaml b/assets/kubernetes-autoscaling/alerts.yaml index 2b65212a0761182cd7f25b8ac3925e1309edfd3e..b2070605bec5fae1d06a6dab02a1347540e4c71a 100644 --- a/assets/kubernetes-autoscaling/alerts.yaml +++ b/assets/kubernetes-autoscaling/alerts.yaml @@ -14,7 +14,7 @@ groups: job=~"karpenter" }[5m] ) - ) by (namespace, job, provider, controller, method) > 0 + ) by (cluster, namespace, job, provider, controller, method) > 0 for: 5m labels: severity: warning @@ -30,13 +30,13 @@ groups: karpenter_nodeclaims_termination_duration_seconds_sum{ job=~"karpenter" } - ) by (namespace, job, nodepool) + ) by (cluster, namespace, job, nodepool) / sum( karpenter_nodeclaims_termination_duration_seconds_count{ job=~"karpenter" } - ) by (namespace, job, nodepool) > 1200 + ) by (cluster, namespace, job, nodepool) > 1200 for: 15m labels: severity: warning @@ -50,11 +50,11 @@ groups: expr: | sum ( karpenter_nodepools_usage{job=~"karpenter"} - ) by (namespace, job, nodepool, resource_type) + ) by (cluster, namespace, job, nodepool, resource_type) / sum ( karpenter_nodepools_limit{job=~"karpenter"} - ) by (namespace, job, nodepool, resource_type) + ) by (cluster, namespace, job, nodepool, resource_type) * 100 > 75 for: 15m labels: @@ -70,11 +70,11 @@ groups: expr: | sum ( cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) / sum ( cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) * 100 > 75 for: 15m labels: @@ -88,7 +88,7 @@ groups: expr: | sum ( cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) > 0 for: 15m labels: diff --git a/site/content/kubernetes-autoscaling/_index.md b/site/content/kubernetes-autoscaling/_index.md index 4a94d09f6f6ab24f3aaf165c78ae9eb72b320aea..c881284f4d7bd2e91d8967c511b9446315d02efc 100644 --- a/site/content/kubernetes-autoscaling/_index.md +++ b/site/content/kubernetes-autoscaling/_index.md @@ -34,7 +34,7 @@ expr: | job=~"karpenter" }[5m] ) - ) by (namespace, job, provider, controller, method) > 0 + ) by (cluster, namespace, job, provider, controller, method) > 0 for: 5m labels: severity: warning @@ -55,13 +55,13 @@ expr: | karpenter_nodeclaims_termination_duration_seconds_sum{ job=~"karpenter" } - ) by (namespace, job, nodepool) + ) by (cluster, namespace, job, nodepool) / sum( karpenter_nodeclaims_termination_duration_seconds_count{ job=~"karpenter" } - ) by (namespace, job, nodepool) > 1200 + ) by (cluster, namespace, job, nodepool) > 1200 for: 15m labels: severity: warning @@ -79,11 +79,11 @@ annotations: expr: | sum ( karpenter_nodepools_usage{job=~"karpenter"} - ) by (namespace, job, nodepool, resource_type) + ) by (cluster, namespace, job, nodepool, resource_type) / sum ( karpenter_nodepools_limit{job=~"karpenter"} - ) by (namespace, job, nodepool, resource_type) + ) by (cluster, namespace, job, nodepool, resource_type) * 100 > 75 for: 15m labels: @@ -104,11 +104,11 @@ annotations: expr: | sum ( cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) / sum ( cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) * 100 > 75 for: 15m labels: @@ -127,7 +127,7 @@ annotations: expr: | sum ( cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"} - ) by (namespace, job) + ) by (cluster, namespace, job) > 0 for: 15m labels: