diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json
index 80bacaa79f615b6348391892b7b9dc534560bda0..6c7d433c8212a0211a32785d1f056e8b35306296 100644
--- a/jsonnetfile.lock.json
+++ b/jsonnetfile.lock.json
@@ -48,8 +48,8 @@
           "subdir": ""
         }
       },
-      "version": "8524aa43d49914b170b84816fc182319da04a167",
-      "sum": "J06UiBvcfpRzLM5VbLRAhP39Zaz+EKguJ5sSTBDeygs="
+      "version": "e0dc3563dcbf2e54e0ffe8e83f3f51b237ef33be",
+      "sum": "egi2xHFco6VkCxettVvAju/yrsGnB3AFoPpCGKfWhtU="
     },
     {
       "source": {
diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index ecab2f6a4c589b099547e2f939962c3cc7bd3ad3..e0a6ab561bca1079a874bf2af698fa5316f3d249 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -5602,7 +5602,7 @@ items:
                     "steppedLine": false,
                     "targets": [
                         {
-                            "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
+                            "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
                             "format": "time_series",
                             "instant": true,
                             "intervalFactor": 2,
@@ -5854,7 +5854,7 @@ items:
                     "steppedLine": false,
                     "targets": [
                         {
-                            "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
+                            "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
                             "format": "time_series",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6258,7 +6258,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+                            "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6267,7 +6267,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+                            "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6276,7 +6276,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+                            "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6676,7 +6676,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+                            "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6685,7 +6685,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+                            "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6694,7 +6694,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+                            "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
@@ -6703,7 +6703,7 @@ items:
                             "step": 10
                         },
                         {
-                            "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+                            "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
                             "format": "table",
                             "instant": true,
                             "intervalFactor": 2,
diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml
index 203e2c345f8ecb4e485834b7172643664ca73fc1..208116ff702f8e6f1dfd9f5e97f0617b65204a00 100644
--- a/manifests/kubernetes-prometheusRule.yaml
+++ b/manifests/kubernetes-prometheusRule.yaml
@@ -19,6 +19,8 @@ spec:
         summary: Pod is crash looping.
       expr: |
         increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0
+        and
+        sum without (phase) (kube_pod_status_phase{phase!="Running",job="kube-state-metrics"} == 1)
       for: 15m
       labels:
         severity: warning
@@ -214,19 +216,19 @@ spec:
         runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch
         summary: HPA has not matched descired number of replicas.
       expr: |
-        (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
+        (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"}
           !=
-        kube_hpa_status_current_replicas{job="kube-state-metrics"})
+        kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"})
           and
-        (kube_hpa_status_current_replicas{job="kube-state-metrics"}
+        (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
           >
-        kube_hpa_spec_min_replicas{job="kube-state-metrics"})
+        kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"})
           and
-        (kube_hpa_status_current_replicas{job="kube-state-metrics"}
+        (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
           <
-        kube_hpa_spec_max_replicas{job="kube-state-metrics"})
+        kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"})
           and
-        changes(kube_hpa_status_current_replicas{job="kube-state-metrics"}[15m]) == 0
+        changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0
       for: 15m
       labels:
         severity: warning
@@ -236,9 +238,9 @@ spec:
         runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout
         summary: HPA is running at max replicas
       expr: |
-        kube_hpa_status_current_replicas{job="kube-state-metrics"}
+        kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
           ==
-        kube_hpa_spec_max_replicas{job="kube-state-metrics"}
+        kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}
       for: 15m
       labels:
         severity: warning
@@ -1250,6 +1252,28 @@ spec:
            )
         )
       record: namespace_cpu:kube_pod_container_resource_requests:sum
+    - expr: |
+        sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+        )
+      record: namespace_memory:kube_pod_container_resource_limits:sum
+    - expr: |
+        sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+              max by (namespace, pod, container, cluster) (
+                kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
+              ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) (
+                kube_pod_status_phase{phase=~"Pending|Running"} == 1
+              )
+            )
+        )
+      record: namespace_cpu:kube_pod_container_resource_limits:sum
     - expr: |
         max by (cluster, namespace, workload, pod) (
           label_replace(
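
Note on the recording-rule switch: the dashboard panels above now read namespace_cpu:kube_pod_container_resource_limits:sum and namespace_memory:kube_pod_container_resource_limits:sum instead of summing kube_pod_container_resource_limits directly. The PromQL below is an illustrative spot-check only, not part of the patch; it assumes the new rules are loaded and that kube-state-metrics is scraped under the job label used throughout these manifests. Small differences between the two sides are expected, since the recording rule only counts Pending and Running pods and de-duplicates series per container with max.

# Illustrative comparison: recorded memory limits vs. the raw metric they replace.
# A per-namespace ratio near 1 means the updated dashboards show roughly the
# same values as before; lower values indicate limits held by terminated pods.
  sum by (namespace) (namespace_memory:kube_pod_container_resource_limits:sum)
/
  sum by (namespace) (kube_pod_container_resource_limits{resource="memory", job="kube-state-metrics"})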