From af5fb9ee09ecbc33a69b6aae19d85f710d571822 Mon Sep 17 00:00:00 2001 From: Sergiusz Urbaniak <sergiusz.urbaniak@gmail.com> Date: Thu, 23 Aug 2018 13:38:13 +0200 Subject: [PATCH] kubernetes-prometheus: regenerate Signed-off-by: Sergiusz Urbaniak <sergiusz.urbaniak@gmail.com> --- manifests/grafana-dashboardDefinitions.yaml | 32 ++++++++++----------- manifests/node-exporter-daemonset.yaml | 2 +- manifests/prometheus-rules.yaml | 21 ++++++++++---- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af2b2b0a..db2d0939 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1920,7 +1920,7 @@ items: "steppedLine": false, "targets": [ { - "expr": ":node_cpu_utilisation:avg1m", + "expr": "1 - avg(rate(node_cpu{mode=\"idle\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2172,7 +2172,7 @@ items: "steppedLine": false, "targets": [ { - "expr": ":node_memory_utilisation:", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers:sum) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2256,7 +2256,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(node_memory_MemTotal)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2340,7 +2340,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(node_memory_MemTotal)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2435,7 +2435,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total[1m])) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2664,7 +2664,7 @@ items: ], "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2682,7 +2682,7 @@ items: "step": 10 }, { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2700,7 +2700,7 @@ items: "step": 10 }, { - "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3247,7 +3247,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[1m])) by (pod_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3476,7 +3476,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3494,7 +3494,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3512,7 +3512,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\"}[5m]), \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4086,7 +4086,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4315,7 +4315,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4333,7 +4333,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4351,7 +4351,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index a2669187..b3febf8c 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -76,5 +76,5 @@ spec: path: /sys name: sys - hostPath: - path: /root + path: / name: root diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index bf07b9a4..121b974f 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -13,6 +13,11 @@ spec: - expr: | sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, pod_name, container_name) ( + rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m]) + ) + record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) record: namespace:container_memory_usage_bytes:sum @@ -151,6 +156,12 @@ spec: / sum(node_memory_MemTotal{job="node-exporter"}) record: ':node_memory_utilisation:' + - expr: | + sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers:sum + - expr: | + sum(node_memory_MemTotal{job="node-exporter"}) + record: :node_memory_MemTotal:sum - expr: | sum by (node) ( (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) @@ -796,10 +807,10 @@ spec: }}' is experiencing {{ printf "%0.0f" $value }}% errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | - sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 + (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) / - sum(rate(rest_client_requests_total[5m])) by (instance, job) - > 1 + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 for: 15m labels: severity: warning @@ -829,7 +840,7 @@ spec: for {{$labels.verb}} {{$labels.resource}}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m labels: severity: warning @@ -839,7 +850,7 @@ spec: for {{$labels.verb}} {{$labels.resource}}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | - cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m labels: severity: critical -- GitLab