diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5f6e091add20075d356a1033164342dcfe064237..2ac75e690a3051cacc181b4bcb4d50b6959013b1 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "66226d27f1fca6096c420b46b097b9e2475189e6" + "version": "f6c5c4311b8c8ad699cfa718a6e1226780b8b3a5" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "4090e091fee875fd2baec1531a7ef9c2ab58c99b" + "version": "fee96cc51d22f196c982c6152cc8aee2585f65c0" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "d21784739a9adc7992c0382d1efa42be4ddb3044" + "version": "7be7f8e4e8da37cac104d2655ca22fdb8a93ebcd" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "8c4610783991b82ff12e485d24ac4f82d8839743" + "version": "e6fe81715dd802b4c9d9c64f2c44ba6ee56d2000" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "af85949b416547ed0989b396a28fe77f65978828" + "version": "1df1ddff4361ed7f2c0f33571923511889a115ce" } ] } \ No newline at end of file diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 551647b806567f14c66648e9e0b935dddbcb3db3..7bae26bb612b967a2c5c28fd7f43d086ba24b073 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1712,7 +1712,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max ((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (namespace, pod, device)\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", + "expr": "node:node_filesystem_usage:\n* on (namespace, pod) group_left (node) 
node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -2800,7 +2800,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -3029,7 +3029,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_rss) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3047,7 +3047,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3065,7 +3065,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3612,7 +3612,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\"}) by (pod_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3841,7 +3841,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": 
"sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3859,7 +3859,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3877,7 +3877,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4451,7 +4451,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4680,7 +4680,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, 
\"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4707,7 +4707,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4716,7 +4716,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5669,7 +5669,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max ((node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}\n- node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"})\n/ node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\",instance=\"$instance\"}) by (namespace, pod, device)\n", + "expr": "node:node_filesystem_usage:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 
121b974f2a7567b37cb9d22646a9dfe3cbbaa714..1809db6b106ab2aa5cbeb7a77d974d81881fa27d 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -11,26 +11,26 @@ spec: - name: k8s.rules rules: - expr: | - sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) record: namespace:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, pod_name, container_name) ( - rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m]) + rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) ) record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - expr: | - sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) + sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) record: namespace:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( - sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name) * on (namespace, pod_name) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) record: namespace_name:container_cpu_usage_seconds_total:sum_rate - expr: | sum by (namespace, label_name) ( - sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace) + sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace) * on (namespace, pod_name) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) @@ -232,6 +232,14 @@ spec: node_namespace_pod:kube_pod_info: ) record: node:node_disk_saturation:avg_irate + - 
expr: | + max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: | + max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' - expr: | sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) @@ -569,7 +577,7 @@ spec: message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | - sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 for: 1h labels: severity: critical