diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 2346ecfa249c0cea2e6090737db68f6aa6248752..d1b9caf6def34001693b277676228f17818f82a1 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -1,24 +1,6 @@ { prometheusAlerts+:: { groups+: [ - { - name: 'node-time', - rules: [ - { - alert: 'ClockSkewDetected', - annotations: { - message: 'Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host.', - }, - expr: ||| - abs(node_timex_offset_seconds{%(nodeExporterSelector)s}) > 0.05 - ||| % $._config, - 'for': '2m', - labels: { - severity: 'warning', - }, - }, - ], - }, { name: 'node-network', rules: [ diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 4939273854c14502ef1bc1c6b0d21c4b4f94c9d4..b73a78d360a8ffb20ec90600e9d44f4c5c2e4655 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana" } }, - "version": "539a90dbf63c812ad0194d8078dd776868a11c81", - "sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE=" + "version": "57b4365eacda291b82e0d55ba7eec573a8198dda", + "sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "e5c90ebf90cb3692c26240d19406de47414a2b38", + "version": "0eee733220fc766ff0d193d61d9124aa06493986", "sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0=" }, { @@ -38,8 +38,8 @@ "subdir": "grafonnet" } }, - "version": "c459106d2d2b583dd3a83f6c75eb52abee3af764", - "sum": "CeM3LRgUCUJTolTdMnerfMPGYmhClx7gX5ajrQVEY2Y=" + "version": "815b848ade47c2f4ee866fe5efc435acd9ad799c", + "sum": "J3Vp0EVbxTObr6KydLXsi4Rc0ssNVAEuwLc0NQ+4wqU=" }, { "source": { @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "7ac7da1a0fe165b68cdb718b2521b560d51bd1f4", + "version": "d3c9f46e8f1ab665db6b31446fbe23e399c9f529", "sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE=" }, { @@ -69,8 +69,8 @@ "subdir": "" } }, - "version": "b2d7f762bd22be3ba5e7d54a1fcecfe1092f214b", - "sum": "NqrJQnQnRDzkCbrHg7L1zX8XPAzfoE4DS2XBEj6WC8g=" + "version": "bf3064885199f90080bec6790f2d27c5ad08184d", + "sum": "MFXrg/dNmfAHIm+H8bEGdZ957E1Y6B6aS42iFQCE1O0=" }, { "source": { @@ -79,7 +79,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "b2d7f762bd22be3ba5e7d54a1fcecfe1092f214b", + "version": "bf3064885199f90080bec6790f2d27c5ad08184d", "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" }, { @@ -89,7 +89,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "89ede10b19d7ef0145777717351cabe14b113c01", + "version": "ab094dffe1e5c6d59663c8a2de056cba62f6cd2c", "sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA=" }, { @@ -99,7 +99,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "89ede10b19d7ef0145777717351cabe14b113c01", + "version": "ab094dffe1e5c6d59663c8a2de056cba62f6cd2c", "sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU=" }, { @@ -109,8 +109,8 @@ "subdir": "slo-libsonnet" } }, - "version": "437c402c5f3ad86c3c16db8471f1649284fef0ee", - "sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q=" + "version": "5ddd7ffc39e7a54c9aca997c2c389a8046fab0ff", + "sum": "S7/+tnAkzVh8Li7sg7Hu4aeIQAWHCtxhRQ+k1OKjoQk=" }, { "source": { @@ -119,8 +119,8 @@ "subdir": "docs/node-mixin" } }, - "version": "0107bc794204f50d887898da60032da890637471", - "sum": "VKdF0zPMSCiuIuXWblSz2VOeBaXzQ7fp40vz9sxj+Bo=" + "version": "7f5a0ea5f633594e2ab4de52c5779a5a5a40f09f", + "sum": "P2H+7fx8/JsMEvB6cMxtxYomRwxB13M4a8VuwNSlM/E=" }, { "source": { @@ -129,8 +129,8 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "1c321ed047ac57e34688e40a55349c9dfe2b72c8", - "sum": "u1YS9CVuBTcw2vks0PZbLb1gtlI/7bVGDVBZsjWFLTw=", + "version": "fac7a4a0504404fa5d4c5abb8fcc9750bd5cbda7", + "sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc=", "name": "prometheus" }, { diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e3b229563ff056bfe8aea5880b2e05218226cc71..02b99502ab9de9053e8b2b20faa5c41cafb96811 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -23,7 +23,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -2832,7 +2832,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of TCP Retransimts out of all sent segments", + "title": "Rate of TCP Retransmits out of all sent segments", "tooltip": { "shared": true, "sort": 2, @@ -2935,7 +2935,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of TCP SYN Retransimts out of all retransmits", + "title": "Rate of TCP SYN Retransmits out of all retransmits", "tooltip": { "shared": true, "sort": 2, @@ -2980,7 +2980,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -3150,7 +3150,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -4305,7 +4305,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__interval]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5807,7 +5807,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5816,7 +5816,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5825,7 +5825,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5834,7 +5834,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5843,7 +5843,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5852,7 +5852,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5952,7 +5952,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6050,7 +6050,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6148,7 +6148,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6246,7 +6246,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6344,7 +6344,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6442,7 +6442,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6540,7 +6540,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6638,7 +6638,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6747,37 +6747,29 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "5m", - "value": "5m" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, "multi": false, - "name": "interval", + "name": "cluster", "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } + ], - "query": "4h", + "query": "label_values(node_cpu_seconds_total, cluster)", "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] @@ -6840,7 +6832,7 @@ items: "rows": [ { "collapse": false, - "height": "250px", + "height": "100px", "panels": [ { "aliasColors": { @@ -6850,7 +6842,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, + "fill": 1, + "format": "percentunit", "id": 1, "legend": { "avg": false, @@ -6862,7 +6855,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -6875,31 +6868,28 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", "format": "time_series", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], - "thresholds": [ - - ], + "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "CPU Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "type": "singlestat", "xaxis": { "buckets": null, "mode": "time", @@ -6927,19 +6917,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6949,6 +6927,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "format": "percentunit", "id": 2, "legend": { "avg": false, @@ -6973,200 +6952,28 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 3, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "format": "time_series", "instant": true, "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 + "refId": "A" } ], - "thresholds": [ - - ], + "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "CPU Quota", + "title": "CPU Utilisation (from limits)", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "transform": "table", - "type": "table", + "type": "singlestat", "xaxis": { "buckets": null, "mode": "time", @@ -7194,19 +7001,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -7215,7 +7010,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, + "fill": 1, + "format": "percentunit", "id": 3, "legend": { "avg": false, @@ -7227,7 +7023,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -7240,31 +7036,28 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", "format": "time_series", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "refId": "A" } ], - "thresholds": [ - - ], + "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Usage (w/o cache)", + "title": "Memory Utilization (from requests)", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "type": "singlestat", "xaxis": { "buckets": null, "mode": "time", @@ -7276,7 +7069,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7292,19 +7085,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -7314,6 +7095,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "format": "percentunit", "id": 4, "legend": { "avg": false, @@ -7336,6 +7118,234 @@ items: "renderer": "flot", "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation (from limits)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], "spaceLength": 10, "span": 12, @@ -7349,7 +7359,7 @@ items: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "CPU Usage", "colorMode": null, "colors": [ @@ -7364,10 +7374,10 @@ items: ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Requests", + "alias": "CPU Requests", "colorMode": null, "colors": [ @@ -7382,10 +7392,10 @@ items: ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Requests %", + "alias": "CPU Requests %", "colorMode": null, "colors": [ @@ -7403,7 +7413,7 @@ items: "unit": "percentunit" }, { - "alias": "Memory Limits", + "alias": "CPU Limits", "colorMode": null, "colors": [ @@ -7418,10 +7428,10 @@ items: ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Limits %", + "alias": "CPU Limits %", "colorMode": null, "colors": [ @@ -7438,60 +7448,6 @@ items: "type": "number", "unit": "percentunit" }, - { - "alias": "Memory Usage (RSS)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Cache)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Swap)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #H", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, { "alias": "Pod", "colorMode": null, @@ -7528,7 +7484,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7537,7 +7493,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7546,7 +7502,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7555,7 +7511,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7564,39 +7520,146 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "E", "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", - "format": "table", - "instant": true, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\"}) by (pod)", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "F", + "legendFormat": "{{pod}}", + "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", - "format": "table", - "instant": true, + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "G", + "legendFormat": "quota - requests", + "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", - "format": "table", - "instant": true, + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "H", + "legendFormat": "quota - limits", + "legendLink": null, "step": 10 } ], @@ -7605,14 +7668,13 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Quota", + "title": "Memory Usage (w/o cache)", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "transform": "table", - "type": "table", + "type": "graph", "xaxis": { "buckets": null, "mode": "time", @@ -7624,7 +7686,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -7646,7 +7708,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Memory Quota", + "title": "Memory Usage", "titleSize": "h6" }, { @@ -7662,7 +7724,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 8, "legend": { "avg": false, "current": false, @@ -7697,7 +7759,7 @@ items: "type": "hidden" }, { - "alias": "Current Receive Bandwidth", + "alias": "Memory Usage", "colorMode": null, "colors": [ @@ -7712,10 +7774,10 @@ items: ], "type": "number", - "unit": "Bps" + "unit": "bytes" }, { - "alias": "Current Transmit Bandwidth", + "alias": "Memory Requests", "colorMode": null, "colors": [ @@ -7730,10 +7792,10 @@ items: ], "type": "number", - "unit": "Bps" + "unit": "bytes" }, { - "alias": "Rate of Received Packets", + "alias": "Memory Requests %", "colorMode": null, "colors": [ @@ -7748,10 +7810,10 @@ items: ], "type": "number", - "unit": "pps" + "unit": "percentunit" }, { - "alias": "Rate of Transmitted Packets", + "alias": "Memory Limits", "colorMode": null, "colors": [ @@ -7766,10 +7828,10 @@ items: ], "type": "number", - "unit": "pps" + "unit": "bytes" }, { - "alias": "Rate of Received Packets Dropped", + "alias": "Memory Limits %", "colorMode": null, "colors": [ @@ -7784,10 +7846,10 @@ items: ], "type": "number", - "unit": "pps" + "unit": "percentunit" }, { - "alias": "Rate of Transmitted Packets Dropped", + "alias": "Memory Usage (RSS)", "colorMode": null, "colors": [ @@ -7802,7 +7864,43 @@ items: ], "type": "number", - "unit": "pps" + "unit": "bytes" + }, + { + "alias": "Memory Usage (Cache)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Usage (Swap)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" }, { "alias": "Pod", @@ -7813,7 +7911,7 @@ items: "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": true, - "linkTooltip": "Drill down to pods", + "linkTooltip": "Drill down", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -7840,7 +7938,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7849,7 +7947,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7858,7 +7956,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7867,7 +7965,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7876,7 +7974,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7885,13 +7983,31 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "F", "step": 10 + }, + { + "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 + }, + { + "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "H", + "step": 10 } ], "thresholds": [ @@ -7899,7 +8015,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Current Network Usage", + "title": "Memory Quota", "tooltip": { "shared": false, "sort": 0, @@ -7940,7 +8056,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Memory Quota", "titleSize": "h6" }, { @@ -7955,8 +8071,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 6, + "fill": 1, + "id": 9, "legend": { "avg": false, "current": false, @@ -7967,7 +8083,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -7981,11 +8097,305 @@ items: ], "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Current Receive Bandwidth", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Current Transmit Bandwidth", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Rate of Received Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Received Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Network Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Network", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8054,7 +8464,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 11, "legend": { "avg": false, "current": false, @@ -8083,7 +8493,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8152,7 +8562,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 12, "legend": { "avg": false, "current": false, @@ -8181,7 +8591,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8250,7 +8660,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 13, "legend": { "avg": false, "current": false, @@ -8279,7 +8689,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8348,7 +8758,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 14, "legend": { "avg": false, "current": false, @@ -8377,7 +8787,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8446,7 +8856,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 11, + "id": 15, "legend": { "avg": false, "current": false, @@ -8475,7 +8885,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8558,13 +8968,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, "multi": false, "name": "cluster", "options": [ @@ -8573,7 +8983,7 @@ items: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -8585,13 +8995,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "namespace", + "label": null, "multi": false, "name": "namespace", "options": [ @@ -8600,48 +9010,13 @@ items: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "interval", - "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } - ], - "query": "4h", - "refresh": 2, - "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] @@ -9540,13 +9915,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, "multi": false, "name": "cluster", "options": [ @@ -9555,7 +9930,7 @@ items: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -9567,13 +9942,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "node", + "label": null, "multi": false, "name": "node", "options": [ @@ -9582,7 +9957,7 @@ items: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -9683,7 +10058,24 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -9697,6 +10089,22 @@ items: "legendFormat": "{{container}}", "legendLink": null, "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -9759,8 +10167,113 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "id": 2, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.25, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Throttling", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Throttling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, "legend": { "avg": false, "current": false, @@ -10027,7 +10540,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -10048,7 +10561,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -10056,26 +10588,26 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (RSS)", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Cache)", + "legendFormat": "requests", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Swap)", + "legendFormat": "limits", "legendLink": null, "step": 10 } @@ -10141,7 +10673,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -10489,7 +11021,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -10518,7 +11050,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -10587,7 +11119,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -10616,7 +11148,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -10685,7 +11217,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -10714,7 +11246,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -10783,7 +11315,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -10812,7 +11344,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -10881,7 +11413,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -10910,7 +11442,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -10979,7 +11511,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 11, "legend": { "avg": false, "current": false, @@ -11008,7 +11540,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11091,13 +11623,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, "multi": false, "name": "cluster", "options": [ @@ -11106,7 +11638,7 @@ items: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -11118,13 +11650,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "namespace", + "label": null, "multi": false, "name": "namespace", "options": [ @@ -11133,7 +11665,7 @@ items: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -11145,63 +11677,28 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "pod", + "label": null, "multi": false, "name": "pod", "options": [ ], "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "interval", - "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } - ], - "query": "4h", "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] @@ -12183,7 +12680,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12192,7 +12689,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12201,7 +12698,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12210,7 +12707,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12219,7 +12716,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12228,7 +12725,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12328,7 +12825,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12426,7 +12923,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12524,7 +13021,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12622,7 +13119,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12720,7 +13217,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12818,7 +13315,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12916,7 +13413,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13014,7 +13511,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13097,13 +13594,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, "multi": false, "name": "cluster", "options": [ @@ -13112,7 +13609,7 @@ items: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -13124,13 +13621,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "namespace", + "label": null, "multi": false, "name": "namespace", "options": [ @@ -13139,7 +13636,7 @@ items: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -13151,13 +13648,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "workload", + "label": null, "multi": false, "name": "workload", "options": [ @@ -13166,7 +13663,7 @@ items: "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -13178,13 +13675,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "type", + "label": null, "multi": false, "name": "type", "options": [ @@ -13193,48 +13690,13 @@ items: "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", "refresh": 1, "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "interval", - "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } - ], - "query": "4h", - "refresh": 2, - "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] @@ -13329,7 +13791,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -13343,6 +13824,22 @@ items: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -13739,7 +14236,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -13753,6 +14269,22 @@ items: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -14324,7 +14856,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14333,7 +14865,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14342,7 +14874,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14351,7 +14883,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14360,7 +14892,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14369,7 +14901,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14469,7 +15001,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -14567,7 +15099,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -14665,7 +15197,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -14763,7 +15295,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -14861,7 +15393,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -14959,7 +15491,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15057,7 +15589,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15155,7 +15687,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15237,50 +15769,28 @@ items: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" + "text": "deployment", + "value": "deployment" }, "datasource": "$datasource", + "definition": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "hide": 0, "includeAll": false, - "label": "namespace", + "label": null, "multi": false, - "name": "namespace", + "name": "type", "options": [ ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "query": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "refresh": 1, "regex": "", - "sort": 2, + "skipUrlSync": false, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -15291,63 +15801,50 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "5m", - "value": "5m" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 2, "includeAll": false, "label": null, "multi": false, - "name": "interval", + "name": "cluster", "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } + ], - "query": "4h", - "refresh": 2, + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "deployment", - "value": "deployment" + "text": "", + "value": "" }, "datasource": "$datasource", - "definition": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "type", + "name": "namespace", "options": [ ], - "query": "label_values(mixin_pod_workload{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", - "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -15419,7 +15916,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -19105,7 +19602,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -20744,7 +21241,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -23872,7 +24369,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -24857,770 +25354,91 @@ items: "hideZero": true, "max": false, "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Transmit Bandwidth", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 8, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 21 - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Received Packets", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 21 - }, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Transmitted Packets", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Packets", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 11, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 32 - }, - "id": 12, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Received Packets Dropped", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 32 - }, - "id": 13, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Errors", - "titleSize": "h6", - "type": "row" - } - ], - "refresh": "30s", - "rows": [ - - ], - "schemaVersion": 18, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": ".+", - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "kube-system", - "value": "kube-system" - }, - "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "resolution", - "options": [ - { - "selected": false, - "text": "30s", - "value": "30s" - }, - { - "selected": true, - "text": "5m", - "value": "5m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - } - ], - "query": "30s,5m,1h", - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "interval", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "interval", - "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } - ], - "query": "4h", - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ - ], - "tagsQuery": "", - "type": "interval", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Networking / Pod", - "uid": "7a18067ce943a40ae25454675c19ff5c", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-pod-total - namespace: monitoring -- apiVersion: v1 - data: - pods.json: |- - { - "__inputs": [ + ], + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "__requires": [ + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "$datasource", - "enable": true, - "expr": "time() == BOOL timestamp(rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[2m]) > 0)", - "hide": false, - "iconColor": "rgba(215, 44, 44, 1)", - "name": "Restarts", - "showIn": 0, - "tags": [ - "restart" - ], - "type": "rows" - } - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "refresh": "", - "rows": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, { - "collapse": false, - "collapsed": false, + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 8, "panels": [ { "aliasColors": { @@ -25630,28 +25448,35 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "gridPos": { - + "h": 10, + "w": 12, + "x": 0, + "y": 21 }, - "id": 2, + "id": 9, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -25662,36 +25487,16 @@ items: ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by(container) (container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current: {{ container }}", - "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" - }, - { - "expr": "sum by(container) (container_memory_cache{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\", container=~\"$container\", container!=\"POD\"})", + "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Cache: {{ container }}", - "refId": "D" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -25699,10 +25504,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Rate of Received Packets", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -25717,7 +25522,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25725,7 +25530,7 @@ items: "show": true }, { - "format": "bytes", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25733,20 +25538,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -25755,28 +25547,35 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "gridPos": { - + "h": 10, + "w": 12, + "x": 12, + "y": 21 }, - "id": 3, + "id": 10, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -25787,29 +25586,16 @@ items: ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (container) (irate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\", pod=\"$pod\", container=~\"$container\", container!=\"POD\"}[4m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Current: {{ container }}", - "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -25817,10 +25603,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "Rate of Transmitted Packets", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -25835,7 +25621,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25843,7 +25629,7 @@ items: "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25856,14 +25642,21 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Packets", "titleSize": "h6", "type": "row" }, { - "collapse": false, - "collapsed": false, + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 11, "panels": [ { "aliasColors": { @@ -25873,28 +25666,35 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "gridPos": { - + "h": 10, + "w": 12, + "x": 0, + "y": 32 }, - "id": 4, + "id": 12, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -25905,22 +25705,16 @@ items: ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod) (irate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[4m])))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RX: {{ pod }}", - "refId": "A" - }, - { - "expr": "sort_desc(sum by (pod) (irate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[4m])))", + "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "TX: {{ pod }}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -25928,10 +25722,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Network I/O", + "title": "Rate of Received Packets Dropped", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -25946,7 +25740,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25954,7 +25748,7 @@ items: "show": true }, { - "format": "bytes", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -25962,20 +25756,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -25984,28 +25765,35 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "gridPos": { - + "h": 10, + "w": 12, + "x": 12, + "y": 32 }, - "id": 5, + "id": 13, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -26016,15 +25804,16 @@ items: ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "max by (container) (kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Restarts: {{ container }}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -26032,10 +25821,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Total Restarts Per Container", + "title": "Rate of Transmitted Packets Dropped", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -26050,7 +25839,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -26058,7 +25847,7 @@ items: "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -26071,13 +25860,17 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Errors", "titleSize": "h6", "type": "row" } ], - "schemaVersion": 14, + "refresh": "10s", + "rows": [ + + ], + "schemaVersion": 18, "style": "dark", "tags": [ "kubernetes-mixin" @@ -26101,22 +25894,28 @@ items: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - + "text": "kube-system", + "value": "kube-system" }, "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", + "definition": "label_values(container_network_receive_packets_total, namespace)", + "hide": 0, + "includeAll": true, + "label": null, "multi": false, - "name": "cluster", + "name": "namespace", "options": [ ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 2, + "query": "label_values(container_network_receive_packets_total, namespace)", + "refresh": 1, "regex": "", + "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ @@ -26127,22 +25926,28 @@ items: "useTags": false }, { - "allValue": null, + "allValue": ".+", + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - + "text": "", + "value": "" }, "datasource": "$datasource", + "definition": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", "hide": 0, "includeAll": false, - "label": "Namespace", + "label": null, "multi": false, - "name": "namespace", + "name": "pod", "options": [ ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 2, + "query": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", + "refresh": 1, "regex": "", + "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ @@ -26154,54 +25959,82 @@ items: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - + "text": "5m", + "value": "5m" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "Pod", + "label": null, "multi": false, - "name": "pod", + "name": "resolution", "options": [ - + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": true, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=~\"$namespace\"}, pod)", + "query": "30s,5m,1h", "refresh": 2, "regex": "", + "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "query", + "type": "interval", "useTags": false }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - + "text": "5m", + "value": "5m" }, "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": "Container", + "hide": 2, + "includeAll": false, + "label": null, "multi": false, - "name": "container", + "name": "interval", "options": [ - + { + "selected": true, + "text": "4h", + "value": "4h" + } ], - "query": "label_values(kube_pod_container_info{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}, container)", + "query": "4h", "refresh": 2, "regex": "", + "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "query", + "type": "interval", "useTags": false } ] @@ -26236,13 +26069,13 @@ items: ] }, "timezone": "", - "title": "Kubernetes / Pods", - "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", + "title": "Kubernetes / Networking / Pod", + "uid": "7a18067ce943a40ae25454675c19ff5c", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-pods + name: grafana-dashboard-pod-total namespace: monitoring - apiVersion: v1 data: @@ -29097,7 +28930,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -30292,7 +30125,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -33354,7 +33187,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml index d8b401a73eba03ebacc2bf2d0a6474800fe3fe37..fffec986bb45dcfc685e2cae8ea9a8be170ecc3b 100644 --- a/manifests/grafana-dashboardSources.yaml +++ b/manifests/grafana-dashboardSources.yaml @@ -5,7 +5,7 @@ data: "apiVersion": 1, "providers": [ { - "folder": "", + "folder": "Default", "name": "0", "options": { "path": "/grafana-dashboard-definitions/0" diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 5ccf92fc88b192309c1f8d662eafba50d6612ccb..a84164b135f44d5caaa256b3bef060d1eee6f3c7 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,8 @@ spec: app: grafana spec: containers: - - image: grafana/grafana:6.6.0 + - env: [] + image: grafana/grafana:6.6.0 name: grafana ports: - containerPort: 3000 @@ -93,9 +94,6 @@ spec: - mountPath: /grafana-dashboard-definitions/0/pod-total name: grafana-dashboard-pod-total readOnly: false - - mountPath: /grafana-dashboard-definitions/0/pods - name: grafana-dashboard-pods - readOnly: false - mountPath: /grafana-dashboard-definitions/0/prometheus-remote-write name: grafana-dashboard-prometheus-remote-write readOnly: false @@ -180,9 +178,6 @@ spec: - configMap: name: grafana-dashboard-pod-total name: grafana-dashboard-pod-total - - configMap: - name: grafana-dashboard-pods - name: grafana-dashboard-pods - configMap: name: grafana-dashboard-prometheus-remote-write name: grafana-dashboard-prometheus-remote-write diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index aababcb5991e0ad776f3f3615ed5b7f3433f4969..1161a8bdf8368cedc90adbd809f2352ff56db4e6 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -65,25 +65,139 @@ spec: rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) ) record: instance:node_network_transmit_drop_excluding_lo:rate1m + - name: kube-apiserver-error + rules: + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[5m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate5m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[30m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate30m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[2h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate2h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[6h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate6h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1d + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[3d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate3d + - expr: | + sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate5m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate5m + - expr: | + sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate30m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate30m + - expr: | + sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1h + - expr: | + sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate2h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate2h + - expr: | + sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate6h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate6h + - expr: | + sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1d + - expr: | + sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate3d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate3d - name: kube-apiserver.rules rules: - expr: | - sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod) + sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) / - sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod) + sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) record: cluster:apiserver_request_duration_seconds:mean5m - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)) + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.99" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)) + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.9" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)) + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: quantile: "0.5" record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile @@ -95,23 +209,33 @@ spec: - expr: | sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - expr: | container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_working_set_bytes - expr: | container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_rss - expr: | container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_cache - expr: | container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_swap - expr: | sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace) @@ -139,35 +263,39 @@ spec: ) record: namespace:kube_pod_container_resource_requests_cpu_cores:sum - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: deployment record: mixin_pod_workload - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: daemonset record: mixin_pod_workload - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: statefulset record: mixin_pod_workload @@ -224,7 +352,10 @@ spec: sum(min(kube_pod_info) by (cluster, node)) record: ':kube_pod_info_node_count:' - expr: | - max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + )) record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (cluster, node) (sum by (node, cpu) ( @@ -244,6 +375,23 @@ spec: ) ) by (cluster) record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - name: kube-prometheus-node-recording.rules rules: - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY @@ -457,6 +605,47 @@ spec: for: 1h labels: severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: Clock on {{ $labels.instance }} is out of sync by more than 300s. + Ensure NTP is configured correctly on this host. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < 0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP + is configured on this host. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status[5m]) == 0 + for: 10m + labels: + severity: warning - name: kubernetes-apps rules: - alert: KubePodCrashLooping @@ -498,9 +687,15 @@ spec: matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch expr: | - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) for: 15m labels: severity: critical @@ -510,9 +705,15 @@ spec: not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch expr: | - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) for: 15m labels: severity: critical @@ -656,7 +857,7 @@ spec: tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | - sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) + sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) / sum(kube_node_status_allocatable_cpu_cores) > @@ -670,7 +871,7 @@ spec: tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | - sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) + sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) / sum(kube_node_status_allocatable_memory_bytes) > @@ -799,10 +1000,12 @@ spec: for: 15m labels: severity: warning - - name: kube-apiserver-error + - name: kube-apiserver-error-alerts rules: - alert: ErrorBudgetBurn annotations: + message: 'High requests error budget burn for job=apiserver (current value: + {{ $value }})' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn expr: | ( @@ -821,6 +1024,8 @@ spec: severity: critical - alert: ErrorBudgetBurn annotations: + message: 'High requests error budget burn for job=apiserver (current value: + {{ $value }})' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn expr: | ( @@ -837,118 +1042,6 @@ spec: labels: job: apiserver severity: warning - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[5m] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate5m - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[30m] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate30m - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[1h] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate1h - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[2h] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate2h - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[6h] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate6h - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[1d] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate1d - - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[3d] - ), "status_class", "${1}xx", "code", "([0-9])..") - ) - labels: - job: apiserver - record: status_class:apiserver_request_total:rate3d - - expr: | - sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate5m{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate5m - - expr: | - sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate30m{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate30m - - expr: | - sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate1h{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate1h - - expr: | - sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate2h{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate2h - - expr: | - sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate6h{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate6h - - expr: | - sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate1d{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate1d - - expr: | - sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"}) - / - sum(status_class:apiserver_request_total:rate3d{job="apiserver"}) - labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate3d - name: kubernetes-system-apiserver rules: - alert: KubeAPILatencyHigh @@ -985,30 +1078,6 @@ spec: for: 10m labels: severity: critical - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{ $value | humanizePercentage - }} of requests. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh - expr: | - sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) - / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03 - for: 10m - labels: - severity: critical - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{ $value | humanizePercentage - }} of requests. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh - expr: | - sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) - / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01 - for: 10m - labels: - severity: warning - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value | humanizePercentage @@ -1053,6 +1122,27 @@ spec: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical + - alert: AggregatedAPIErrors + annotations: + message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has + reported errors. The number of errors have increased for it in the past + five minutes. High values indicate that the availability of the service + changes too often. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors + expr: | + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. + It has not been available at least for the past five minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown + expr: | + sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0 + for: 5m + labels: + severity: warning - alert: KubeAPIDown annotations: message: KubeAPI has disappeared from Prometheus target discovery. @@ -1088,7 +1178,37 @@ spec: }} of its Pod capacity. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | - max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95 + max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95 + for: 15m + labels: + severity: warning + - alert: KubeNodeReadinessFlapping + annotations: + message: The readiness status of node {{ $labels.node }} has changed {{ $value + }} times in the last 15 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping + expr: | + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration + of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds + on node {{ $labels.node }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60 for: 15m labels: severity: warning @@ -1253,7 +1373,8 @@ spec: - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send - {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. + {{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue + }}{{ else }}{{ $labels.url }}{{ end }}. summary: Prometheus fails to send samples to remote storage. expr: | ( @@ -1273,7 +1394,8 @@ spec: - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write - is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. + is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue + }}{{ else }}{{ $labels.url }}{{ end }}. summary: Prometheus remote write is behind. expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -1378,17 +1500,6 @@ spec: expr: vector(1) labels: severity: none - - name: node-time - rules: - - alert: ClockSkewDetected - annotations: - message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod - }}. Ensure NTP is configured correctly on this host. - expr: | - abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05 - for: 2m - labels: - severity: warning - name: node-network rules: - alert: NodeNetworkInterfaceFlapping