diff --git a/README.md b/README.md index b612e0c431a83d1cb7609c94fc6c2d81c7fdf260..bc320b7922796890dfd17d9f4b1ab09bfbed2e5d 100644 --- a/README.md +++ b/README.md @@ -260,12 +260,12 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.16.0", + alertmanager: "v0.16.1", nodeExporter: "v0.17.0", kubeStateMetrics: "v1.5.0", kubeRbacProxy: "v0.4.1", - addonResizer: "2.1", - prometheusOperator: "v0.28.0", + addonResizer: "1.8.4", + prometheusOperator: "v0.29.0", prometheus: "v2.5.0", }, @@ -274,7 +274,7 @@ These are the available fields with their respective default values: alertmanager: "quay.io/prometheus/alertmanager", kubeStateMetrics: "quay.io/coreos/kube-state-metrics", kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - addonResizer: "gcr.io/google-containers/addon-resizer-amd64", + addonResizer: "k8s.gcr.io/addon-resizer", nodeExporter: "quay.io/prometheus/node-exporter", prometheusOperator: "quay.io/coreos/prometheus-operator", }, @@ -298,7 +298,7 @@ These are the available fields with their respective default values: receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' @@ -402,12 +402,12 @@ To produce the `docker pull/tag/push` commands that will synchronize upstream im ```shell $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization sync-to-internal-registry.jsonnet -docker pull gcr.io/google-containers/addon-resizer-amd64:2.1 -docker tag gcr.io/google-containers/addon-resizer-amd64:2.1 internal-registry.com/organization/addon-resizer:2.1 -docker push internal-registry.com/organization/addon-resizer:2.1 -docker pull quay.io/prometheus/alertmanager:v0.15.3 -docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3 -docker push internal-registry.com/organization/alertmanager:v0.15.3 +docker pull k8s.gcr.io/addon-resizer:1.8.4 +docker tag k8s.gcr.io/addon-resizer:1.8.4 internal-registry.com/organization/addon-resizer:1.8.4 +docker push internal-registry.com/organization/addon-resizer:1.8.4 +docker pull quay.io/prometheus/alertmanager:v0.16.1 +docker tag quay.io/prometheus/alertmanager:v0.16.1 internal-registry.com/organization/alertmanager:v0.16.1 +docker push internal-registry.com/organization/alertmanager:v0.16.1 ... ``` @@ -497,7 +497,7 @@ The Alertmanager configuration is located in the `_config.alertmanager.config` c receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' @@ -581,7 +581,7 @@ Should the Prometheus `/targets` page show kubelet targets, but not able to succ As described in the [Prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default. -If you are using Google's GKE product, see [docs/GKE-cadvisor-support.md]. +If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md). #### Authentication problem diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index dcd0ad72f4286c0a3512d1e4525a83f740662cc2..671a898706d77b46f57be05788ee3b3c9167ec38 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -49,13 +49,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { name: 'example-group', rules: [ { - alert: 'DeadMansSwitch', + alert: 'Watchdog', expr: 'vector(1)', labels: { severity: 'none', }, annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', }, }, ], @@ -139,7 +139,75 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` +### Changing default rules +Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/alerts](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus/jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). + +Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above. + +#### Filter +Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +```jsonnet +local filter = { + prometheusAlerts+:: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter(function(rule) + rule.alert != "KubeStatefulSetReplicasMismatch", + group.rules + ) + } + else + group, + super.groups + ), + }, +}; +``` +#### Adjustment +Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +```jsonnet +local update = { + prometheusAlerts+:: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == "KubeStatefulSetReplicasMismatch" then + rule { + expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}" + } + else + rule, + group.rules + ) + } + else + group, + super.groups + ), + }, +}; +``` +Using the example from above about adding in pre-rendered rules, the new local vaiables can be added in as follows: +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + { + prometheusAlerts+:: (import 'existingrule.json'), +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` ## Dashboards Dashboards can either be added using jsonnet or simply a pre-rendered json dashboard. diff --git a/examples/alertmanager-config.jsonnet b/examples/alertmanager-config.jsonnet index 162104d71470aba4cc45b9b21f9c793550a4580b..f08dbe19e333073d6553fda87bf5b4671787c0ac 100644 --- a/examples/alertmanager-config.jsonnet +++ b/examples/alertmanager-config.jsonnet @@ -12,7 +12,7 @@ receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' diff --git a/examples/alertmanager-config.yaml b/examples/alertmanager-config.yaml index 78c65b64723bf1ed42857fb13284ace813cc85e5..b341b55f169a71236547468fecabe42ce625551b 100644 --- a/examples/alertmanager-config.yaml +++ b/examples/alertmanager-config.yaml @@ -9,7 +9,7 @@ route: receiver: 'null' routes: - match: - alertname: DeadMansSwitch + alertname: Watchdog receiver: 'null' receivers: - name: 'null' diff --git a/examples/existingrule.json b/examples/existingrule.json index b29a5c454a17b4dbaac88de8404f6d6e51cda2f1..41d6620b87749b68f9aa222b9582a27ac0469068 100644 --- a/examples/existingrule.json +++ b/examples/existingrule.json @@ -1 +1 @@ -{"groups":[{"name":"example-group","rules":[{"alert":"DeadMansSwitch","annotations":{"description":"This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file +{"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file diff --git a/examples/existingrule.yaml b/examples/existingrule.yaml index 94d9d6915c8fbf1a659f09cc12d54134a2f4c184..6a67032f96accb758e0ab108c1c72be037b7efe4 100644 --- a/examples/existingrule.yaml +++ b/examples/existingrule.yaml @@ -1,9 +1,9 @@ groups: - name: example-group rules: - - alert: DeadMansSwitch + - alert: Watchdog expr: vector(1) labels: severity: "none" annotations: - description: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. + description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional. diff --git a/examples/prometheus-additional-alert-rule-example.jsonnet b/examples/prometheus-additional-alert-rule-example.jsonnet index b8d16af87afd81a4bdffaa2a774c203f8752573f..622df03285bb2676f0edf3ffb2236e1220e16cbd 100644 --- a/examples/prometheus-additional-alert-rule-example.jsonnet +++ b/examples/prometheus-additional-alert-rule-example.jsonnet @@ -8,13 +8,13 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { name: 'example-group', rules: [ { - alert: 'DeadMansSwitch', + alert: 'Watchdog', expr: 'vector(1)', labels: { severity: 'none', }, annotations: { - description: 'This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.', + description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', }, }, ], diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet index 02909525ec23cc362ee28aa9047df0ea279f0d84..47e61e292069612d0a2f4f4734ddde6dd4f99535 100644 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - alertmanager: 'v0.16.0', + alertmanager: 'v0.16.1', }, imageRepos+:: { @@ -28,7 +28,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { receiver: 'null', match: { - alertname: 'DeadMansSwitch', + alertname: 'Watchdog', }, }, ], diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet index b188faa22bf2c00dd65f04674a84db44f0dc9308..a2d0cc67af8a56218028193bcd8d037f74c5e83a 100644 --- a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -109,7 +109,7 @@ summary: 'Prometheus write-ahead log is corrupted', }, expr: ||| - tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 ||| % $._config, 'for': '4h', labels: { diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet index a9cf3bb32c74d6086be838f668fae2655904ed19..556fa85639a83f44f916f9ff6b74eaadb41d8fa2 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet @@ -15,7 +15,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + service.mixin.spec.withClusterIp('None'), kubeDnsPrometheusDiscoveryService: - service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + + service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) + service.mixin.metadata.withNamespace('kube-system') + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) + service.mixin.spec.withClusterIp('None'), diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet index c99bf38bdc1b6cba17f5d2f51077927efe6ed1b8..4b20b814a58ba415931c1a3a52437ad6902f9675 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus-thanos.libsonnet @@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType; { _config+:: { versions+:: { - thanos: 'v0.2.1', + thanos: 'v0.3.2', }, imageRepos+:: { thanos: 'improbable/thanos', diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index c30f13f9f4b328829350b616ebdcfdf4aade499d..f51ae49ac1aad5a2bb004c83ed6aff2e28bb9d3d 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -43,7 +43,7 @@ local configMapList = k.core.v1.configMapList; namespace: 'default', versions+:: { - grafana: '6.0.0-beta1', + grafana: '6.0.1', }, tlsCipherSuites: [ diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index 0f59af4d5d590231ab471016c0d17d3241fe4faf..5172fb9427c37fcd9af4ba4a2ecac848cfe85212 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -18,13 +18,13 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { kubeStateMetrics: 'v1.5.0', kubeRbacProxy: 'v0.4.1', - addonResizer: '2.1', + addonResizer: '1.8.4', }, imageRepos+:: { kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy', - addonResizer: 'gcr.io/google-containers/addon-resizer-amd64', + addonResizer: 'k8s.gcr.io/addon-resizer', }, }, @@ -175,7 +175,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, '--memory=' + $._config.kubeStateMetrics.baseMemory, '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, - '--acceptance-offset=5', + '--threshold=5', '--deployment=kube-state-metrics', ]) + container.withEnv([ diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 453ab79c6f37f4048be0b11b0608d5c113ec082d..ba2adb0594498228475fe0ac5bdfe9ce3d25b851 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; namespace: 'default', versions+:: { - prometheus: 'v2.5.0', + prometheus: 'v2.7.2', }, imageRepos+:: { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 5d10866c82d36f69b95a12e4e00b8c3899f5703d..ad38b5c937b5fc7143c341fda2334b7828d036e6 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "df002d09f7b7a50321786c4f19c70d371494410b" + "version": "9faab58c2b1cce4def2cc35045162554b8e4a706" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "ccb787a44f2ebdecbb346d57490fa7e49981b323" + "version": "b8b1a40066bd40bf7612bbb1cc9208f76530f44a" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "5d7e5391010c768a6ddd39163c35662f379e20ca" + "version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "a7e3bd06b2ef0286e1571836997287a81146c25a" + "version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1" } ] } diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 376c17ba18d10cb384793e66677a62a8d1b12793..93c52a859388fec43c976036339e152050e4581f 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -15,4 +15,4 @@ spec: runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.16.0 + version: v0.16.1 diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 79fc7a21c3b5b8680d70a51189724e848de8bbee..5ee9c606e1a40273144c81d3ccc3fada567a0499 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg== + alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg== kind: Secret metadata: name: alertmanager-main diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index d8bf4633a74b67ff5efd58974cd2e6720cc1fe86..cdb8ff3f81b7b66b3bf7f7b858e6470f778498a3 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -60,7 +60,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:cluster_cpu_utilisation:ratio", + "expr": "node:cluster_cpu_utilisation:ratio{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -75,7 +75,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -146,7 +146,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", + "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\"} / scalar(sum(min(kube_pod_info{cluster=\"$cluster\"}) by (node)))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -161,7 +161,7 @@ items: "timeShift": null, "title": "CPU Saturation (Load1)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -244,7 +244,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:cluster_memory_utilisation:ratio", + "expr": "node:cluster_memory_utilisation:ratio{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -259,7 +259,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -330,7 +330,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_swap_io_bytes:sum_rate", + "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -345,7 +345,7 @@ items: "timeShift": null, "title": "Memory Saturation (Swap I/O)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -428,7 +428,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", + "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -443,7 +443,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -514,7 +514,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", + "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -529,7 +529,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -612,7 +612,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_utilisation:sum_irate", + "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -627,7 +627,7 @@ items: "timeShift": null, "title": "Net Utilisation (Transmitted)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -698,7 +698,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_saturation:sum_irate", + "expr": "node:node_net_saturation:sum_irate{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -713,7 +713,7 @@ items: "timeShift": null, "title": "Net Saturation (Dropped)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -811,7 +811,7 @@ items: "timeShift": null, "title": "Disk Capacity", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -875,6 +875,33 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_node_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -976,7 +1003,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", + "expr": "node:node_cpu_utilisation:avg1m{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -991,7 +1018,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1062,7 +1089,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", + "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1077,7 +1104,7 @@ items: "timeShift": null, "title": "CPU Saturation (Load1)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1160,7 +1187,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_utilisation:{node=\"$node\"}", + "expr": "node:node_memory_utilisation:{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Memory", @@ -1175,7 +1202,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1246,7 +1273,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", + "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Swap IO", @@ -1261,7 +1288,7 @@ items: "timeShift": null, "title": "Memory Saturation (Swap I/O)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1344,7 +1371,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", + "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -1359,7 +1386,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1430,7 +1457,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", + "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1445,7 +1472,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1528,7 +1555,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", + "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -1543,7 +1570,7 @@ items: "timeShift": null, "title": "Net Utilisation (Transmitted)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1614,7 +1641,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", + "expr": "node:node_net_saturation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Saturation", @@ -1629,7 +1656,7 @@ items: "timeShift": null, "title": "Net Saturation (Dropped)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1712,7 +1739,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_filesystem_usage:\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", + "expr": "node:node_filesystem_usage:{cluster=\"$cluster\"}\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\", node=\"$node\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -1727,7 +1754,7 @@ items: "timeShift": null, "title": "Disk Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1792,6 +1819,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_node_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -1807,7 +1861,7 @@ items: "options": [ ], - "query": "label_values(kube_node_info, node)", + "query": "label_values(kube_node_info{cluster=\"$cluster\"}, node)", "refresh": 1, "regex": "", "sort": 2, @@ -1920,7 +1974,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -1932,7 +1986,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2004,7 +2058,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2016,7 +2070,7 @@ items: "timeShift": null, "title": "CPU Requests Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2088,7 +2142,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2100,7 +2154,7 @@ items: "timeShift": null, "title": "CPU Limits Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2172,7 +2226,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2184,7 +2238,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2256,7 +2310,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2268,7 +2322,7 @@ items: "timeShift": null, "title": "Memory Requests Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2340,7 +2394,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2352,7 +2406,7 @@ items: "timeShift": null, "title": "Memory Limits Commitment", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2435,7 +2489,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2450,7 +2504,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2638,7 +2692,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -2664,7 +2718,7 @@ items: ], "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2673,7 +2727,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2682,7 +2736,7 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2691,7 +2745,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2700,7 +2754,7 @@ items: "step": 10 }, { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2716,7 +2770,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -2800,7 +2854,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2815,7 +2869,7 @@ items: "timeShift": null, "title": "Memory Usage (w/o cache)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3003,7 +3057,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3029,7 +3083,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3038,7 +3092,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3047,7 +3101,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3056,7 +3110,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3065,7 +3119,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3081,7 +3135,7 @@ items: "timeShift": null, "title": "Requests by Namespace", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3146,6 +3200,33 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_cpu_seconds_total, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -3247,7 +3328,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}) by (pod_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3262,7 +3343,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3450,7 +3531,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3476,7 +3557,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3485,7 +3566,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3494,7 +3575,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3503,7 +3584,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3512,7 +3593,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3528,7 +3609,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3612,7 +3693,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", + "expr": "sum(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod_name}}", @@ -3627,7 +3708,7 @@ items: "timeShift": null, "title": "Memory Usage (w/o cache)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -3869,7 +3950,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3895,7 +3976,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3904,7 +3985,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3913,7 +3994,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3922,7 +4003,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3931,7 +4012,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3940,7 +4021,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3949,7 +4030,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3958,7 +4039,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -3974,7 +4055,7 @@ items: "timeShift": null, "title": "Memory Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4040,6 +4121,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -4055,7 +4163,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", "sort": 2, @@ -4167,7 +4275,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", cluster=\"$cluster\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4182,7 +4290,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4396,7 +4504,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4405,7 +4513,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4414,7 +4522,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4423,7 +4531,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4432,7 +4540,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4448,7 +4556,7 @@ items: "timeShift": null, "title": "CPU Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4532,7 +4640,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (RSS)", @@ -4540,7 +4648,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (Cache)", @@ -4548,7 +4656,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}} (Swap)", @@ -4563,7 +4671,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4831,7 +4939,7 @@ items: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4840,7 +4948,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4849,7 +4957,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4858,7 +4966,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4867,7 +4975,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4876,7 +4984,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_rss{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4885,7 +4993,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_cache{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4894,7 +5002,7 @@ items: "step": 10 }, { - "expr": "sum(label_replace(container_memory_swap{namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4910,7 +5018,7 @@ items: "timeShift": null, "title": "Memory Quota", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -4976,6 +5084,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -4991,7 +5126,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 1, "regex": "", "sort": 2, @@ -5018,7 +5153,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", "refresh": 1, "regex": "", "sort": 2, @@ -5143,21 +5278,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load1{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 1m", "refId": "A" }, { - "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load5{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 5m", "refId": "B" }, { - "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_load15{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "load 15m", @@ -5171,7 +5306,7 @@ items: "timeShift": null, "title": "System load", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5248,7 +5383,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", + "expr": "sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cpu}}", @@ -5262,7 +5397,7 @@ items: "timeShift": null, "title": "Usage Per Core", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5352,7 +5487,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{ cpu }}", @@ -5364,9 +5499,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Utilizaion", + "title": "CPU Utilization", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5460,7 +5595,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -5469,6 +5604,9 @@ items: ], "thresholds": "80, 90", "title": "CPU Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5537,28 +5675,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory cached", "refId": "C" }, { - "expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5572,7 +5710,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5666,7 +5804,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -5675,6 +5813,9 @@ items: ], "thresholds": "80, 90", "title": "Memory Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -5750,21 +5891,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_read_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_written_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_io_time_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5778,7 +5919,7 @@ items: "timeShift": null, "title": "Disk I/O", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5855,7 +5996,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_filesystem_usage:\n", + "expr": "node:node_filesystem_usage:{cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5869,7 +6010,7 @@ items: "timeShift": null, "title": "Disk Space Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -5959,7 +6100,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5973,7 +6114,7 @@ items: "timeShift": null, "title": "Network Received", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6050,7 +6191,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6064,7 +6205,7 @@ items: "timeShift": null, "title": "Network Transmitted", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6154,14 +6295,14 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "inodes used", "refId": "A" }, { - "expr": "max(node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "inodes free", @@ -6175,7 +6316,7 @@ items: "timeShift": null, "title": "Inodes Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6269,7 +6410,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -6278,6 +6419,9 @@ items: ], "thresholds": "80, 90", "title": "Inodes Usage", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -6326,6 +6470,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -6336,7 +6506,7 @@ items: "options": [ ], - "query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)", + "query": "label_values(node_boot_time_seconds{cluster=\"$cluster\", job=\"node-exporter\"}, instance)", "refresh": 2, "regex": "", "sort": 0, @@ -6461,7 +6631,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "expr": "(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ Usage }}", @@ -6475,7 +6645,7 @@ items: "timeShift": null, "title": "Volume Space Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6565,7 +6735,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "kubelet_volume_stats_inodes_used{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ Usage }}", @@ -6579,7 +6749,7 @@ items: "timeShift": null, "title": "Volume inodes Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6649,6 +6819,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -6659,7 +6855,7 @@ items: "options": [ ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}, exported_namespace)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, exported_namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -6685,7 +6881,7 @@ items: "options": [ ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", "refresh": 2, "regex": "", "sort": 0, @@ -6805,25 +7001,26 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Current: {{ container_name }}", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Requested: {{ container }}", "refId": "B" }, { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Limit: {{ container }}", @@ -6837,7 +7034,7 @@ items: "timeShift": null, "title": "Memory Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -6922,11 +7119,12 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ container_name }}", @@ -6940,7 +7138,7 @@ items: "timeShift": null, "title": "CPU Usage", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -7025,11 +7223,12 @@ items: ], "spaceLength": 10, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", @@ -7043,7 +7242,7 @@ items: "timeShift": null, "title": "Network I/O", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -7113,6 +7312,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -7123,7 +7348,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -7149,7 +7374,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\u007e\"$namespace\"}, pod)", "refresh": 2, "regex": "", "sort": 0, @@ -7175,7 +7400,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "query": "label_values(kube_pod_container_info{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}, container)", "refresh": 2, "regex": "", "sort": 0, @@ -7316,7 +7541,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7325,6 +7550,9 @@ items: ], "thresholds": "", "title": "CPU", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7396,7 +7624,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7405,6 +7633,9 @@ items: ], "thresholds": "", "title": "Memory", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7476,7 +7707,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7485,6 +7716,9 @@ items: ], "thresholds": "", "title": "Network", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7571,7 +7805,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7580,6 +7814,9 @@ items: ], "thresholds": "", "title": "Desired Replicas", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7652,7 +7889,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7661,6 +7898,9 @@ items: ], "thresholds": "", "title": "Replicas of current version", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7733,7 +7973,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7742,6 +7982,9 @@ items: ], "thresholds": "", "title": "Observed Generation", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7814,7 +8057,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -7823,6 +8066,9 @@ items: ], "thresholds": "", "title": "Metadata Generation", + "tooltip": { + "shared": false + }, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -7890,35 +8136,35 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas specified", "refId": "A" }, { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas created", "refId": "B" }, { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "ready", "refId": "C" }, { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "replicas of current version", "refId": "D" }, { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "updated", @@ -7932,7 +8178,7 @@ items: "timeShift": null, "title": "Replicas", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -8002,6 +8248,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index d6d1567212f117ddc7c596a3113c85d2e862c982..d57ea12d0ea1db86f9d77d47a49735f935a9d33b 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -71,7 +71,7 @@ spec: - --extra-cpu=2m - --memory=150Mi - --extra-memory=30Mi - - --acceptance-offset=5 + - --threshold=5 - --deployment=kube-state-metrics env: - name: MY_POD_NAME @@ -84,7 +84,7 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - image: gcr.io/google-containers/addon-resizer-amd64:2.1 + image: k8s.gcr.io/addon-resizer:1.8.4 name: addon-resizer resources: limits: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 19432b5f396f581aef94961720c5d8a71b1fd307..7f04e057280949ae7950056807175b1619b757ff 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -225,21 +225,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -769,9 +769,9 @@ spec: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 + sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3 for: 10m labels: severity: critical @@ -780,9 +780,33 @@ spec: message: API server is returning errors for {{ $value }}% of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests for + {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 for: 10m labels: severity: warning @@ -951,7 +975,7 @@ spec: log (WAL).' summary: Prometheus write-ahead log is corrupted expr: | - tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 + prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 for: 4h labels: severity: warning