diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 41e051876498ccf22256c52b731d2a86be297d2d..9e1b488199671eff5c1a8f6f9f5d52bb8f20db3c 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -1,10 +1,18 @@ #!/usr/bin/env bash +# exit immediately when a command fails +set -e +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail +# error on unset variables +set -u +# print each command before executing it +set -x manifest_prefix=${1-.} kubectl create namespace monitoring -kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/ +find ${manifest_prefix}/manifests/prometheus-operator/ -type f ! -name prometheus-operator-service-monitor.yaml -exec kubectl apply -f {} \; # Wait for CRDs to be ready. printf "Waiting for Operator to register custom resource definitions..." @@ -16,9 +24,15 @@ until kubectl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep until kubectl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done echo "done!" +# need to ensure that ServiceMonitors are registered before we can create the prometheus-operator ServiceMonitor +kubectl apply -f ${manifest_prefix}/manifests/prometheus-operator/prometheus-operator-service-monitor.yaml + kubectl apply -f ${manifest_prefix}/manifests/node-exporter/ kubectl apply -f ${manifest_prefix}/manifests/kube-state-metrics/ -kubectl apply -f ${manifest_prefix}/manifests/grafana/ +find ${manifest_prefix}/manifests/grafana/ -type f ! 
-name grafana-dashboard-definitions.yaml -exec kubectl apply -f {} \; + +# kubectl apply wants to put the previous version in an annotation, which is too large, therefore create instead of apply +kubectl create -f ${manifest_prefix}/manifests/grafana/grafana-dashboard-definitions.yaml kubectl apply -f ${manifest_prefix}/manifests/prometheus-k8s/ kubectl apply -f ${manifest_prefix}/manifests/alertmanager-main/ diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet index 2319aa356018ee946a77043820a931d2a9b053a0..631e5fa566ab5edd1e68316b7a506d817b2cb6fc 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-config.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s-config") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s-config", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet index f5d38ce7c8c10b74021e0775b5fc4671c82b4086..e88ece995daa60c35498e78e9268354c8cc255ae 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-default.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "default", "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "default", "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet index 
04c481cacfd36443c86582c7a6537ee115e70905..33967e0a08cd17996274166bc9e7bb39103daf09 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-kube-system.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "kube-system", "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, "kube-system", "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet b/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet index 7833f78534e4351653f4365d660a92029d0435e0..d70ed6ac803235eecb2e0f1c9c045cdf0486540f 100644 --- a/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-role-binding-namespace.libsonnet @@ -1,5 +1,5 @@ local prometheusNamespaceRoleBinding = import "prometheus-namespace-role-binding.libsonnet"; { - new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s") + new(namespace):: prometheusNamespaceRoleBinding.new(namespace, namespace, "prometheus-k8s", "prometheus-k8s") } diff --git a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet b/jsonnet/prometheus/prometheus-k8s-rules.libsonnet index abe98fa943dd632ff6fbda18a2ea7dd7af3f0183..d20145694b4cc86856beee3ee435594aa0bfea88 100644 --- a/jsonnet/prometheus/prometheus-k8s-rules.libsonnet +++ b/jsonnet/prometheus/prometheus-k8s-rules.libsonnet @@ -4,5 +4,6 @@ local configMap = k.core.v1.configMap; { new(namespace, ruleFiles):: configMap.new("prometheus-k8s-rules", ruleFiles) + + configMap.mixin.metadata.withLabels({role: "alert-rules", prometheus: "k8s"}) + configMap.mixin.metadata.withNamespace(namespace) } diff --git a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet 
b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet index 8b255fa0fd54534f908748ca2d0bc5d85937692e..a63bcc9ccde4497b8f5eb37fb77e145444a14dd2 100644 --- a/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet +++ b/jsonnet/prometheus/prometheus-namespace-role-binding.libsonnet @@ -2,12 +2,12 @@ local k = import "ksonnet.beta.3/k.libsonnet"; local roleBinding = k.rbac.v1.roleBinding; { - new(serviceAccountNamespace, namespace, name):: + new(serviceAccountNamespace, namespace, roleName, serviceAccountName):: roleBinding.new() + - roleBinding.mixin.metadata.withName(name) + + roleBinding.mixin.metadata.withName(roleName) + roleBinding.mixin.metadata.withNamespace(namespace) + roleBinding.mixin.roleRef.withApiGroup("rbac.authorization.k8s.io") + - roleBinding.mixin.roleRef.withName(name) + + roleBinding.mixin.roleRef.withName(roleName) + roleBinding.mixin.roleRef.mixinInstance({kind: "Role"}) + - roleBinding.withSubjects([{kind: "ServiceAccount", name: name, namespace: serviceAccountNamespace}]) + roleBinding.withSubjects([{kind: "ServiceAccount", name: serviceAccountName, namespace: serviceAccountNamespace}]) } diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests/grafana/grafana-dashboard-definitions.yaml index df4c5203b2537f72ed93f6d438ef9a188e7e2961..573281af2e269348f2a248d33089fd7e37a0a6dc 100644 --- a/manifests/grafana/grafana-dashboard-definitions.yaml +++ b/manifests/grafana/grafana-dashboard-definitions.yaml @@ -3502,7 +3502,7 @@ data: "title": "Kubernetes Cluster Status", "version": 0 } - kubernetes-kubelet-dashboard.json: |- + kubernetes-control-plane-status-dashboard.json: |- { "annotations": { "list": [ @@ -3517,7 +3517,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -3529,16 +3529,16 @@ data: "colorBackground": false, "colorValue": false, "colors": [ - "#299c46", + "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", - "#d44a3a" + "rgba(50, 172, 45, 0.97)" ], 
"datasource": "prometheus", - "format": "none", + "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, - "show": false, + "show": true, "thresholdLabels": false, "thresholdMarkers": true }, @@ -3575,151 +3575,130 @@ data: "to": "null" } ], - "span": 2, + "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, "lineColor": "rgb(31, 120, 193)", - "show": true + "show": false }, "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_pod_count{instance=\u007e\"$instance\"})", + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "" } ], - "thresholds": "", - "title": "Count", + "thresholds": "50, 80", + "title": "API Servers UP", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", - "text": "0", + "text": "N/A", "value": "null" } ], "valueName": "current" }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], "datasource": "prometheus", - "fill": 1, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, "gridPos": { }, "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "interval": null, + "links": [ ], - "spaceLength": 10, - "span": 10, - "stack": true, - "steppedLine": false, - "targets": [ + "mappingType": 1, + "mappingTypes": [ { - "expr": 
"kubelet_running_pod_count{instance=\u007e\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ instance }}", - "refId": "A" + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 } ], - "thresholds": [ - + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } ], - "timeFrom": null, - "timeShift": null, - "title": "Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false }, - "yaxes": [ + "tableColumn": "", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Pods", - "titleSize": "h4", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ + ], + "valueName": "current" + }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ - "#299c46", + "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", - "#d44a3a" + "rgba(50, 
172, 45, 0.97)" ], "datasource": "prometheus", - "format": "none", + "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, - "show": false, + "show": true, "thresholdLabels": false, "thresholdMarkers": true }, @@ -3756,129 +3735,122 @@ data: "to": "null" } ], - "span": 2, + "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, "lineColor": "rgb(31, 120, 193)", - "show": true + "show": false }, "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_container_count{instance=\u007e\"$instance\"})", + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "" } ], - "thresholds": "", - "title": "Count", + "thresholds": "50, 80", + "title": "Schedulers UP", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", - "text": "0", + "text": "N/A", "value": "null" } ], "valueName": "current" }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], "datasource": "prometheus", - "fill": 1, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, "gridPos": { }, "id": 5, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "interval": null, + "links": [ ], - "spaceLength": 10, - "span": 10, - "stack": true, - "steppedLine": false, - "targets": [ + "mappingType": 1, + "mappingTypes": [ { - "expr": 
"kubelet_running_container_count{instance=\u007e\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ instance }}", - "refId": "A" + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 } ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false }, - "yaxes": [ + "tableColumn": "", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=\u007e\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "50, 80", + "title": "API Request Error Rate", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] + ], + "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Containers", - "titleSize": "h4", + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", "type": "row" }, { @@ -3894,7 +3866,6 @@ data: "dashLength": 10, "dashes": false, "datasource": "prometheus", - "description": "Rate of Kubelet Operations in 
5min", "fill": 1, "gridPos": { @@ -3928,10 +3899,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubelet_runtime_operations{instance=\u007e\"$instance\"}[5m])) by (instance)", + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ instance }}", + "legendFormat": "", "refId": "A" } ], @@ -3940,7 +3911,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Operations", + "title": "API Request Latency", "tooltip": { "shared": true, "sort": 0, @@ -3962,7 +3933,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -3970,7 +3941,7 @@ data: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -3979,97 +3950,11 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Kubelet", - "titleSize": "h4", + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "allValue": null, - "current": { - - }, - "datasource": "prometheus", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ - - ], - "query": "label_values(kubelet_running_pod_count,instance)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kubelet", - "version": 0 - } - nodes.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - 
"graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ + }, { "collapse": false, "collapsed": false, @@ -4087,7 +3972,7 @@ data: "gridPos": { }, - "id": 2, + "id": 7, "legend": { "alignAsTable": false, "avg": false, @@ -4111,16 +3996,23 @@ data: ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "expr": "sum by(instance) (rate(apiserver_request_count{code!\u007e\"2..\"}[5m]))", "format": "time_series", - "intervalFactor": 10, - "legendFormat": "{{cpu}}", + "intervalFactor": 2, + "legendFormat": "Error Rate", "refId": "A" + }, + { + "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B" } ], "thresholds": [ @@ -4128,7 +4020,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Idle CPU", + "title": "API Request Rate", "tooltip": { "shared": true, "sort": 0, @@ -4146,23 +4038,37 @@ data: }, "yaxes": [ { - "format": "percent", + "format": "short", "label": null, "logBase": 1, - "max": 100, - "min": 0, + "max": null, + "min": null, "show": true }, { - "format": "percent", + "format": "short", "label": null, "logBase": 1, - "max": 100, - "min": 0, + "max": null, + "min": null, "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ { "aliasColors": { @@ -4175,7 +4081,7 @@ data: "gridPos": { }, - "id": 3, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -4199,30 +4105,16 @@ data: ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "node_load1{instance=\"$server\"} * 
100", + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", "format": "time_series", "intervalFactor": 2, - "legendFormat": "load 1m", + "legendFormat": "", "refId": "A" - }, - { - "expr": "node_load5{instance=\"$server\"} * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 5m", - "refId": "B" - }, - { - "expr": "node_load15{instance=\"$server\"} * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 15m", - "refId": "C" } ], "thresholds": [ @@ -4230,7 +4122,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "System load", + "title": "End to End Scheduling Latency", "tooltip": { "shared": true, "sort": 0, @@ -4248,7 +4140,7 @@ data: }, "yaxes": [ { - "format": "percent", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -4256,7 +4148,7 @@ data: "show": true }, { - "format": "percent", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -4273,17 +4165,157 @@ data: "title": "Dashboard Row", "titleSize": "h6", "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Control Plane Status", + "version": 0 + } + kubernetes-kubelet-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + 
"cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(kubelet_running_pod_count{instance=\u007e\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Count", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + + }, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", @@ -4291,7 +4323,7 @@ data: "gridPos": { }, - "id": 4, + "id": 3, "legend": { "alignAsTable": false, "avg": false, @@ -4315,37 +4347,16 @@ data: ], "spaceLength": 10, - "span": 9, - "stack": false, + "span": 10, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "expr": "kubelet_running_pod_count{instance=\u007e\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": 
"memory used", + "legendFormat": "{{ instance }}", "refId": "A" - }, - { - "expr": "node_memory_Buffers{instance=\"$server\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "refId": "B" - }, - { - "expr": "node_memory_Cached{instance=\"$server\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory cached", - "refId": "C" - }, - { - "expr": "node_memory_MemFree{instance=\"$server\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory free", - "refId": "D" } ], "thresholds": [ @@ -4353,7 +4364,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Count", "tooltip": { "shared": true, "sort": 0, @@ -4371,45 +4382,59 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Pods", + "titleSize": "h4", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ - "rgba(50, 172, 45, 0.97)", + "#299c46", "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" + "#d44a3a" ], "datasource": "prometheus", - "format": "percent", + "format": "none", "gauge": { "maxValue": 100, "minValue": 0, - "show": true, + "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { }, - "id": 5, + "id": 4, "interval": null, "links": [ @@ -4439,49 +4464,34 @@ data: "to": "null" } ], - "span": 3, + "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, "lineColor": "rgb(31, 120, 193)", - "show": false + "show": true }, "tableColumn": "", "targets": [ { - 
"expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "expr": "sum(kubelet_running_container_count{instance=\u007e\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "" } ], - "thresholds": "80, 90", - "title": "Memory Usage", + "thresholds": "", + "title": "Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", - "text": "N/A", + "text": "0", "value": "null" } ], "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -4494,7 +4504,7 @@ data: "gridPos": { }, - "id": 6, + "id": 5, "legend": { "alignAsTable": false, "avg": false, @@ -4515,40 +4525,19 @@ data: "renderer": "flot", "repeat": null, "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 10, + "stack": true, + "steppedLine": false, + "targets": [ { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "expr": "kubelet_running_container_count{instance=\u007e\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "read", + "legendFormat": "{{ instance }}", "refId": "A" - }, - { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "written", - "refId": "B" - }, - { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", - "format": "time_series", - 
"intervalFactor": 2, - "legendFormat": "io time", - "refId": "C" } ], "thresholds": [ @@ -4556,7 +4545,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Disk I/O", + "title": "Count", "tooltip": { "shared": true, "sort": 0, @@ -4574,110 +4563,30 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "ms", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "prometheus", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) * 100", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" - } - ], - "thresholds": "80, 90", - "title": "Disk Space Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - 
"text": "N/A", - "value": "null" - } - ], - "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", + "showTitle": true, + "title": "Containers", + "titleSize": "h4", "type": "row" }, { @@ -4693,99 +4602,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "prometheus", + "description": "Rate of Kubelet Operations in 5min", "fill": 1, "gridPos": { }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{device}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Received", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - - }, - "id": 9, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -4809,15 +4631,15 @@ data: ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": 
false, "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "expr": "sum(rate(kubelet_runtime_operations{instance=\u007e\"$instance\"}[5m])) by (instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", + "legendFormat": "{{ instance }}", "refId": "A" } ], @@ -4826,7 +4648,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Network Transmitted", + "title": "Operations", "tooltip": { "shared": true, "sort": 0, @@ -4844,19 +4666,19 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -4865,9 +4687,9 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", + "showTitle": true, + "title": "Kubelet", + "titleSize": "h4", "type": "row" } ], @@ -4885,14 +4707,14 @@ data: }, "datasource": "prometheus", "hide": 0, - "includeAll": false, + "includeAll": true, "label": null, "multi": false, - "name": "server", + "name": "instance", "options": [ ], - "query": "label_values(node_boot_time, instance)", + "query": "label_values(kubelet_running_pod_count,instance)", "refresh": 2, "regex": "", "sort": 0, @@ -4936,10 +4758,10 @@ data: ] }, "timezone": "browser", - "title": "Nodes", + "title": "Kubelet", "version": 0 } - pods-dashboard.json: |- + kubernetes-resource-requests-dashboard.json: |- { "annotations": { "list": [ @@ -4954,7 +4776,7 @@ data: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -4975,12 +4797,12 @@ data: }, "id": 2, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, + "alignAsTable": false, + "avg": false, + "current": false, + 
"max": false, + "min": false, + "rightSide": false, "show": true, "total": false, "values": false @@ -4997,30 +4819,23 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 9, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Current: {{ container_name }}", + "legendFormat": "Allocatable CPU Cores", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Requested: {{ container }}", + "legendFormat": "Requested CPU Cores", "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{ container }}", - "refId": "C" } ], "thresholds": [ @@ -5028,7 +4843,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "CPU Cores", "tooltip": { "shared": true, "sort": 0, @@ -5046,7 +4861,7 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5054,7 +4869,7 @@ data: "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5062,6 +4877,86 @@ data: "show": true } ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 
0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" } ], "repeat": null, @@ -5089,81 +4984,2158 @@ data: "gridPos": { }, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A" + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": 
"rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Resource Requests", + "version": 0 + } + nodes.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "format": "time_series", + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Idle CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 1m", + "refId": "A" + }, + { + "expr": "node_load5{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 5m", + "refId": "B" + }, + { + "expr": "node_load15{instance=\"$server\"} * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 15m", + "refId": "C" + } + ], + 
"thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 
100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "written", + "refId": "B" + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "io time", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "80, 90", + "title": "Disk Space Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": 
"Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!\u007e\"lo\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [ + + ], + "query": "label_values(node_boot_time, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 0 + } + pods-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": 
false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current: {{ container_name }}", + "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\", container!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": 
null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", 
+ "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [ + + ], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 0 + } + statefulset-dashboard.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + 
"minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + 
"lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}) / 1024^3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=\u007e\"$statefulset_name.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": 
"current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_replicas{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + 
"gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas_current{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Replicas of current version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + 
"fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "max(kube_statefulset_status_observed_generation{namespace=\"$statefulset_namespace\",statefulset=\"$statefulset_name\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ container_name }}", - "refId": "A" + "legendFormat": "" } ], - "thresholds": [ - + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "gridPos": { - ] }, - "yaxes": [ + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "name": "value to text", + "value": 1 }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "name": "range to text", + "value": 2 } - ] + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": 
"null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "" + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" } ], "repeat": null, @@ -5191,14 +7163,14 @@ data: "gridPos": { }, - "id": 4, + "id": 9, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": false @@ -5220,11 +7192,39 @@ data: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", + "legendFormat": "replicas specified", "refId": "A" + }, + { + "expr": "max(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas created", + "refId": "B" + }, + { + "expr": "min(kube_statefulset_status_replicas_ready{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "ready", + "refId": "C" + }, + { + "expr": 
"min(kube_statefulset_status_replicas_current{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas of current version", + "refId": "D" + }, + { + "expr": "min(kube_statefulset_status_replicas_updated{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "E" } ], "thresholds": [ @@ -5232,7 +7232,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Network I/O", + "title": "Replicas", "tooltip": { "shared": true, "sort": 0, @@ -5250,19 +7250,19 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -5294,11 +7294,11 @@ data: "includeAll": false, "label": "Namespace", "multi": false, - "name": "namespace", + "name": "statefulset_namespace", "options": [ ], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(kube_statefulset_metadata_generation, namespace)", "refresh": 2, "regex": "", "sort": 0, @@ -5318,39 +7318,13 @@ data: "datasource": "prometheus", "hide": 0, "includeAll": false, - "label": "Pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "prometheus", - "hide": 0, - "includeAll": true, - "label": "Container", + "label": "Name", "multi": false, - "name": "container", + "name": "statefulset_name", "options": [ ], - "query": 
"label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", "refresh": 2, "regex": "", "sort": 0, @@ -5394,7 +7368,7 @@ data: ] }, "timezone": "browser", - "title": "Pods", + "title": "StatefulSets", "version": 0 } kind: ConfigMap diff --git a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml index f21ff3c5978a18636dd43473d13eb3e62a697f8b..ec0129db5bf7f474b8aa124b63be72d50e97d667 100644 --- a/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-role-binding-config.yaml @@ -9,5 +9,5 @@ roleRef: name: prometheus-k8s-config subjects: - kind: ServiceAccount - name: prometheus-k8s-config + name: prometheus-k8s namespace: monitoring diff --git a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml index 90cb3f3e165c6a07d0206fa98682b00814bc02ae..0c03de56dab52307ffac3ec4be317e5e8af61f9a 100644 --- a/manifests/prometheus-k8s/prometheus-k8s-rules.yaml +++ b/manifests/prometheus-k8s/prometheus-k8s-rules.yaml @@ -583,5 +583,8 @@ data: rejected\n" kind: ConfigMap metadata: + labels: + prometheus: k8s + role: alert-rules name: prometheus-k8s-rules namespace: monitoring