From 326453cf4748f7330468f5661dbb00f5b2bd2cea Mon Sep 17 00:00:00 2001 From: paulfantom <pawel@krupa.net.pl> Date: Mon, 16 Dec 2019 11:24:04 +0100 Subject: [PATCH] manifests: regenerate --- manifests/grafana-dashboardDefinitions.yaml | 326 +++++++++++++++++--- manifests/prometheus-rules.yaml | 150 +++++++++ 2 files changed, 436 insertions(+), 40 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 26b21ab1..757147e9 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1752,6 +1752,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -17985,6 +17987,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -19288,6 +19292,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -25920,12 +25926,9 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ - { - "alias": "/max_shards/", - "yaxis": 2 - } + ], "spaceLength": 10, "span": 12, @@ -25933,32 +25936,284 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "max_shards:{{queue}}", + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "min_shards:{{queue}}", - "refId": "B" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "desired_shards:{{queue}}", - "refId": "C" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Min Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "current_shards:{{queue}}", - "refId": "D" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" } ], "thresholds": [ @@ -25966,7 +26221,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Shards: $queue", + "title": "Desired Shards", "tooltip": { "shared": true, "sort": 0, @@ -26026,7 +26281,7 @@ items: "gridPos": { }, - "id": 6, + "id": 9, "legend": { "alignAsTable": false, "avg": false, @@ -26048,7 +26303,7 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ ], @@ -26070,7 +26325,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Shard Capacity: $queue", + "title": "Shard Capacity", "tooltip": { "shared": true, "sort": 0, @@ -26117,7 +26372,7 @@ items: "gridPos": { }, - "id": 7, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -26139,7 +26394,7 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ ], @@ -26161,7 +26416,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Pending Samples: $queue", + "title": "Pending Samples", "tooltip": { "shared": true, "sort": 0, @@ -26221,7 +26476,7 @@ items: "gridPos": { }, - "id": 8, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -26312,7 +26567,7 @@ items: "gridPos": { }, - "id": 9, + "id": 12, "legend": { "alignAsTable": false, "avg": false, @@ -26416,7 +26671,7 @@ items: "gridPos": { }, - "id": 10, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -26507,7 +26762,7 @@ items: "gridPos": { }, - "id": 11, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -26598,7 +26853,7 @@ items: "gridPos": { }, - "id": 12, + "id": 15, "legend": { "alignAsTable": false, "avg": false, @@ -26689,7 +26944,7 @@ items: "gridPos": { }, - "id": 13, + "id": 16, "legend": { "alignAsTable": false, "avg": false, @@ -26870,16 +27125,7 @@ items: { "allValue": null, "current": { - "text": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "value": { - "selected": true, - "text": "All", - "value": "$__all" - } + }, "datasource": "$datasource", "hide": 0, @@ -26890,7 +27136,7 @@ items: "options": [ ], - "query": "label_values(prometheus_remote_storage_shards, queue)", + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, queue)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index ae67997e..bfa759bf 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -762,6 +762,156 @@ spec: for: 15m labels: severity: warning + - name: kube-apiserver-error + rules: + - alert: ErrorBudgetBurn + annotations: + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + expr: | + ( + status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000) + ) + or + ( + status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000) + ) + labels: + job: apiserver + severity: critical + - alert: ErrorBudgetBurn + annotations: + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + expr: | + ( + status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000) + ) + or + ( + status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000) + ) + labels: + job: apiserver + severity: warning + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[5m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate5m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[30m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate30m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[2h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate2h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[6h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate6h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1d + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[3d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate3d + - expr: | + sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate5m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate5m + - expr: | + sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate30m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate30m + - expr: | + sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1h + - expr: | + sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate2h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate2h + - expr: | + sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate6h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate6h + - expr: | + sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1d + - expr: | + sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate3d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate3d - name: kubernetes-system-apiserver rules: - alert: KubeAPILatencyHigh -- GitLab