diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 435ece5ef480224dc1c3f76625012f7bd23ecd43..819ed41e3e6f6bd193431508b94842a74e19dc30 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "cbc1340af53f50728181f97f6bce442ac33d8993", - "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw=" + "version": "fb7703ab430ca2f23cd2f70bad8d1d38f9b165fb", + "sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0=" }, { "name": "grafana", @@ -30,7 +30,7 @@ "subdir": "grafana-builder" } }, - "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd", + "version": "8813f9108550cd510b17ce5a8bac9261dd140e13", "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" }, { @@ -41,8 +41,8 @@ "subdir": "grafonnet" } }, - "version": "b82411476842f583817e67feff5becf1228fd540", - "sum": "mEosZ6hZCTCw8AaASEtRFjY8PSmpvqI3xj6IWpwcroU=" + "version": "22420e64b58913754871a1c7d6c7b3cfd3651460", + "sum": "z/+CW6854tMTaI4+iRbx0nwP/m/vzf5LezBfZVXZKqU=" }, { "name": "ksonnet", @@ -72,8 +72,8 @@ "subdir": "" } }, - "version": "252be5f3d16950aed725601ce1020dd5917aac67", - "sum": "PmOBMPbWn9/P8zVY4er20VKZCcuDW1NQXkjrX100lnE=" + "version": "a7ee9d1abe1b1a3670a02ede1135cadb660b9d0c", + "sum": "bFGN7aEnh4Hs+tkuRkmlWboht0RWt38qy+Y4gGS+pa0=" }, { "name": "node-mixin", @@ -83,7 +83,7 @@ "subdir": "docs/node-mixin" } }, - "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5", + "version": "2cae917bb7e0b6379221e8a24da012b16e63d661", "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg=" }, { @@ -94,8 +94,8 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "431844f0a7c289e4255a68f09a18fcca09637fb2", - "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM=" + "version": "fce2e131dba3d4150e1b44b7b2d1a5e62f655dc7", + "sum": "/cohvDTaIiLElG66tKeQsi4v1M9mlGDKjOBSWivL9TU=" }, { "name": "prometheus-operator", @@ -116,8 +116,19 @@ "subdir": "lib/promgrafonnet" } }, - "version": "325f8a46fac9605f1de8bc20ca811cb92d1ef7e5", + "version": "a7ee9d1abe1b1a3670a02ede1135cadb660b9d0c", "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" + }, + { + "name": "slo-libsonnet", + "source": { + "git": { + "remote": "https://github.com/metalmatze/slo-libsonnet", + "subdir": "slo-libsonnet" + } + }, + "version": "437c402c5f3ad86c3c16db8471f1649284fef0ee", + "sum": "2Zcyku1f558VrUpMaJnI78fahDksPLcS1idmxxwcQ7Q=" } ] } diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 26b21ab17c82e5c6696c494eb9d666a29a3028b6..757147e9e795abd993d523e0f02333541285d1d0 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1752,6 +1752,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -17985,6 +17987,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -19288,6 +19292,8 @@ items: "step": 10 } ], + "timeFrom": null, + "timeShift": null, "title": "Current Status", "type": "table" }, @@ -25920,12 +25926,9 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ - { - "alias": "/max_shards/", - "yaxis": 2 - } + ], "spaceLength": 10, "span": 12, @@ -25933,32 +25936,284 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "max_shards:{{queue}}", + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "min_shards:{{queue}}", - "refId": "B" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "desired_shards:{{queue}}", - "refId": "C" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Min Shards", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "current_shards:{{queue}}", - "refId": "D" + "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "refId": "A" } ], "thresholds": [ @@ -25966,7 +26221,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Shards: $queue", + "title": "Desired Shards", "tooltip": { "shared": true, "sort": 0, @@ -26026,7 +26281,7 @@ items: "gridPos": { }, - "id": 6, + "id": 9, "legend": { "alignAsTable": false, "avg": false, @@ -26048,7 +26303,7 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ ], @@ -26070,7 +26325,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Shard Capacity: $queue", + "title": "Shard Capacity", "tooltip": { "shared": true, "sort": 0, @@ -26117,7 +26372,7 @@ items: "gridPos": { }, - "id": 7, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -26139,7 +26394,7 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "queue", + "repeat": null, "seriesOverrides": [ ], @@ -26161,7 +26416,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Pending Samples: $queue", + "title": "Pending Samples", "tooltip": { "shared": true, "sort": 0, @@ -26221,7 +26476,7 @@ items: "gridPos": { }, - "id": 8, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -26312,7 +26567,7 @@ items: "gridPos": { }, - "id": 9, + "id": 12, "legend": { "alignAsTable": false, "avg": false, @@ -26416,7 +26671,7 @@ items: "gridPos": { }, - "id": 10, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -26507,7 +26762,7 @@ items: "gridPos": { }, - "id": 11, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -26598,7 +26853,7 @@ items: "gridPos": { }, - "id": 12, + "id": 15, "legend": { "alignAsTable": false, "avg": false, @@ -26689,7 +26944,7 @@ items: "gridPos": { }, - "id": 13, + "id": 16, "legend": { "alignAsTable": false, "avg": false, @@ -26870,16 +27125,7 @@ items: { "allValue": null, "current": { - "text": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "value": { - "selected": true, - "text": "All", - "value": "$__all" - } + }, "datasource": "$datasource", "hide": 0, @@ -26890,7 +27136,7 @@ items: "options": [ ], - "query": "label_values(prometheus_remote_storage_shards, queue)", + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, queue)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index ae67997e560cf1f307cfc0f74f8a9a6e9986da8d..bfa759bf7c4f8488da6b6d6a6d4c14d2d4534388 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -762,6 +762,156 @@ spec: for: 15m labels: severity: warning + - name: kube-apiserver-error + rules: + - alert: ErrorBudgetBurn + annotations: + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + expr: | + ( + status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000) + ) + or + ( + status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000) + ) + labels: + job: apiserver + severity: critical + - alert: ErrorBudgetBurn + annotations: + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + expr: | + ( + status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000) + ) + or + ( + status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000) + and + status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000) + ) + labels: + job: apiserver + severity: warning + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[5m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate5m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[30m] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate30m + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[2h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate2h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[6h] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate6h + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[1d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate1d + - expr: | + sum by (status_class) ( + label_replace( + rate(apiserver_request_total{job="apiserver"}[3d] + ), "status_class", "${1}xx", "code", "([0-9])..") + ) + labels: + job: apiserver + record: status_class:apiserver_request_total:rate3d + - expr: | + sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate5m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate5m + - expr: | + sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate30m{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate30m + - expr: | + sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1h + - expr: | + sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate2h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate2h + - expr: | + sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate6h{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate6h + - expr: | + sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate1d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate1d + - expr: | + sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"}) + / + sum(status_class:apiserver_request_total:rate3d{job="apiserver"}) + labels: + job: apiserver + record: status_class_5xx:apiserver_request_total:ratio_rate3d - name: kubernetes-system-apiserver rules: - alert: KubeAPILatencyHigh