diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 02b99502ab9de9053e8b2b20faa5c41cafb96811..e5e89252e85605180ede0401260ad0a0159a11e4 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -25,6 +25,940 @@ items: ], "refresh": "10s", "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"all\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Availability (30d) > 99.000", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": 3, + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 * (apiserver_request:availability30d{verb=\"all\"} - 0.990000)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "errorbudget", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ErrorBudget (30d) > 99.000", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability7d{verb=\"read\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Read Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(code_resource:apiserver_request_total:rate5m{verb=\"read\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Read SLI - Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Read SLI - Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Read SLI - Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability7d{verb=\"write\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Write Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(code_resource:apiserver_request_total:rate5m{verb=\"write\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Write SLI - Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Write SLI - Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Write SLI - Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, { "collapse": false, "collapsed": false, @@ -50,7 +984,7 @@ items: "gridPos": { }, - "id": 2, + "id": 12, "interval": null, "links": [ @@ -125,7 +1059,7 @@ items: "gridPos": { }, - "id": 3, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -237,7 +1171,7 @@ items: "gridPos": { }, - "id": 4, + "id": 14, "legend": { "alignAsTable": true, "avg": false, @@ -341,7 +1275,7 @@ items: "gridPos": { }, - "id": 5, + "id": 15, "legend": { "alignAsTable": false, "avg": false, @@ -432,7 +1366,7 @@ items: "gridPos": { }, - "id": 6, + "id": 16, "legend": { "alignAsTable": false, "avg": false, @@ -523,7 +1457,7 @@ items: "gridPos": { }, - "id": 7, + "id": 17, "legend": { "alignAsTable": true, "avg": false, @@ -627,7 +1561,7 @@ items: "gridPos": { }, - "id": 8, + "id": 18, "legend": { "alignAsTable": false, "avg": false, @@ -718,7 +1652,7 @@ items: "gridPos": { }, - "id": 9, + "id": 19, "legend": { "alignAsTable": false, "avg": false, @@ -816,7 +1750,7 @@ items: "gridPos": { }, - "id": 10, + "id": 20, "legend": { "alignAsTable": false, "avg": false, @@ -927,7 +1861,7 @@ items: "gridPos": { }, - "id": 11, + "id": 21, "legend": { "alignAsTable": false, "avg": false, @@ -1018,7 +1952,7 @@ items: "gridPos": { }, - "id": 12, + "id": 22, "legend": { "alignAsTable": false, "avg": false, @@ -1109,7 +2043,7 @@ items: "gridPos": { }, - "id": 13, + "id": 23, "legend": { "alignAsTable": false, "avg": false, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 1161a8bdf8368cedc90adbd809f2352ff56db4e6..79a8e55790ffe63905c1afaa6b0d5d5ebe55c238 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -65,122 +65,361 @@ spec: rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) ) record: instance:node_network_transmit_drop_excluding_lo:rate1m - - name: kube-apiserver-error + - name: kube-apiserver.rules rules: - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[5m] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate5m + verb: read + record: apiserver_request:burnrate1d - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[30m] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate30m + verb: read + record: apiserver_request:burnrate1h - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[1h] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate1h + verb: read + record: apiserver_request:burnrate2h - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[2h] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate2h + verb: read + record: apiserver_request:burnrate30m - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[6h] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate6h + verb: read + record: apiserver_request:burnrate3d - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[1d] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate1d + verb: read + record: apiserver_request:burnrate5m - expr: | - sum by (status_class) ( - label_replace( - rate(apiserver_request_total{job="apiserver"}[3d] - ), "status_class", "${1}xx", "code", "([0-9])..") + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: - job: apiserver - record: status_class:apiserver_request_total:rate3d + verb: read + record: apiserver_request:burnrate6h - expr: | - sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) / - sum(status_class:apiserver_request_total:rate5m{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate5m + verb: write + record: apiserver_request:burnrate1d - expr: | - sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) / - sum(status_class:apiserver_request_total:rate30m{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate30m + verb: write + record: apiserver_request:burnrate1h - expr: | - sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) / - sum(status_class:apiserver_request_total:rate1h{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate1h + verb: write + record: apiserver_request:burnrate2h - expr: | - sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) / - sum(status_class:apiserver_request_total:rate2h{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate2h + verb: write + record: apiserver_request:burnrate30m - expr: | - sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) / - sum(status_class:apiserver_request_total:rate6h{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate6h + verb: write + record: apiserver_request:burnrate3d - expr: | - sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) / - sum(status_class:apiserver_request_total:rate1d{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate1d + verb: write + record: apiserver_request:burnrate5m - expr: | - sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"}) + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) / - sum(status_class:apiserver_request_total:rate3d{job="apiserver"}) + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: - job: apiserver - record: status_class_5xx:apiserver_request_total:ratio_rate3d - - name: kube-apiserver.rules - rules: + verb: write + record: apiserver_request:burnrate6h + - expr: | + 1 - ( + ( + # write too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + ( + # read too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + - + ( + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + ) + + # errors + sum(code:apiserver_request_total:increase30d{code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + - + ( + # too slow + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30d])) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d])) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) / @@ -628,7 +867,7 @@ spec: ) or ( - node_timex_offset_seconds < 0.05 + node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0 ) @@ -791,7 +1030,7 @@ spec: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 - for: 10m + for: 15m labels: severity: warning - alert: KubeCronJobRunning @@ -865,11 +1104,11 @@ spec: for: 5m labels: severity: warning - - alert: KubeMemOvercommit + - alert: KubeMemoryOvercommit annotations: message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit expr: | sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) / @@ -881,10 +1120,10 @@ spec: for: 5m labels: severity: warning - - alert: KubeCPUOvercommit + - alert: KubeCPUQuotaOvercommit annotations: message: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) / @@ -893,10 +1132,10 @@ spec: for: 5m labels: severity: warning - - alert: KubeMemOvercommit + - alert: KubeMemoryQuotaOvercommit annotations: message: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) / @@ -934,12 +1173,12 @@ spec: severity: warning - name: kubernetes-storage rules: - - alert: KubePersistentVolumeUsageCritical + - alert: KubePersistentVolumeFillingUp annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup expr: | kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / @@ -948,12 +1187,12 @@ spec: for: 1m labels: severity: critical - - alert: KubePersistentVolumeFullInFourDays + - alert: KubePersistentVolumeFillingUp annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup expr: | ( kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} @@ -964,7 +1203,7 @@ spec: predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 for: 1h labels: - severity: critical + severity: warning - alert: KubePersistentVolumeErrors annotations: message: The persistent volume {{ $labels.persistentvolume }} has status {{ @@ -1000,47 +1239,51 @@ spec: for: 15m labels: severity: warning - - name: kube-apiserver-error-alerts + - name: kube-apiserver-slos rules: - - alert: ErrorBudgetBurn + - alert: KubeAPIErrorBudgetBurn annotations: - message: 'High requests error budget burn for job=apiserver (current value: - {{ $value }})' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + message: The API server is burning too much error budget + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn expr: | - ( - status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000) - and - status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000) - ) - or - ( - status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000) - and - status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000) - ) + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m labels: - job: apiserver severity: critical - - alert: ErrorBudgetBurn + - alert: KubeAPIErrorBudgetBurn annotations: - message: 'High requests error budget burn for job=apiserver (current value: - {{ $value }})' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn + message: The API server is burning too much error budget + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn expr: | - ( - status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000) - and - status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000) - ) - or - ( - status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000) - and - status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000) - ) + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + severity: critical + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + severity: warning + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h labels: - job: apiserver severity: warning - name: kubernetes-system-apiserver rules: