diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index e9484736a91237d71cf4d1bf8f2454d039d5b611..578be8e9691028b653fd081700e1bba526ac71e7 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -80,7 +80,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "release-2.23", + "version": "release-2.24", "name": "prometheus" }, { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 7efb4031631a74f9dbfec0b927027a04ff0b2f66..ec3c8607923c335cc7cc13dc05aa51cc838f8b54 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana" } }, - "version": "7176a6d54b3b19e0529ce574ab5ed427f1c721e9", - "sum": "IrxVMYJrTbDliaVMXX72jUKm8Ju2Za8cAbds7d26wuY=" + "version": "4204279da8d3d6317116ee161ac706fadbba9193", + "sum": "VUavLhri7lTnJ2V7F9lDlL+K96NwIhqqlxMtasYBs3Q=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "ca866c02422ff3f3d1f0876898a30c33dd7bcccf", + "version": "5dcd459ae9c7948f5620002f5b0bb9cf0b8f1502", "sum": "bLqTqEr0jky9zz5MV/7ucn6H5mph2NlXas0TVnGNB1Y=" }, { @@ -28,8 +28,8 @@ "subdir": "grafonnet" } }, - "version": "356bd73e4792ffe107725776ca8946895969c191", - "sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU=" + "version": "b0d72d6ed0e9fcab83fc2dd954b3bd57113e768c", + "sum": "g2UC37YmOShdIFThAO99Uw89UO+H3sHt+y0ionv9/sA=" }, { "source": { @@ -38,8 +38,8 @@ "subdir": "grafana-builder" } }, - "version": "9c3fb8096e1f80e2f3a84566566906ff187f5a8c", - "sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k=" + "version": "2cef89cb717c8b596443ac5de0415d1ffdb42252", + "sum": "EmHrmBY8PbnV0BKXmVWvAEmax6eglRinKSyZbTmVWuc=" }, { "source": { @@ -59,8 +59,8 @@ "subdir": "" } }, - "version": "ead45674dba3c8712e422d99223453177aac6bf4", - "sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE=" + "version": "4a8e078147dbca51067521e6ac59c7b54d44d3bd", + "sum": "D5XwKXhd3c0e+1D5iRgUhStB0qpcT4dSCmytuGQa3+k=" }, { "source": { @@ -69,7 +69,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "ead45674dba3c8712e422d99223453177aac6bf4", + "version": "4a8e078147dbca51067521e6ac59c7b54d44d3bd", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -89,7 +89,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4", + "version": "72d6d3106861f992b7d6ecc0a88abe9b12ad5427", "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" }, { @@ -99,7 +99,7 @@ "subdir": "jsonnet/mixin" } }, - "version": "5555f492df250168657b72bb8cb60bec071de71f", + "version": "788d4456425eaf8c1d613582995bdf7de02154b0", "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=" }, { @@ -119,8 +119,8 @@ "subdir": "doc/alertmanager-mixin" } }, - "version": "193ebba04d1e70d971047e983a0b489112610460", - "sum": "QcftU7gjCQyj7B6M4YJeCAeaPd0kwxd4J4rolo7AnLE=", + "version": "3f46b62d75da4d68d2098388797e6a61fcc5e043", + "sum": "VP1vn/WTGLZaBgGhGMUO81qNTc/fnp5KtzVjcaxad6Q=", "name": "alertmanager" }, { @@ -130,7 +130,7 @@ "subdir": "docs/node-mixin" } }, - "version": "8b466360a35581e0301bd22918be7011cf4203c3", + "version": "cfdd9dd0c983057df5e814e067fadbf8c7781559", "sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8=" }, { @@ -140,8 +140,8 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "26d89b4b0776fe4cd5a3656dfa520f119a375273", - "sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=", + "version": "e4487274853c587717006eeda8804e597d120340", + "sum": "6kUzElCBWZ5U/3cxEpHNMmoKKPubG45QxpmLu8PGg08=", "name": "prometheus" }, { diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml index ea78ad1123db69d3ab513b2c25cdd2a908e221ca..1fa9c22315d4894fb87f075b8a8fb1daea0c66e7 100644 --- a/manifests/alertmanager-prometheusRule.yaml +++ b/manifests/alertmanager-prometheusRule.yaml @@ -55,17 +55,31 @@ spec: - alert: AlertmanagerClusterFailedToSendAlerts annotations: description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. - summary: All Alertmanager instances in a cluster failed to send notifications. + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. expr: | - min by (namespace,service) ( - rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) / - rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) ) > 0.01 for: 5m labels: severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning - alert: AlertmanagerConfigInconsistent annotations: description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 22d4748885add3680094a37704356ec9d8d70909..18ee57bace54951818ca8a0e7c7e0d8ca2e59699 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -3,6 +3,11 @@ data: datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0= kind: Secret metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-datasources namespace: monitoring type: Opaque diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e5e62bc938ee47e8af8ea904c18a20c61e413950..5836c3acd1f7ef4393701bace7dbceeefe876be0 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1729,6 +1729,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-apiserver namespace: monitoring - apiVersion: v1 @@ -3595,6 +3600,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-cluster-total namespace: monitoring - apiVersion: v1 @@ -4730,6 +4740,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-controller-manager namespace: monitoring - apiVersion: v1 @@ -7296,6 +7311,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-cluster namespace: monitoring - apiVersion: v1 @@ -9566,6 +9586,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-namespace namespace: monitoring - apiVersion: v1 @@ -10528,6 +10553,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-node namespace: monitoring - apiVersion: v1 @@ -12284,6 +12314,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-pod namespace: monitoring - apiVersion: v1 @@ -14302,6 +14337,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-workload namespace: monitoring - apiVersion: v1 @@ -16481,6 +16521,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-k8s-resources-workloads-namespace namespace: monitoring - apiVersion: v1 @@ -18998,6 +19043,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-kubelet namespace: monitoring - apiVersion: v1 @@ -20446,6 +20496,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-namespace-by-pod namespace: monitoring - apiVersion: v1 @@ -22166,6 +22221,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-namespace-by-workload namespace: monitoring - apiVersion: v1 @@ -23114,6 +23174,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-node-cluster-rsrc-use namespace: monitoring - apiVersion: v1 @@ -24089,6 +24154,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-node-rsrc-use namespace: monitoring - apiVersion: v1 @@ -25070,6 +25140,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-nodes namespace: monitoring - apiVersion: v1 @@ -25262,7 +25337,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -25459,7 +25534,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -25631,6 +25706,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-persistentvolumesusage namespace: monitoring - apiVersion: v1 @@ -26843,6 +26923,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-pod-total namespace: monitoring - apiVersion: v1 @@ -26868,7 +26953,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "60s", "rows": [ { "collapse": false, @@ -27119,7 +27204,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n- \n (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27704,7 +27789,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28009,7 +28094,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28102,7 +28187,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28195,7 +28280,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28348,7 +28433,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -28492,11 +28577,16 @@ items: ] }, "timezone": "browser", - "title": "Prometheus Remote Write", + "title": "Prometheus / Remote Write", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-prometheus-remote-write namespace: monitoring - apiVersion: v1 @@ -28515,7 +28605,7 @@ items: "links": [ ], - "refresh": "10s", + "refresh": "60s", "rows": [ { "collapse": false, @@ -29594,7 +29684,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -29702,12 +29792,17 @@ items: ] }, "timezone": "utc", - "title": "Prometheus Overview", + "title": "Prometheus / Overview", "uid": "", "version": 0 } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-prometheus namespace: monitoring - apiVersion: v1 @@ -30923,6 +31018,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-proxy namespace: monitoring - apiVersion: v1 @@ -31981,6 +32081,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-scheduler namespace: monitoring - apiVersion: v1 @@ -32893,6 +32998,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-statefulset namespace: monitoring - apiVersion: v1 @@ -34315,6 +34425,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboard-workload-total namespace: monitoring kind: ConfigMapList diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml index fffec986bb45dcfc685e2cae8ea9a8be170ecc3b..ca27f99e50cc316096c50d8823a5b0b085017c0d 100644 --- a/manifests/grafana-dashboardSources.yaml +++ b/manifests/grafana-dashboardSources.yaml @@ -17,5 +17,10 @@ data: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana-dashboards namespace: monitoring diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index d6bb77dac9ab38201cf0588dbb88745ede6610ba..0f73af5a06c9b667d446043ecae4bb12c3fac91c 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -2,21 +2,28 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana namespace: monitoring spec: replicas: 1 selector: matchLabels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus template: metadata: annotations: - checksum/grafana-dashboards: b02ae450c84445cbaca8c685eefaec6c - checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a + checksum/grafana-datasources: a77789e5440a1e51e204e99e2f0f480a labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 spec: containers: - env: [] diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 5e7e1453ea6698bd65a4faab5b508e5fe3696daf..32a907461bd98551fbf47b1bc4f5027ce6c81eb2 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -2,7 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.5 name: grafana namespace: monitoring spec: @@ -11,5 +14,7 @@ spec: port: 3000 targetPort: http selector: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus type: NodePort diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml index d683cff6db28720100badf601d69070530dcbcbf..d3ee9527b90e186425129efcc5fbaf2bd0f31a2b 100644 --- a/manifests/kubernetes-prometheusRule.yaml +++ b/manifests/kubernetes-prometheusRule.yaml @@ -15,11 +15,11 @@ spec: rules: - alert: KubePodCrashLooping annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping summary: Pod is crash looping. expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) * 60 * 5 > 0 for: 15m labels: severity: warning @@ -499,11 +499,11 @@ spec: severity: critical - alert: AggregatedAPIErrors annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors summary: An aggregated API has reported errors. expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[10m])) > 4 labels: severity: warning - alert: AggregatedAPIDown @@ -526,6 +526,16 @@ spec: for: 15m labels: severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests + summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning - name: kubernetes-system-kubelet rules: - alert: KubeNodeNotReady @@ -1102,77 +1112,80 @@ spec: verb: write record: apiserver_request:availability30d - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h - expr: | sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml index aa4f0ce92218f59a2f5b70e592afad11c32667f2..d4d2ed68965cbbf21cb39be84f5cf9ee2f9755f9 100644 --- a/manifests/prometheus-prometheusRule.yaml +++ b/manifests/prometheus-prometheusRule.yaml @@ -202,9 +202,9 @@ spec: summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. expr: | min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) ) * 100 > 3