diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 90400a7311e897fd6461432476724ec140a5df94..68da316363f34abdbebd7381f1d63a33a7291f48 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,6 +33,7 @@ jobs: matrix: kind-image: - 'kindest/node:v1.19.0' + - 'kindest/node:v1.20.0' steps: - uses: actions/checkout@v2 - name: Start KinD diff --git a/README.md b/README.md index fb4fae3a8a0407c069f9e6f0a5da0888b60d3834..4b67ad11ef0a5bf23dee3f0f9afe3acbc90987d1 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ This adapter is an Extension API Server and Kubernetes needs to be have this fea To try out this stack, start [minikube](https://github.com/kubernetes/minikube) with the following command: ```shell -$ minikube delete && minikube start --kubernetes-version=v1.19.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +$ minikube delete && minikube start --kubernetes-version=v1.20.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 ``` The kube-prometheus stack includes a resource metrics API server, so the metrics-server addon is not necessary. Ensure the metrics-server addon is disabled on minikube: @@ -94,19 +94,19 @@ $ minikube addons disable metrics-server The following versions are supported and work as we test against these versions in their respective branches. But note that other versions might work! -| kube-prometheus stack | Kubernetes 1.14 | Kubernetes 1.15 | Kubernetes 1.16 | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | -|-----------------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| `release-0.3` | ✔ | ✔ | ✔ | ✔ | ✗ | ✗ | -| `release-0.4` | ✗ | ✗ | ✔ (v1.16.5+) | ✔ | ✗ | ✗ | -| `release-0.5` | ✗ | ✗ | ✗ | ✗ | ✔ | ✗ | -| `release-0.6` | ✗ | ✗ | ✗ | ✗ | ✔ | ✔ | -| `HEAD` | ✗ | ✗ | ✗ | ✗ | x | ✔ | +| kube-prometheus stack | Kubernetes 1.16 | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | Kubernetes 1.20 | +|-----------------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| `release-0.4` | ✔ (v1.16.5+) | ✔ | ✗ | ✗ | ✗ | +| `release-0.5` | ✗ | ✗ | ✔ | ✗ | ✗ | +| `release-0.6` | ✗ | ✗ | ✔ | ✔ | ✗ | +| `release-0.7` | ✗ | ✗ | ✗ | ✔ | ✔ | +| `HEAD` | ✗ | ✗ | ✗ | ✔ | ✔ | Note: Due to [two](https://github.com/kubernetes/kubernetes/issues/83778) [bugs](https://github.com/kubernetes/kubernetes/issues/86359) in Kubernetes v1.16.1, and prior to Kubernetes v1.16.5 the kube-prometheus release-0.4 branch only supports v1.16.5 and higher. The `extension-apiserver-authentication-reader` role in the kube-system namespace can be manually edited to include list and watch permissions in order to workaround the second issue with Kubernetes v1.16.2 through v1.16.4. ## Quickstart ->Note: For versions before Kubernetes v1.19.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. +>Note: For versions before Kubernetes v1.20.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 3d8e6df03b4ea1a4bbcc056859f39e2af42f10f6..20d65bdf8571f2d84df11e042a91d09df645787d 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -80,7 +80,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "release-2.22", + "version": "release-2.23", "name": "prometheus" }, { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 2f0dc1c4dcd0de4dfe53f997eb5da152b891a243..b4af75ef19154b579f4fefe515bf207e1b1bb5d9 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -38,8 +38,8 @@ "subdir": "grafana-builder" } }, - "version": "cb8787cd974c4291ee6aa2c17f211010429da554", - "sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc=" + "version": "9c3fb8096e1f80e2f3a84566566906ff187f5a8c", + "sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k=" }, { "source": { @@ -59,8 +59,8 @@ "subdir": "" } }, - "version": "d4dbc0aa59dd2c35453b53155fd4021719df5cb1", - "sum": "YKCJpap1C7G54dk6vD0BTJ9N6MmRGbooxmsHI2EQRDc=" + "version": "ead45674dba3c8712e422d99223453177aac6bf4", + "sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE=" }, { "source": { @@ -69,7 +69,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "d4dbc0aa59dd2c35453b53155fd4021719df5cb1", + "version": "ead45674dba3c8712e422d99223453177aac6bf4", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -89,7 +89,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "33db2356bf1f0a1f51ddaaeb165bce04ab5aa0df", + "version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4", "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" }, { @@ -140,8 +140,8 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "de1c1243f4dd66fbac3e8213e9a7bd8dbc9f38b2", - "sum": "CGxvaHkP7z/gnsLB/8Imvt/AnW+9nJUnTcL+fvIAZUs=", + "version": "26d89b4b0776fe4cd5a3656dfa520f119a375273", + "sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=", "name": "prometheus" }, { @@ -151,7 +151,7 @@ "subdir": "mixin" } }, - "version": "d57813b2bc9b349842e1f9a06313731b005c6e00", + "version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402", "sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0=" }, { diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1247bd913c637f3d884e80ea37e295265f1a07b5..ed5c20a48bf0f46f874846e1f6c637aeb1db8a9d 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -16657,7 +16657,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -16741,7 +16741,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -26924,7 +26924,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} != 0)\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27017,7 +27017,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", + "expr": "clamp_min(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n, 0)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 4b58d710914fd8cf7022dd20635677b845be1b1f..a0e31f774af9cb9c9b899782b2011940f18c5584 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -13,7 +13,7 @@ spec: template: metadata: annotations: - checksum/grafana-dashboards: b44634653e3bb90dacd5c15f42200fae + checksum/grafana-dashboards: ce13f0b50d04c73fb01da858eb1fb608 checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a labels: app: grafana diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index ed5afa368b01096c6a521c9241135f0e7644992c..97409d23c917bcaaee4dedc2294c1a148bb9f30a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1902,21 +1902,6 @@ spec: for: 15m labels: severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. @@ -1951,7 +1936,15 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) for: 10m labels: severity: warning @@ -2001,7 +1994,7 @@ spec: # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - - on(job, instance) group_right + - ignoring(remote_name, url) group_right max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) ) > 120 @@ -2050,6 +2043,21 @@ spec: for: 15m labels: severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical - name: general.rules rules: - alert: TargetDown