From 0f5400e5fe99d73f9cb988e5a5146ceed0b6a26b Mon Sep 17 00:00:00 2001
From: Lili Cosic <cosiclili@gmail.com>
Date: Thu, 26 Sep 2019 14:53:40 +0200
Subject: [PATCH] manifests: Regenerate files
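
Regenerate the manifests to pick up upstream mixin changes:

* grafana-dashboardDefinitions.yaml: fix the truncated "Memory Usage
  (Swap)" alias and scope the statefulset namespace variable query to
  the selected cluster.
* grafana-deployment.yaml: move the Deployment from apps/v1beta2 to
  the stable apps/v1 API.
* prometheus-rules.yaml: exclude Job-owned pods from KubePodNotReady,
  switch KubeJobFailed to the kube_job_failed metric, add the
  KubeHpaReplicasMismatch and KubeHpaMaxedOut alerts, drop the
  kube-state-metrics KubeClientErrors variant, base KubeletTooManyPods
  on per-node pod capacity instead of a hard-coded limit of 110, drop
  PrometheusTSDBWALCorruptions, and add
  PrometheusRemoteWriteDesiredShards.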

---
 manifests/grafana-dashboardDefinitions.yaml |  4 +-
 manifests/grafana-deployment.yaml           |  2 +-
 manifests/prometheus-rules.yaml             | 83 ++++++++++++++-------
 3 files changed, 57 insertions(+), 32 deletions(-)
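
[Reviewer note; this sits outside the diff hunks and should not affect
`git apply`.] A quick sanity check that the regenerated files still
parse as YAML, as a minimal sketch assuming PyYAML is installed and the
script is run from the repository root:

    import yaml  # PyYAML; assumed available

    # The three files touched by this patch.
    for path in (
        "manifests/grafana-dashboardDefinitions.yaml",
        "manifests/grafana-deployment.yaml",
        "manifests/prometheus-rules.yaml",
    ):
        with open(path) as f:
            docs = list(yaml.safe_load_all(f))
        print(path, "->", len(docs), "document(s) parsed")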

diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index 34d90782..6120794d 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -5485,7 +5485,7 @@ items:
                                   "unit": "bytes"
                               },
                               {
-                                  "alias": "Memory Usage (Swap",
+                                  "alias": "Memory Usage (Swap)",
                                   "colorMode": null,
                                   "colors": [
 
@@ -19485,7 +19485,7 @@ items:
                       "options": [
 
                       ],
-                      "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, cluster=\"$cluster\", namespace)",
+                      "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
                       "refresh": 2,
                       "regex": "",
                       "sort": 0,
diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml
index 4fe23fa9..fd01d32d 100644
--- a/manifests/grafana-deployment.yaml
+++ b/manifests/grafana-deployment.yaml
@@ -1,4 +1,4 @@
-apiVersion: apps/v1beta2
+apiVersion: apps/v1
 kind: Deployment
 metadata:
   labels:
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index f6413d4c..769e6559 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -497,7 +497,7 @@ spec:
           state for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
+        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
       for: 15m
       labels:
         severity: critical
@@ -630,7 +630,33 @@ spec:
         message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
       expr: |
-        kube_job_status_failed{job="kube-state-metrics"}  > 0
+        kube_job_failed{job="kube-state-metrics"}  > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: KubeHpaReplicasMismatch
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
+          desired number of replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
+      expr: |
+        (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
+          !=
+        kube_hpa_status_current_replicas{job="kube-state-metrics"})
+          and
+        changes(kube_hpa_status_current_replicas[15m]) == 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: KubeHpaMaxedOut
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
+          max replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
+      expr: |
+        kube_hpa_status_current_replicas{job="kube-state-metrics"}
+          ==
+        kube_hpa_spec_max_replicas{job="kube-state-metrics"}
       for: 15m
       labels:
         severity: warning
@@ -761,7 +787,7 @@ spec:
     rules:
     - alert: KubeNodeNotReady
       annotations:
-        message: '{{ $labels.node }} has been unready for more than an hour.'
+        message: '{{ $labels.node }} has been unready for more than 15 minutes.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -791,23 +817,13 @@ spec:
       for: 15m
       labels:
         severity: warning
-    - alert: KubeClientErrors
-      annotations:
-        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
-          }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
-        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
-      expr: |
-        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
-      for: 15m
-      labels:
-        severity: warning
     - alert: KubeletTooManyPods
       annotations:
-        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
-          to the limit of 110.
+        message: Kubelet '{{ $labels.node }}' is running at {{ printf "%.4g" $value
+          }}% of its Pod capacity.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+        100 * max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 95
       for: 15m
       labels:
         severity: warning
@@ -991,17 +1007,6 @@ spec:
       for: 4h
       labels:
         severity: warning
-    - alert: PrometheusTSDBWALCorruptions
-      annotations:
-        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
-          {{$value | humanize}} corruptions of the write-ahead log (WAL) over the
-          last 3h.
-        summary: Prometheus is detecting WAL corruptions.
-      expr: |
-        increase(tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
     - alert: PrometheusNotIngestingSamples
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
@@ -1015,7 +1020,8 @@ spec:
     - alert: PrometheusDuplicateTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with different values but duplicated timestamp.
+          {{ printf "%.4g" $value  }} samples/s with different values but duplicated
+          timestamp.
         summary: Prometheus is dropping samples with duplicate timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -1025,7 +1031,7 @@ spec:
     - alert: PrometheusOutOfOrderTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with timestamps arriving out of order.
+          {{ printf "%.4g" $value  }} samples/s with timestamps arriving out of order.
         summary: Prometheus drops samples with out-of-order timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -1069,6 +1075,25 @@ spec:
       for: 15m
       labels:
         severity: critical
+    - alert: PrometheusRemoteWriteDesiredShards
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
+          desired shards calculation wants to run {{ $value }} shards, which
+          is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
+          $labels.instance | query | first | value }}.
+        summary: Prometheus remote write desired shards calculation wants to run more
+          than configured max shards.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
+        > on(job, instance) group_right
+          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
     - alert: PrometheusRuleFailures
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
-- 
GitLab