From 4b6a761dc5d1d66884bc6abb0b7f2b818e2bfc56 Mon Sep 17 00:00:00 2001
From: Frederic Branczyk <fbranczyk@gmail.com>
Date: Mon, 18 Jun 2018 10:56:38 +0200
Subject: [PATCH] kube-prometheus: Update kubernetes monitoring mixin

---
 manifests/grafana-dashboardDefinitions.yaml | 34 ++++++----
 manifests/prometheus-rules.yaml             | 73 +++++++++++++++++++--
 2 files changed, 90 insertions(+), 17 deletions(-)

diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index 6f04bb1f..1c973836 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -64,7 +64,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -150,7 +150,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -248,7 +248,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -334,7 +334,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -432,7 +432,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -518,7 +518,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -616,7 +616,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -702,7 +702,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -800,7 +800,7 @@ items:
                                   "format": "time_series",
                                   "intervalFactor": 2,
                                   "legendFormat": "{{node}}",
-                                  "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                                  "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                                   "step": 10
                               }
                           ],
@@ -909,6 +909,7 @@ items:
           },
           "timezone": "utc",
           "title": "K8s / USE Method / Cluster",
+          "uid": "a6e7d1362e1ddbb79db21d5bb40d7137",
           "version": 0
       }
   kind: ConfigMap
@@ -1851,6 +1852,7 @@ items:
           },
           "timezone": "utc",
           "title": "K8s / USE Method / Node",
+          "uid": "4ac4f123aae0ff6dbaf4f4f66120033b",
           "version": 0
       }
   kind: ConfigMap
@@ -2468,7 +2470,7 @@ items:
                                   "decimals": 2,
                                   "link": true,
                                   "linkTooltip": "Drill down",
-                                  "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell",
+                                  "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell",
                                   "pattern": "namespace",
                                   "thresholds": [
 
@@ -2828,7 +2830,7 @@ items:
                                   "decimals": 2,
                                   "link": true,
                                   "linkTooltip": "Drill down",
-                                  "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell",
+                                  "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell",
                                   "pattern": "namespace",
                                   "thresholds": [
 
@@ -3000,6 +3002,7 @@ items:
           },
           "timezone": "utc",
           "title": "K8s / Compute Resources / Cluster",
+          "uid": "efa86fd1d0c121a26444b636a3f509a8",
           "version": 0
       }
   kind: ConfigMap
@@ -3269,7 +3272,7 @@ items:
                                   "decimals": 2,
                                   "link": true,
                                   "linkTooltip": "Drill down",
-                                  "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
+                                  "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
                                   "pattern": "pod",
                                   "thresholds": [
 
@@ -3629,7 +3632,7 @@ items:
                                   "decimals": 2,
                                   "link": true,
                                   "linkTooltip": "Drill down",
-                                  "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
+                                  "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
                                   "pattern": "pod",
                                   "thresholds": [
 
@@ -3828,6 +3831,7 @@ items:
           },
           "timezone": "utc",
           "title": "K8s / Compute Resources / Namespace",
+          "uid": "85a562078cdf77779eaa1add43ccec1e",
           "version": 0
       }
   kind: ConfigMap
@@ -4683,6 +4687,7 @@ items:
           },
           "timezone": "utc",
           "title": "K8s / Compute Resources / Pod",
+          "uid": "6581e46e4e5c7ba40a07646395ef7b23",
           "version": 0
       }
   kind: ConfigMap
@@ -5609,6 +5614,7 @@ items:
           },
           "timezone": "browser",
           "title": "Nodes",
+          "uid": "fa49a4706d07a042595b664c87fb33ea",
           "version": 0
       }
   kind: ConfigMap
@@ -6098,6 +6104,7 @@ items:
           },
           "timezone": "browser",
           "title": "Pods",
+          "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea",
           "version": 0
       }
   kind: ConfigMap
@@ -6950,6 +6957,7 @@ items:
           },
           "timezone": "browser",
           "title": "StatefulSets",
+          "uid": "a31c1f46e6f727cb37c0d731a7245005",
           "version": 0
       }
   kind: ConfigMap
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index cca1c735..75d5f36e 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -202,21 +202,21 @@ spec:
         )
       record: node:node_memory_swap_io_bytes:sum_rate
     - expr: |
-        avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
       record: :node_disk_utilisation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3
+          irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_disk_utilisation:avg_irate
     - expr: |
-        avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
       record: :node_disk_saturation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3
+          irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
@@ -268,6 +268,7 @@ spec:
     - alert: AlertmanagerDown
       annotations:
         message: Alertmanager has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
       expr: |
         absent(up{job="alertmanager-main"} == 1)
       for: 15m
@@ -276,6 +277,7 @@ spec:
     - alert: KubeAPIDown
       annotations:
         message: KubeAPI has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
       expr: |
         absent(up{job="apiserver"} == 1)
       for: 15m
@@ -284,6 +286,7 @@ spec:
     - alert: KubeControllerManagerDown
       annotations:
         message: KubeControllerManager has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
       expr: |
         absent(up{job="kube-controller-manager"} == 1)
       for: 15m
@@ -292,6 +295,7 @@ spec:
     - alert: KubeSchedulerDown
       annotations:
         message: KubeScheduler has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
       expr: |
         absent(up{job="kube-scheduler"} == 1)
       for: 15m
@@ -300,6 +304,7 @@ spec:
     - alert: KubeStateMetricsDown
       annotations:
         message: KubeStateMetrics has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
       expr: |
         absent(up{job="kube-state-metrics"} == 1)
       for: 15m
@@ -308,6 +313,7 @@ spec:
     - alert: KubeletDown
       annotations:
         message: Kubelet has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
       expr: |
         absent(up{job="kubelet"} == 1)
       for: 15m
@@ -316,6 +322,7 @@ spec:
     - alert: NodeExporterDown
       annotations:
         message: NodeExporter has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
       expr: |
         absent(up{job="node-exporter"} == 1)
       for: 15m
@@ -324,6 +331,7 @@ spec:
     - alert: PrometheusDown
       annotations:
         message: Prometheus has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
       expr: |
         absent(up{job="prometheus-k8s"} == 1)
       for: 15m
@@ -332,6 +340,7 @@ spec:
     - alert: PrometheusOperatorDown
       annotations:
         message: PrometheusOperator has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
       expr: |
         absent(up{job="prometheus-operator"} == 1)
       for: 15m
@@ -343,6 +352,7 @@ spec:
       annotations:
         message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
           }}) is restarting {{ printf "%.2f" $value }} / second'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
       expr: |
         rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
       for: 1h
@@ -351,6 +361,7 @@ spec:
     - alert: KubePodNotReady
       annotations:
         message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
         sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0
       for: 1h
@@ -360,6 +371,7 @@ spec:
       annotations:
         message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
       expr: |
         kube_deployment_status_observed_generation{job="kube-state-metrics"}
           !=
@@ -371,6 +383,7 @@ spec:
       annotations:
         message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
       expr: |
         kube_deployment_spec_replicas{job="kube-state-metrics"}
           !=
@@ -382,6 +395,7 @@ spec:
       annotations:
         message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
       expr: |
         kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
           !=
@@ -393,6 +407,7 @@ spec:
       annotations:
         message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
       expr: |
         kube_statefulset_status_observed_generation{job="kube-state-metrics"}
           !=
@@ -404,6 +419,7 @@ spec:
       annotations:
         message: Only {{$value}}% of desired pods scheduled and ready for daemon set
           {{$labels.namespace}}/{{$labels.daemonset}}
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
       expr: |
         kube_daemonset_status_number_ready{job="kube-state-metrics"}
           /
@@ -415,6 +431,7 @@ spec:
       annotations:
         message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
           are not scheduled.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
       expr: |
         kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
           -
@@ -426,17 +443,48 @@ spec:
       annotations:
         message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
           are running where they are not supposed to run.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
       expr: |
         kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
       for: 10m
       labels:
         severity: warning
+    - alert: KubeCronJobRunning
+      annotations:
+        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking
+          more than 1h to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
+      expr: |
+        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
+      for: 1h
+      labels:
+        severity: warning
+    - alert: KubeJobCompletion
+      annotations:
+        message: Job {{ $labels.namespace }}/{{ $labels.job }} is taking more than
+          1h to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
+      expr: |
+        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"}  > 0
+      for: 1h
+      labels:
+        severity: warning
+    - alert: KubeJobFailed
+      annotations:
+        message: Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
+      expr: |
+        kube_job_status_failed{job="kube-state-metrics"}  > 0
+      for: 1h
+      labels:
+        severity: warning
   - name: kubernetes-resources
     rules:
     - alert: KubeCPUOvercommit
       annotations:
         message: Overcommited CPU resource requests on Pods, cannot tolerate node
           failure.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
           /
@@ -450,6 +498,7 @@ spec:
       annotations:
         message: Overcommited Memory resource requests on Pods, cannot tolerate node
           failure.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
           /
@@ -464,6 +513,7 @@ spec:
     - alert: KubeCPUOvercommit
       annotations:
         message: Overcommited CPU resource request quota on Namespaces.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
           /
@@ -475,6 +525,7 @@ spec:
     - alert: KubeMemOvercommit
       annotations:
         message: Overcommited Memory resource request quota on Namespaces.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
           /
@@ -487,6 +538,7 @@ spec:
       annotations:
         message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in
           namespace {{ $labels.namespace }}.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
       expr: |
         100 * kube_resourcequota{job="kube-state-metrics", type="used"}
           / ignoring(instance, job, type)
@@ -502,6 +554,7 @@ spec:
         message: The persistent volume claimed by {{ $labels.persistentvolumeclaim
           }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}%
           free.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
       expr: |
         100 * kubelet_volume_stats_available_bytes{job="kubelet"}
           /
@@ -515,6 +568,7 @@ spec:
         message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim
           }} in namespace {{ $labels.namespace }} is expected to fill up within four
           days.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
       expr: |
         predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0
       for: 5m
@@ -525,6 +579,7 @@ spec:
     - alert: KubeNodeNotReady
       annotations:
         message: '{{ $labels.node }} has been unready for more than an hour'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
       for: 1h
@@ -534,6 +589,7 @@ spec:
       annotations:
         message: There are {{ $value }} different versions of Kubernetes components
           running.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
       expr: |
         count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
       for: 1h
@@ -543,6 +599,7 @@ spec:
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
           }}' is experiencing {{ printf "%0.0f" $value }}% errors.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100
           /
@@ -555,6 +612,7 @@ spec:
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
           }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
       for: 15m
@@ -564,6 +622,7 @@ spec:
       annotations:
         message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to
           the limit of 110.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
         kubelet_running_pod_count{job="kubelet"} > 100
       for: 15m
@@ -573,6 +632,7 @@ spec:
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
           for {{$labels.verb}} {{$labels.resource}}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
         cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
       for: 10m
@@ -582,6 +642,7 @@ spec:
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
           for {{$labels.verb}} {{$labels.resource}}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
         cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
       for: 10m
@@ -590,6 +651,7 @@ spec:
     - alert: KubeAPIErrorsHigh
       annotations:
         message: API server is erroring for {{ $value }}% of requests.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
           /
@@ -600,6 +662,7 @@ spec:
     - alert: KubeAPIErrorsHigh
       annotations:
         message: API server is erroring for {{ $value }}% of requests.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
           /
@@ -610,6 +673,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: Kubernetes API certificate is expiring in less than 7 days.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
       labels:
@@ -617,6 +681,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: Kubernetes API certificate is expiring in less than 1 day.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       labels:
-- 
GitLab