From 4b6a761dc5d1d66884bc6abb0b7f2b818e2bfc56 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk <fbranczyk@gmail.com> Date: Mon, 18 Jun 2018 10:56:38 +0200 Subject: [PATCH] kube-prometheus: Update kubernetes monitoring mixing --- manifests/grafana-dashboardDefinitions.yaml | 34 ++++++---- manifests/prometheus-rules.yaml | 73 +++++++++++++++++++-- 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 6f04bb1f..1c973836 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -64,7 +64,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -150,7 +150,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -248,7 +248,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -334,7 +334,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -432,7 +432,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -518,7 +518,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": 
"/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -616,7 +616,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -702,7 +702,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -800,7 +800,7 @@ items: "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", - "legendLink": "/dashboard/file/k8s-node-rsrc-use.json", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", "step": 10 } ], @@ -909,6 +909,7 @@ items: }, "timezone": "utc", "title": "K8s / USE Method / Cluster", + "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 } kind: ConfigMap @@ -1851,6 +1852,7 @@ items: }, "timezone": "utc", "title": "K8s / USE Method / Node", + "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 } kind: ConfigMap @@ -2468,7 +2470,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -2828,7 +2830,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3000,6 +3002,7 @@ items: }, "timezone": "utc", "title": "K8s / 
Compute Resources / Cluster", + "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 } kind: ConfigMap @@ -3269,7 +3272,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3629,7 +3632,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -3828,6 +3831,7 @@ items: }, "timezone": "utc", "title": "K8s / Compute Resources / Namespace", + "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } kind: ConfigMap @@ -4683,6 +4687,7 @@ items: }, "timezone": "utc", "title": "K8s / Compute Resources / Pod", + "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 } kind: ConfigMap @@ -5609,6 +5614,7 @@ items: }, "timezone": "browser", "title": "Nodes", + "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 } kind: ConfigMap @@ -6098,6 +6104,7 @@ items: }, "timezone": "browser", "title": "Pods", + "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } kind: ConfigMap @@ -6950,6 +6957,7 @@ items: }, "timezone": "browser", "title": "StatefulSets", + "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } kind: ConfigMap diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index cca1c735..75d5f36e 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -202,21 +202,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - 
avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -268,6 +268,7 @@ spec: - alert: AlertmanagerDown annotations: message: Alertmanager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown expr: | absent(up{job="alertmanager-main"} == 1) for: 15m @@ -276,6 +277,7 @@ spec: - alert: KubeAPIDown annotations: message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown expr: | absent(up{job="apiserver"} == 1) for: 15m @@ -284,6 +286,7 @@ spec: - alert: KubeControllerManagerDown annotations: message: KubeControllerManager has disappeared from Prometheus target discovery. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown expr: | absent(up{job="kube-controller-manager"} == 1) for: 15m @@ -292,6 +295,7 @@ spec: - alert: KubeSchedulerDown annotations: message: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown expr: | absent(up{job="kube-scheduler"} == 1) for: 15m @@ -300,6 +304,7 @@ spec: - alert: KubeStateMetricsDown annotations: message: KubeStateMetrics has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown expr: | absent(up{job="kube-state-metrics"} == 1) for: 15m @@ -308,6 +313,7 @@ spec: - alert: KubeletDown annotations: message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown expr: | absent(up{job="kubelet"} == 1) for: 15m @@ -316,6 +322,7 @@ spec: - alert: NodeExporterDown annotations: message: NodeExporter has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown expr: | absent(up{job="node-exporter"} == 1) for: 15m @@ -324,6 +331,7 @@ spec: - alert: PrometheusDown annotations: message: Prometheus has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown expr: | absent(up{job="prometheus-k8s"} == 1) for: 15m @@ -332,6 +340,7 @@ spec: - alert: PrometheusOperatorDown annotations: message: PrometheusOperator has disappeared from Prometheus target discovery. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown expr: | absent(up{job="prometheus-operator"} == 1) for: 15m @@ -343,6 +352,7 @@ spec: annotations: message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} / second' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping expr: | rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 for: 1h @@ -351,6 +361,7 @@ spec: - alert: KubePodNotReady annotations: message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready expr: | sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 for: 1h @@ -360,6 +371,7 @@ spec: annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch expr: | kube_deployment_status_observed_generation{job="kube-state-metrics"} != @@ -371,6 +383,7 @@ spec: annotations: message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch expr: | kube_deployment_spec_replicas{job="kube-state-metrics"} != @@ -382,6 +395,7 @@ spec: annotations: message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch expr: | kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != @@ -393,6 +407,7 @@ 
spec: annotations: message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch expr: | kube_statefulset_status_observed_generation{job="kube-state-metrics"} != @@ -404,6 +419,7 @@ spec: annotations: message: Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}} + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck expr: | kube_daemonset_status_number_ready{job="kube-state-metrics"} / @@ -415,6 +431,7 @@ spec: annotations: message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are not scheduled. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - @@ -426,17 +443,48 @@ spec: annotations: message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are running where they are not supposed to run. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking + more than 1h to complete. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning + expr: | + time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than + 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + expr: | + kube_job_status_failed{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning - name: kubernetes-resources rules: - alert: KubeCPUOvercommit annotations: message: Overcommited CPU resource requests on Pods, cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) / @@ -450,6 +498,7 @@ spec: annotations: message: Overcommited Memory resource requests on Pods, cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) / @@ -464,6 +513,7 @@ spec: - alert: KubeCPUOvercommit annotations: message: Overcommited CPU resource request quota on Namespaces. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) / @@ -475,6 +525,7 @@ spec: - alert: KubeMemOvercommit annotations: message: Overcommited Memory resource request quota on Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) / @@ -487,6 +538,7 @@ spec: annotations: message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded expr: | 100 * kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) @@ -502,6 +554,7 @@ spec: message: The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% free. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / @@ -515,6 +568,7 @@ spec: message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is expected to fill up within four days. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0 for: 5m @@ -525,6 +579,7 @@ spec: - alert: KubeNodeNotReady annotations: message: '{{ $labels.node }} has been unready for more than an hour' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 for: 1h @@ -534,6 +589,7 @@ spec: annotations: message: There are {{ $value }} different versions of Kubernetes components running. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch expr: | count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 for: 1h @@ -543,6 +599,7 @@ spec: annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 / @@ -555,6 +612,7 @@ spec: annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 for: 15m @@ -564,6 +622,7 @@ spec: annotations: message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | kubelet_running_pod_count{job="kubelet"} > 100 for: 15m @@ -573,6 +632,7 @@ spec: annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 for: 10m @@ -582,6 +642,7 @@ spec: annotations: message: The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh expr: | cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 for: 10m @@ -590,6 +651,7 @@ spec: - alert: KubeAPIErrorsHigh annotations: message: API server is erroring for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) / @@ -600,6 +662,7 @@ spec: - alert: KubeAPIErrorsHigh annotations: message: API server is erroring for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) / @@ -610,6 +673,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: message: Kubernetes API certificate is expiring in less than 7 days. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: @@ -617,6 +681,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: message: Kubernetes API certificate is expiring in less than 1 day. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: -- GitLab