diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index dec38499c044435004ecf36ae52b99d0990602f6..71ec54472c84c11fb6fded6926c871639265557b 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -146,6 +146,8 @@ local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet'; 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', ], + runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s', + cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', kubeletSelector: 'job="kubelet", metrics_path="/metrics"', kubeStateMetricsSelector: 'job="kube-state-metrics"', diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 43951b816c0dfdcc418f0c7aaf781ce615d579dd..cf1bf4ea096c62f064e315f556e1065ca2e18d73 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -785,7 +785,7 @@ spec: - alert: KubeStateMetricsListErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. expr: | (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) @@ -798,7 +798,7 @@ spec: - alert: KubeStateMetricsWatchErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. expr: | (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) @@ -813,7 +813,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours. expr: | ( @@ -829,7 +829,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 4 hours. expr: | ( @@ -845,7 +845,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace summary: Filesystem has less than 5% space left. expr: | ( @@ -859,7 +859,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace summary: Filesystem has less than 3% space left. expr: | ( @@ -873,7 +873,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: | ( @@ -889,7 +889,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: | ( @@ -905,7 +905,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles summary: Filesystem has less than 5% inodes left. expr: | ( @@ -919,7 +919,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles summary: Filesystem has less than 3% inodes left. expr: | ( @@ -933,7 +933,7 @@ spec: - alert: NodeNetworkReceiveErrs annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: | rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 @@ -943,7 +943,7 @@ spec: - alert: NodeNetworkTransmitErrs annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. expr: | rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 @@ -953,7 +953,7 @@ spec: - alert: NodeHighNumberConntrackEntriesUsed annotations: description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused summary: Number of conntrack are getting close to the limit. expr: | (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 @@ -962,7 +962,7 @@ spec: - alert: NodeTextFileCollectorScrapeError annotations: description: Node Exporter text file collector failed to scrape. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror summary: Node Exporter text file collector failed to scrape. expr: | node_textfile_scrape_error{job="node-exporter"} == 1 @@ -971,7 +971,7 @@ spec: - alert: NodeClockSkewDetected annotations: message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected summary: Clock skew detected. expr: | ( @@ -991,7 +991,7 @@ spec: - alert: NodeClockNotSynchronising annotations: message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising summary: Clock not synchronising. expr: | min_over_time(node_timex_sync_status[5m]) == 0 @@ -1003,7 +1003,7 @@ spec: - alert: NodeRAIDDegraded annotations: description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded summary: RAID Array is degraded expr: | node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 @@ -1013,7 +1013,7 @@ spec: - alert: NodeRAIDDiskFailure annotations: description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure summary: Failed device in RAID array expr: | node_md_disks{state="fail"} > 0 @@ -1024,7 +1024,7 @@ spec: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors summary: Errors while performing list operations in controller. expr: | (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 @@ -1034,7 +1034,7 @@ spec: - alert: PrometheusOperatorWatchErrors annotations: description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors summary: Errors while performing watch operations in controller. expr: | (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 @@ -1044,7 +1044,7 @@ spec: - alert: PrometheusOperatorSyncFailed annotations: description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed summary: Last controller reconciliation failed expr: | min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 @@ -1054,7 +1054,7 @@ spec: - alert: PrometheusOperatorReconcileErrors annotations: description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors summary: Errors while reconciling controller. expr: | (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 @@ -1064,7 +1064,7 @@ spec: - alert: PrometheusOperatorNodeLookupErrors annotations: description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors summary: Errors while reconciling Prometheus. expr: | rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 @@ -1074,7 +1074,7 @@ spec: - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready summary: Prometheus operator not ready expr: | min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) @@ -1084,7 +1084,7 @@ spec: - alert: PrometheusOperatorRejectedResources annotations: description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources summary: Resources rejected by Prometheus operator expr: | min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 @@ -1096,7 +1096,7 @@ spec: - alert: KubePodCrashLooping annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping summary: Pod is crash looping. expr: | rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 @@ -1106,7 +1106,7 @@ spec: - alert: KubePodNotReady annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready summary: Pod has been in a non-ready state for more than 15 minutes. expr: | sum by (namespace, pod) ( @@ -1122,7 +1122,7 @@ spec: - alert: KubeDeploymentGenerationMismatch annotations: description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch summary: Deployment generation mismatch due to possible roll-back expr: | kube_deployment_status_observed_generation{job="kube-state-metrics"} @@ -1134,7 +1134,7 @@ spec: - alert: KubeDeploymentReplicasMismatch annotations: description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch summary: Deployment has not matched the expected number of replicas. expr: | ( @@ -1152,7 +1152,7 @@ spec: - alert: KubeStatefulSetReplicasMismatch annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch summary: Deployment has not matched the expected number of replicas. expr: | ( @@ -1170,7 +1170,7 @@ spec: - alert: KubeStatefulSetGenerationMismatch annotations: description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch summary: StatefulSet generation mismatch due to possible roll-back expr: | kube_statefulset_status_observed_generation{job="kube-state-metrics"} @@ -1182,7 +1182,7 @@ spec: - alert: KubeStatefulSetUpdateNotRolledOut annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout summary: StatefulSet update has not been rolled out. expr: | ( @@ -1208,7 +1208,7 @@ spec: - alert: KubeDaemonSetRolloutStuck annotations: description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck summary: DaemonSet rollout is stuck. expr: | ( @@ -1240,7 +1240,7 @@ spec: - alert: KubeContainerWaiting annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: | sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 @@ -1250,7 +1250,7 @@ spec: - alert: KubeDaemonSetNotScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} @@ -1262,7 +1262,7 @@ spec: - alert: KubeDaemonSetMisScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 @@ -1272,7 +1272,7 @@ spec: - alert: KubeJobCompletion annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion summary: Job did not complete in time expr: | kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 @@ -1282,7 +1282,7 @@ spec: - alert: KubeJobFailed annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed summary: Job failed to complete. expr: | kube_job_failed{job="kube-state-metrics"} > 0 @@ -1292,7 +1292,7 @@ spec: - alert: KubeHpaReplicasMismatch annotations: description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch summary: HPA has not matched descired number of replicas. expr: | (kube_hpa_status_desired_replicas{job="kube-state-metrics"} @@ -1306,7 +1306,7 @@ spec: - alert: KubeHpaMaxedOut annotations: description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout summary: HPA is running at max replicas expr: | kube_hpa_status_current_replicas{job="kube-state-metrics"} @@ -1320,7 +1320,7 @@ spec: - alert: KubeCPUOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. expr: | sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) @@ -1334,7 +1334,7 @@ spec: - alert: KubeMemoryOvercommit annotations: description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit summary: Cluster has overcommitted memory resource requests. expr: | sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) @@ -1350,7 +1350,7 @@ spec: - alert: KubeCPUQuotaOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) @@ -1363,7 +1363,7 @@ spec: - alert: KubeMemoryQuotaOvercommit annotations: description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) @@ -1376,7 +1376,7 @@ spec: - alert: KubeQuotaAlmostFull annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull summary: Namespace quota is going to be full. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} @@ -1389,7 +1389,7 @@ spec: - alert: KubeQuotaFullyUsed annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused summary: Namespace quota is fully used. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} @@ -1402,7 +1402,7 @@ spec: - alert: KubeQuotaExceeded annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded summary: Namespace quota has exceeded the limits. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} @@ -1415,7 +1415,7 @@ spec: - alert: CPUThrottlingHigh annotations: description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh summary: Processes experience elevated CPU throttling. expr: | sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) @@ -1430,7 +1430,7 @@ spec: - alert: KubePersistentVolumeFillingUp annotations: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: | kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} @@ -1443,7 +1443,7 @@ spec: - alert: KubePersistentVolumeFillingUp annotations: description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: | ( @@ -1459,7 +1459,7 @@ spec: - alert: KubePersistentVolumeErrors annotations: description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. expr: | kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 @@ -1471,7 +1471,7 @@ spec: - alert: KubeVersionMismatch annotations: description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch summary: Different semantic versions of Kubernetes components running. expr: | count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 @@ -1481,7 +1481,7 @@ spec: - alert: KubeClientErrors annotations: description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors summary: Kubernetes API server client is experiencing errors. expr: | (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) @@ -1496,7 +1496,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) @@ -1510,7 +1510,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) @@ -1524,7 +1524,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) @@ -1538,7 +1538,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) @@ -1554,7 +1554,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 @@ -1563,7 +1563,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 @@ -1572,7 +1572,7 @@ spec: - alert: AggregatedAPIErrors annotations: description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors summary: An aggregated API has reported errors. expr: | sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 @@ -1581,7 +1581,7 @@ spec: - alert: AggregatedAPIDown annotations: description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown summary: An aggregated API is down. expr: | (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 @@ -1591,7 +1591,7 @@ spec: - alert: KubeAPIDown annotations: description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="apiserver"} == 1) @@ -1603,7 +1603,7 @@ spec: - alert: KubeNodeNotReady annotations: description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready summary: Node is not ready. expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 @@ -1613,7 +1613,7 @@ spec: - alert: KubeNodeUnreachable annotations: description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable summary: Node is unreachable. expr: | (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 @@ -1623,7 +1623,7 @@ spec: - alert: KubeletTooManyPods annotations: description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods summary: Kubelet is running at capacity. expr: | count by(node) ( @@ -1639,7 +1639,7 @@ spec: - alert: KubeNodeReadinessFlapping annotations: description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping summary: Node readiness status is flapping. expr: | sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 @@ -1649,7 +1649,7 @@ spec: - alert: KubeletPlegDurationHigh annotations: description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: | node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 @@ -1659,7 +1659,7 @@ spec: - alert: KubeletPodStartUpLatencyHigh annotations: description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. expr: | histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 @@ -1669,7 +1669,7 @@ spec: - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. expr: | kubelet_certificate_manager_client_ttl_seconds < 604800 @@ -1678,7 +1678,7 @@ spec: - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. expr: | kubelet_certificate_manager_client_ttl_seconds < 86400 @@ -1687,7 +1687,7 @@ spec: - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: | kubelet_certificate_manager_server_ttl_seconds < 604800 @@ -1696,7 +1696,7 @@ spec: - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: | kubelet_certificate_manager_server_ttl_seconds < 86400 @@ -1705,7 +1705,7 @@ spec: - alert: KubeletClientCertificateRenewalErrors annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. expr: | increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 @@ -1715,7 +1715,7 @@ spec: - alert: KubeletServerCertificateRenewalErrors annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. expr: | increase(kubelet_server_expiration_renew_errors[5m]) > 0 @@ -1725,7 +1725,7 @@ spec: - alert: KubeletDown annotations: description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kubelet", metrics_path="/metrics"} == 1) @@ -1737,7 +1737,7 @@ spec: - alert: KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kube-scheduler"} == 1) @@ -1749,7 +1749,7 @@ spec: - alert: KubeControllerManagerDown annotations: description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kube-controller-manager"} == 1)