diff --git a/README.md b/README.md
index 8e6d669404eb59a2a12d64aa70eb32b7c6429f2d..b0aab96964dc22e9d10b4e5c7112a0220002dee6 100644
--- a/README.md
+++ b/README.md
@@ -217,25 +217,30 @@ local kp =
   // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') +
   // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') +
   {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };
 
-{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
 {
   ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-  for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+  for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
 } +
-// serviceMonitor is separated so that it can be created after the CRDs are ready
+// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
 { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
 { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
 { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }
 ```
 
 And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`):
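The build.sh script itself is not part of this patch, so for reference here is a minimal sketch of what such a script does, assuming `jsonnet` and `gojsontoyaml` are on `PATH` and the entry point is named `example.jsonnet` (the real build.sh in the repository may differ in details):

```
#!/usr/bin/env bash
# Sketch of a kube-prometheus-style build script (not the repository's exact
# build.sh). Assumes jsonnet and gojsontoyaml are installed.
set -euo pipefail

# Start from a clean output directory; setup/ will hold the namespace,
# the CRDs and the prometheus-operator manifests.
rm -rf manifests
mkdir -p manifests/setup

# Render the jsonnet entry point into one JSON file per manifest
# ({filename: manifest-content}), then convert each rendered file to YAML.
jsonnet -J vendor -m manifests "${1-example.jsonnet}" \
  | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml' -- {}

# Keep only the generated *.yaml files.
find manifests -type f ! -name '*.yaml' -delete
```

As the `// serviceMonitor and prometheusRule are separated ...` comment above hints, the rendered output is typically applied in two passes, so the CRDs from `manifests/setup/` are registered before the `ServiceMonitor` and `PrometheusRule` objects that depend on them:

```
./build.sh example.jsonnet
kubectl apply -f manifests/setup   # namespace, CRDs, prometheus-operator
kubectl apply -f manifests         # everything else, including the *-prometheusRule.yaml files
```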
diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md
index f9decdcd5563740b333b37f64bdce9d873496edc..b6c9f978359b6ac9dce30de46a8bf5983a5f2d38 100644
--- a/docs/developing-prometheus-rules-and-grafana-dashboards.md
+++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md
@@ -21,25 +21,30 @@ local kp =
   // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') +
   // (import 'kube-prometheus/kube-prometheus-external-metrics.libsonnet') +
   {
-    _config+:: {
-      namespace: 'monitoring',
+    values+:: {
+      common+: {
+        namespace: 'monitoring',
+      },
     },
   };
 
-{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } +
+{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } +
 {
   ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name]
-  for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator))
+  for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator))
 } +
-// serviceMonitor is separated so that it can be created after the CRDs are ready
+// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready
 { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } +
+{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } +
+{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } +
 { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
 { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +
 { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
 { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
 { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
 { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +
-{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
+{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +
+{ ['kubernetes-' + name]: kp.kubernetesMixin[name] for name in std.objectFields(kp.kubernetesMixin) }
 ```
 
 ## Prometheus rules
diff --git a/kustomization.yaml b/kustomization.yaml
index 7066018a01560f7e1e5b940a699fe58b39b5fbc9..2ebd021b9be4b6317fb4029991d9443ed42059df 100644
--- a/kustomization.yaml
+++ b/kustomization.yaml
@@ -2,6 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
 - ./manifests/alertmanager-alertmanager.yaml
+- ./manifests/alertmanager-prometheusRule.yaml
 - ./manifests/alertmanager-secret.yaml
 - ./manifests/alertmanager-service.yaml
 - ./manifests/alertmanager-serviceAccount.yaml
@@ -20,15 +21,19 @@ resources:
 - ./manifests/grafana-service.yaml
 - ./manifests/grafana-serviceAccount.yaml
 - ./manifests/grafana-serviceMonitor.yaml
+- ./manifests/kube-prometheus-prometheusRule.yaml
 - ./manifests/kube-state-metrics-clusterRole.yaml
 - ./manifests/kube-state-metrics-clusterRoleBinding.yaml
 - ./manifests/kube-state-metrics-deployment.yaml
+- ./manifests/kube-state-metrics-prometheusRule.yaml
 - ./manifests/kube-state-metrics-service.yaml
 - ./manifests/kube-state-metrics-serviceAccount.yaml
 - ./manifests/kube-state-metrics-serviceMonitor.yaml
+- ./manifests/kubernetes-prometheusRule.yaml
 - ./manifests/node-exporter-clusterRole.yaml
 - ./manifests/node-exporter-clusterRoleBinding.yaml
 - ./manifests/node-exporter-daemonset.yaml
+- ./manifests/node-exporter-prometheusRule.yaml
 - ./manifests/node-exporter-service.yaml
 - ./manifests/node-exporter-serviceAccount.yaml
 - ./manifests/node-exporter-serviceMonitor.yaml
@@ -46,13 +51,14 @@ resources:
 - ./manifests/prometheus-adapter-serviceMonitor.yaml
 - ./manifests/prometheus-clusterRole.yaml
 - ./manifests/prometheus-clusterRoleBinding.yaml
+- ./manifests/prometheus-operator-prometheusRule.yaml
 - ./manifests/prometheus-operator-serviceMonitor.yaml
 - ./manifests/prometheus-prometheus.yaml
+- ./manifests/prometheus-prometheusRule.yaml
 - ./manifests/prometheus-roleBindingConfig.yaml
 - ./manifests/prometheus-roleBindingSpecificNamespaces.yaml
 - ./manifests/prometheus-roleConfig.yaml
 - ./manifests/prometheus-roleSpecificNamespaces.yaml
-- ./manifests/prometheus-rules.yaml
 - ./manifests/prometheus-service.yaml
 - ./manifests/prometheus-serviceAccount.yaml
 -
./manifests/prometheus-serviceMonitor.yaml diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea78ad1123db69d3ab513b2c25cdd2a908e221ca --- /dev/null +++ b/manifests/alertmanager-prometheusRule.yaml @@ -0,0 +1,116 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 + prometheus: k8s + role: alert-rules + name: main-rules + namespace: monitoring +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 10m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + summary: All Alertmanager instances in a cluster failed to send notifications. + expr: | + min by (namespace,service) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + summary: Alertmanager instances within the same cluster have different configurations. 
+ expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 85d3f68d5f1a3cec0078871f6006716aea1b2b8f..d6bb77dac9ab38201cf0588dbb88745ede6610ba 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -13,7 +13,7 @@ spec: template: metadata: annotations: - checksum/grafana-dashboards: a9e19e1ab605dc374f30edda771e6917 + checksum/grafana-dashboards: b02ae450c84445cbaca8c685eefaec6c checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a labels: app: grafana diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26e7da5831d107bdfedf8f1874c6356e2c93f3ad --- /dev/null +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -0,0 +1,63 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. 
+ expr: vector(1) + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/manifests/kube-state-metrics-prometheusRule.yaml b/manifests/kube-state-metrics-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28c9ec0558a8c20f1eb3209c7762b0e5530fde84 --- /dev/null +++ b/manifests/kube-state-metrics-prometheusRule.yaml @@ -0,0 +1,40 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.9.7 + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: monitoring +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in watch operations. 
+ expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical diff --git a/manifests/prometheus-rules.yaml b/manifests/kubernetes-prometheusRule.yaml similarity index 62% rename from manifests/prometheus-rules.yaml rename to manifests/kubernetes-prometheusRule.yaml index fd56b0aa1b8209f614834704422f52af3f0829e0..d683cff6db28720100badf601d69070530dcbcbf 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/kubernetes-prometheusRule.yaml @@ -2,1338 +2,159 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: prometheus + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 2.24.0 prometheus: k8s role: alert-rules - name: prometheus-k8s-rules + name: kubernetes-rules namespace: monitoring spec: groups: - - name: node-exporter.rules + - name: kubernetes-apps rules: - - expr: | - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job="node-exporter"} + - alert: KubePodCrashLooping + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. 
+ expr: | + sum by (namespace, pod) ( + max by(namespace, pod) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} + ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( + 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) ) - ) - record: instance:node_num_cpu:sum - - expr: | - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) - ) - record: instance:node_cpu_utilisation:rate1m - - expr: | - ( - node_load1{job="node-exporter"} - / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: | - 1 - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: | - rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m - - name: kube-apiserver.rules - rules: - - expr: | + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
+ expr: | ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + for: 15m labels: - verb: read - record: apiserver_request:burnrate1d - - expr: | + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + for: 15m labels: - verb: read - record: apiserver_request:burnrate1h - - expr: | + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. 
+ expr: | ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) - ) + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + for: 15m labels: - verb: read - record: apiserver_request:burnrate2h - - expr: | + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | ( ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) - ) + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) and ( + changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) - ) - ) - + - # errors - 
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - 
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - 
expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: | - 1 - ( - ( - # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) + - ( - # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - - - ( - ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - ) + - # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d) - labels: - verb: all - record: apiserver_request:availability30d - - expr: | - 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - - - ( - # too slow - ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: | - 1 - ( - ( - # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="write"}) - labels: - verb: write - record: apiserver_request:availability30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) 
(increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) - labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - labels: - verb: write - record: 
code:apiserver_request_total:increase30d - - name: k8s.rules - rules: - - expr: | - sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - - expr: | - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - - expr: | - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - - expr: | - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - - expr: | - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: statefulset - record: 
namespace_workload_pod:kube_pod_owner:relabel - - name: kube-scheduler.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - name: node.rules - rules: - - expr: | - topk by(namespace, pod) (1, - max by (node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: | - count by (cluster, node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: | - sum( - node_memory_MemAvailable_bytes{job="node-exporter"} or - ( - node_memory_Buffers_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum - - name: kubelet.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.99" - 
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.9" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.5" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: | - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: | - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. 
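# NodeNetworkReceiveErrs and NodeNetworkTransmitErrs fire when more than 1% of packets on an
# interface were errors over the trailing 2m, sustained for 1h; NodeHighNumberConntrackEntriesUsed
# warns once the conntrack table is more than 75% full.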
- expr: | - node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure - summary: Failed device in RAID array - expr: | - node_md_disks{state="fail"} > 0 - labels: - severity: warning - - name: alertmanager.rules - rules: - - alert: AlertmanagerFailedReload - annotations: - description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload - summary: Reloading an Alertmanager configuration has failed. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: AlertmanagerMembersInconsistent - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent - summary: A member of an Alertmanager cluster has not found all other cluster members. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) - < on (namespace,service) group_left - count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) - for: 10m - labels: - severity: critical - - alert: AlertmanagerFailedToSendAlerts - annotations: - description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts - summary: An Alertmanager instance failed to send notifications. - expr: | - ( - rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts - summary: All Alertmanager instances in a cluster failed to send notifications. - expr: | - min by (namespace,service) ( - rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: critical - - alert: AlertmanagerConfigInconsistent - annotations: - description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent - summary: Alertmanager instances within the same cluster have different configurations. - expr: | - count by (namespace,service) ( - count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) - ) - != 1 - for: 20m - labels: - severity: critical - - alert: AlertmanagerClusterDown - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown - summary: Half or more of the Alertmanager instances within the same cluster are down. - expr: | - ( - count by (namespace,service) ( - avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 - ) - / - count by (namespace,service) ( - up{job="alertmanager-main",namespace="monitoring"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterCrashlooping - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping - summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. 
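# The Alertmanager cluster alerts aggregate per-instance metrics by (namespace, service) so the
# whole HA cluster is evaluated as one unit: AlertmanagerClusterDown fires when at least half of
# the instances were down for most of the last 5m (avg_over_time(up[5m]) < 0.5), and
# AlertmanagerClusterCrashlooping counts instances whose process_start_time_seconds changed more
# than 4 times in 10m, i.e. at least 5 restarts.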
- expr: | - ( - count by (namespace,service) ( - changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 - ) - / - count by (namespace,service) ( - up{job="alertmanager-main",namespace="monitoring"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: | - min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors - summary: Errors while reconciling controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. 
- expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready - summary: Prometheus operator not ready - expr: | - min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) - for: 5m - labels: - severity: warning - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: | - min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 5m - labels: - severity: warning - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping - summary: Pod is crash looping. - expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. 
- expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. 
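# The replicas-mismatch and rollout alerts in kubernetes-apps share one guard: besides the
# mismatch having to persist for 15m, each rule also requires changes(...updated...[5m]) == 0,
# so a rollout that is still actively progressing does not alert; only a stalled one does.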
- expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m + for: 15m labels: severity: warning - alert: KubeContainerWaiting @@ -1563,532 +384,984 @@ spec: for: 1h labels: severity: warning - - alert: KubePersistentVolumeErrors + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. 
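# The four KubeAPIErrorBudgetBurn alerts implement the multiwindow, multi-burn-rate pattern
# against a 1% error budget (the 0.01000 factor, i.e. a 99.0% availability objective over the
# 30d window used by the availability rules further down): a burn rate of 14.4 sustained for 1h
# consumes 2% of that budget (14.4 * 1h / 720h), 6 over 6h consumes 5%, 3 over 1d consumes 10%,
# and 1 over 3d consumes 10%. Each alert requires both its long and its short window (the
# long/short labels) to exceed the threshold, so it resolves quickly once the burn rate drops.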
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - alert: AggregatedAPIErrors + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors + summary: An aggregated API has reported errors. + expr: | + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown + summary: An aggregated API is down. + expr: | + (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown + summary: Target disappeared from Prometheus target discovery. 
+ expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(node) ( + kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 + ) > 0.95 + for: 15m + labels: + severity: warning + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. 
+ expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m + kubelet_certificate_manager_server_ttl_seconds < 86400 labels: severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch + - alert: KubeletClientCertificateRenewalErrors annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. 
expr: | - count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: warning - - alert: KubeClientErrors + - alert: KubeletServerCertificateRenewalErrors annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors - summary: Kubernetes API server client is experiencing errors. + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 + increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn + - alert: KubeletDown annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown + summary: Target disappeared from Prometheus target discovery. expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m labels: - long: 1h severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + absent(up{job="kube-scheduler"} == 1) for: 15m labels: - long: 6h severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown annotations: - description: The API server is burning too much error budget. 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h + absent(up{job="kube-controller-manager"} == 1) + for: 15m labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver + severity: critical + - name: kube-apiserver.rules rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration - summary: Client certificate is about to expire. 
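# Each apiserver_request:burnrate<window> rule estimates the share of "bad" requests in that
# window: a read (LIST|GET) counts as bad if it returned a 5xx or was slower than the per-scope
# latency target (0.1s resource-scoped, 0.5s namespace-scoped, 5s cluster-scoped), which is what
# subtracting the matching _bucket rate from the _count rate computes; writes
# (POST|PUT|PATCH|DELETE) use a single 1s target. The result is divided by the total request
# rate for the same verbs and window.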
- expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. 
The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors - summary: An aggregated API has reported errors. - expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="apiserver"} == 1) - for: 15m + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: - severity: critical - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' 
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable - summary: Node is unreachable. - expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods - summary: Kubelet is running at capacity. - expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - ) > 0.95 - for: 15m + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping - summary: Node readiness status is flapping. 
- expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m + verb: write + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m + verb: write + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. 
- expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 + verb: write + record: apiserver_request:burnrate6h + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. 
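# The code_resource:apiserver_request_total:rate5m rules pre-aggregate request rates by response
# code and resource, split into read and write verbs, and the
# cluster_quantile:apiserver_request_duration_seconds:histogram_quantile rules precompute
# 99th/90th/50th percentile API server request latency from the raw histogram buckets (the
# verb-less variants skip the log subresource and the LIST/WATCH/WATCHLIST/DELETECOLLECTION/
# PROXY/CONNECT verbs), so dashboards and alerts can reuse these series instead of re-running
# histogram_quantile over the full bucket set.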
- expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m + quantile: "0.99" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - severity: critical - - name: kubernetes-system-scheduler + quantile: "0.5" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-scheduler"} == 1) - for: 15m + - expr: | + 1 - ( + ( + # write too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + ( + # read too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + - + ( + ( + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + or + vector(0) + ) + + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + ) + + # errors + sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d) labels: - severity: critical - - name: kubernetes-system-controller-manager - rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. 
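# apiserver_request:availability30d estimates the fraction of requests over the trailing 30 days
# that were neither too slow (same per-scope latency targets as the burn-rate rules) nor 5xx,
# using increase() of the duration histograms for the latency terms and the precomputed
# code:apiserver_request_total:increase30d series for the error and total terms. The group runs
# at interval: 3m, presumably because the 30d range queries are comparatively expensive. As a
# rough check, apiserver_request:availability30d{verb="all"} can be graphed against the 99%
# objective implied by the 0.01 budget factor in the alerts above.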
- expr: | - absent(up{job="kube-controller-manager"} == 1) - for: 15m + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + - + ( + # too slow + ( + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + or + vector(0) + ) + + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d{verb="read"}) labels: - severity: critical - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - summary: Failed Prometheus configuration reload. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 - for: 10m + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) 
(increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: - severity: critical - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) + verb: write + record: code:apiserver_request_total:increase30d + - name: k8s.rules + rules: + - expr: | + sum by (cluster, namespace, pod, container) ( + rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. - expr: | - ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + - expr: | + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk 
by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - summary: Prometheus is not connected to any Alertmanagers. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - summary: Prometheus has issues reloading blocks from disk. - expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - summary: Prometheus has issues compacting blocks. - expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - summary: Prometheus is not ingesting samples. - expr: | - ( - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 - and - ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 - or - sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" ) ) - for: 10m labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - summary: Prometheus is dropping samples with duplicate timestamps. 
- expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - summary: Prometheus drops samples with out-of-order timestamps. - expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - summary: Prometheus fails to send samples to remote storage. - expr: | - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - ) - * 100 - > 1 - for: 15m + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - summary: Prometheus remote write is behind. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - > 120 - for: 15m + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - summary: Prometheus is failing rule evaluations. - expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. 
- expr: | - increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: - severity: critical - - name: general.rules + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules rules: - - alert: TargetDown - annotations: - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - message: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. 
- expr: vector(1) - labels: - severity: none - - name: node-network + - expr: | + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" - expr: | - changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) labels: - severity: warning + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/manifests/node-exporter-prometheusRule.yaml b/manifests/node-exporter-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eee95a1a5b39b87d2de04d9d994936fe47b61405 --- /dev/null +++ b/manifests/node-exporter-prometheusRule.yaml @@ -0,0 +1,266 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.0.1 + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + summary: Filesystem is predicted to run out of space within the next 24 hours. 
+ expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. 
+ summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used.' + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status[5m]) == 0 + and + node_timex_maxerror_seconds >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. 
+        summary: RAID Array is degraded
+      expr: |
+        node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+      for: 15m
+      labels:
+        severity: critical
+    - alert: NodeRAIDDiskFailure
+      annotations:
+        description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
+        summary: Failed device in RAID array
+      expr: |
+        node_md_disks{state="fail"} > 0
+      labels:
+        severity: warning
+  - name: node-exporter.rules
+    rules:
+    - expr: |
+        count without (cpu) (
+          count without (mode) (
+            node_cpu_seconds_total{job="node-exporter"}
+          )
+        )
+      record: instance:node_num_cpu:sum
+    - expr: |
+        1 - avg without (cpu, mode) (
+          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+        )
+      record: instance:node_cpu_utilisation:rate1m
+    - expr: |
+        (
+          node_load1{job="node-exporter"}
+        /
+          instance:node_num_cpu:sum{job="node-exporter"}
+        )
+      record: instance:node_load1_per_cpu:ratio
+    - expr: |
+        1 - (
+          node_memory_MemAvailable_bytes{job="node-exporter"}
+        /
+          node_memory_MemTotal_bytes{job="node-exporter"}
+        )
+      record: instance:node_memory_utilisation:ratio
+    - expr: |
+        rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
+      record: instance:node_vmstat_pgmajfault:rate1m
+    - expr: |
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+      record: instance_device:node_disk_io_time_seconds:rate1m
+    - expr: |
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_drop_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_drop_excluding_lo:rate1m
diff --git a/manifests/prometheus-operator-prometheusRule.yaml b/manifests/prometheus-operator-prometheusRule.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c1f85086ae3e62466641ca84579109965e1e9850
--- /dev/null
+++ b/manifests/prometheus-operator-prometheusRule.yaml
@@ -0,0 +1,79 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: controller
+    app.kubernetes.io/name: prometheus-operator
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 0.45.0
+    prometheus: k8s
+    role: alert-rules
+  name: prometheus-operator-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorListErrors
+      annotations:
+        description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+        summary: Errors while performing list operations in controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorWatchErrors
+      annotations:
+        description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
+        summary: Errors while performing watch operations in controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorSyncFailed
+      annotations:
+        description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
+        summary: Last controller reconciliation failed
+      expr: |
+        min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorReconcileErrors
+      annotations:
+        description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+        summary: Errors while reconciling controller.
+      expr: |
+        (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+        summary: Errors while reconciling Prometheus.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNotReady
+      annotations:
+        description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
+        summary: Prometheus operator not ready
+      expr: |
+        min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
+      for: 5m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorRejectedResources
+      annotations:
+        description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
+        summary: Resources rejected by Prometheus operator
+      expr: |
+        min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
+      for: 5m
+      labels:
+        severity: warning
diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..aa4f0ce92218f59a2f5b70e592afad11c32667f2
--- /dev/null
+++ b/manifests/prometheus-prometheusRule.yaml
@@ -0,0 +1,213 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: prometheus
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 2.24.0
+    prometheus: k8s
+    role: alert-rules
+  name: k8s-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: prometheus
+    rules:
+    - alert: PrometheusBadConfig
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
+        summary: Failed Prometheus configuration reload.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
+      for: 10m
+      labels:
+        severity: critical
+    - alert: PrometheusNotificationQueueRunningFull
+      annotations:
+        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
+        summary: Prometheus alert notification queue predicted to run full in less than 30m.
+      expr: |
+        # Without min_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
+        >
+          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
+      annotations:
+        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
+        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+      expr: |
+        (
+          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        * 100
+        > 1
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusNotConnectedToAlertmanagers
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
+        summary: Prometheus is not connected to any Alertmanagers.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBReloadsFailing
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
+        summary: Prometheus has issues reloading blocks from disk.
+      expr: |
+        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBCompactionsFailing
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
+        summary: Prometheus has issues compacting blocks.
+      expr: |
+        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusNotIngestingSamples
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
+        summary: Prometheus is not ingesting samples.
+      expr: |
+        (
+          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
+        and
+          (
+            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
+          or
+            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
+          )
+        )
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusDuplicateTimestamps
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
+        summary: Prometheus is dropping samples with duplicate timestamps.
+      expr: |
+        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOutOfOrderTimestamps
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
+        summary: Prometheus drops samples with out-of-order timestamps.
+      expr: |
+        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusRemoteStorageFailures
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
+        summary: Prometheus fails to send samples to remote storage.
+      expr: |
+        (
+          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          (
+            rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+          +
+            rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+          )
+        )
+        * 100
+        > 1
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusRemoteWriteBehind
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
+        summary: Prometheus remote write is behind.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
+        - ignoring(remote_name, url) group_right
+          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        > 120
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusRemoteWriteDesiredShards
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
+        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
+        >
+          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusRuleFailures
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
+        summary: Prometheus is failing rule evaluations.
+      expr: |
+        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PrometheusMissingRuleEvaluations
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
+        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
+      expr: |
+        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusTargetLimitHit
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
+        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
+      expr: |
+        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+      annotations:
+        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
+        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+      expr: |
+        min without (alertmanager) (
+          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        /
+          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+        * 100
+        > 3
+      for: 15m
+      labels:
+        severity: critical
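
To sanity-check the split rule manifests after applying them, something like the following should work. This is only a minimal sketch, not part of the generated manifests: it assumes `kubectl` points at the target cluster, the CRDs from `manifests/setup/` were applied first, and the default `monitoring` namespace and `prometheus-k8s` service name are unchanged; the recording rule queried comes from the node-exporter group above.

```shell
# List the PrometheusRule objects created by this change.
kubectl -n monitoring get prometheusrules

# Spot-check that one of the new recording rules is being evaluated,
# e.g. the node-exporter CPU utilisation rule.
kubectl -n monitoring port-forward svc/prometheus-k8s 9090 &
curl -s 'http://localhost:9090/api/v1/query?query=instance:node_cpu_utilisation:rate1m'
```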