diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml index 8f65c5da6fb45d0342221ab29d0e7778603c7685..fdfdfd0f546ebd3365a0429acca6c46a3acbf700 100644 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./alertmanager.rules +- name: alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) @@ -11,7 +11,6 @@ groups: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. - summary: Alertmanager configurations are inconsistent - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -21,8 +20,7 @@ groups: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. - summary: Alertmanager down or not discovered - - alert: FailedReload + - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m labels: @@ -30,4 +28,3 @@ groups: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - summary: Alertmanager configuration reload has failed diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml index 355e12f3ac36b4b1f7217b08c934b70c260ad06a..84ce6b47fc9013df48a2eb23e958205e83f42bba 100644 --- a/assets/prometheus/rules/general.rules.yaml +++ b/assets/prometheus/rules/general.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./general.rules +- name: general.rules rules: - alert: TargetDown expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 @@ -7,7 +7,7 @@ groups: labels: severity: warning annotations: - description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of {{ $labels.job }} targets are down.' summary: Targets are down - alert: DeadMansSwitch expr: vector(1) @@ -17,32 +17,23 @@ groups: description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional. summary: Alerting DeadMansSwitch - - alert: TooManyOpenFileDescriptors - expr: 100 * (process_open_fds / process_max_fds) > 95 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
- summary: too many open file descriptors - - record: instance:fd_utilization + - record: fd_utilization expr: process_open_fds / process_max_fds - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 for: 10m labels: severity: warning annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next 4 hours' summary: file descriptors soon exhausted - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + expr: predict_linear(fd_utilization[10m], 3600) > 1 for: 10m labels: severity: critical annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next hour' summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml deleted file mode 100644 index 50982b0546567437b07f88eb93638e5b10f07583..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/kube-apiserver.rules.yaml +++ /dev/null @@ -1,22 +0,0 @@ -groups: -- name: ./kube-apiserver.rules - rules: - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape API server(s), or all API servers have - disappeared from service discovery. - summary: API server unreachable - - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) - by (le)) / 1e+06 > 1 - for: 10m - labels: - severity: warning - annotations: - description: 99th percentile Latency for {{ $labels.verb }} requests to the - kube-apiserver is higher than 1s. 
- summary: Kubernetes apiserver latency is high diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml index f23bbde3852bf376866c87d1b3ece76d54cd999b..4ea82ed1c24988c97454cf5bcb435ec4511009c7 100644 --- a/assets/prometheus/rules/kube-controller-manager.rules.yaml +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./kube-controller-manager.rules +- name: kube-controller-manager.rules rules: - alert: K8SControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml index 0383b3b18a1a23d63ead066265080ba1e5dd2691..8f0c01fd2e959c9aeea573245adf368c7be18cb6 100644 --- a/assets/prometheus/rules/kube-scheduler.rules.yaml +++ b/assets/prometheus/rules/kube-scheduler.rules.yaml @@ -1,6 +1,51 @@ groups: -- name: ./kube-scheduler.rules +- name: kube-scheduler.rules rules: + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" - alert: K8SSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 5m diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32b99fa20a194f91d0e9913ceec2039a284f4953 --- /dev/null +++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml @@ -0,0 +1,55 @@ +groups: +- name: kube-state-metrics.rules + rules: + - alert: DeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + 
for: 15m + labels: + severity: warning + annotations: + description: Observed deployment generation does not match expected one for + deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DeploymentReplicasNotUpdated + expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) + or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) + unless (kube_deployment_spec_paused == 1) + for: 15m + labels: + severity: warning + annotations: + description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + description: Only {{$value}}% of desired pods scheduled and ready for daemon + set {{$labels.namespaces}}/{{$labels.daemonset}} + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: DaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. + summary: Daemonsets are not scheduled correctly + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} + times within the last hour diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml index 03ea03da60568f563e4d0e31c34a11528a83b886..a1fc93cbaebeb65134e8e35f8a3c8b29cb390dd5 100644 --- a/assets/prometheus/rules/kubelet.rules.yaml +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -1,5 +1,5 @@ groups: -- name: ./kubelet.rules +- name: kubelet.rules rules: - alert: K8SNodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 @@ -18,20 +18,17 @@ groups: labels: severity: critical annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady - state).' - summary: Many Kubernetes nodes are Not Ready + description: '{{ $value }}% of Kubernetes nodes are not ready' - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 for: 1h labels: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets.
- summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown - expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) - > 0.1 + expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) + * 100 > 1 for: 1h labels: severity: critical @@ -41,36 +38,10 @@ groups: summary: Many Kubelets cannot be scraped - alert: K8SKubeletTooManyPods expr: kubelet_running_pod_count > 100 + for: 10m labels: severity: warning annotations: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: K8SDaemonSetsNotRunning - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not ready. - summary: Daemonsets are not ready - - alert: K8SDaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. - summary: Daemonsets are not scheduled correctly diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml index ab5ccf061ad7f4b72fdfa42a7bf1032ffb030cdf..f13d0088a34aa3a177ca303dac5d8afdae2d8ebf 100644 --- a/assets/prometheus/rules/kubernetes.rules.yaml +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -1,115 +1,86 @@ groups: -- name: ./kubernetes.rules +- name: kubernetes.rules rules: - - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:spec_cpu_shares - expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:cpu_usage:rate - expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_usage:bytes - expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_working_set:bytes - expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_rss:bytes - expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: 
cluster_namespace_controller_pod_container:memory_cache:bytes - expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:disk_usage:bytes - expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate - expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster_namespace_controller_pod_container:memory_oom:rate - expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster:memory_allocation:percent - expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) - / sum(machine_memory_bytes) BY (cluster) - - record: cluster:memory_used:percent - expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) - BY (cluster) - - record: cluster:cpu_allocation:percent - expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} - * ON(cluster, instance) machine_cpu_cores) BY (cluster) - - record: cluster:node_cpu_use:percent - expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) - BY (cluster) - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: pod_name:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (pod_name) + - record: pod_name:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: pod_name:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + BY (pod_name) + - record: pod_name:container_fs_usage_bytes:sum + expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: namespace:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) + - record: namespace:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) + - record: namespace:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) + BY (namespace) + - record: cluster:memory_usage:ratio + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:container_spec_cpu_shares:ratio + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 + / sum(machine_cpu_cores) + - record: cluster:container_cpu_usage:ratio + expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + / sum(machine_cpu_cores) + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.99, 
rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.99" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.9" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.5" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 1 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 4 + for: 10m labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 2 + for: 10m labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 5 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 20m labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 -
labels: - quantile: "0.99" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.9" - - record: cluster:scheduler_binding_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.5" + severity: critical + annotations: + description: No API servers are reachable or all have disappeared from service + discovery diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml index 9c1641caa9422b035d4bc397cb7652f1e0728a9d..0e7e1bbd3b58a994989882e7ebea589acf1a2953 100644 --- a/assets/prometheus/rules/node.rules.yaml +++ b/assets/prometheus/rules/node.rules.yaml @@ -1,6 +1,23 @@ groups: -- name: ./node.rules +- name: node.rules rules: + - record: instance:node_cpu:rate:sum + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + BY (instance) + - record: instance:node_filesystem_usage:sum + expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - record: instance:node_cpu:ratio + expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu{mode!="idle"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m @@ -8,30 +25,20 @@ groups: severity: warning annotations: description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery. - summary: node-exporter cannot be scraped - - alert: K8SNodeOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + or node-exporters have disappeared from discovery + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + for: 30m labels: - service: k8s - severity: critical - annotations: - description: '{{ $labels.node }} has run out of disk space.' - summary: Node ran out of disk space. - - alert: K8SNodeMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == - 1 - labels: - service: k8s severity: warning annotations: - description: '{{ $labels.node }} is under memory pressure.' - summary: Node is under memory pressure. - - alert: K8SNodeDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + for: 10m labels: - service: k8s - severity: warning + severity: critical annotations: - description: '{{ $labels.node }} is under disk pressure.' - summary: Node is under disk pressure.
+ description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml index 6ed0cd68629f0d2224db09d9c8cf970ac093310c..df51d0106af150aa1b67180ef83c75571fb2b85b 100644 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -1,12 +1,44 @@ groups: -- name: ./prometheus.rules +- name: prometheus.rules rules: - - alert: FailedReload + - alert: PrometheusConfigReloadFailed expr: prometheus_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: - description: Reloading Prometheus' configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - summary: Prometheus configuration reload has failed + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull + expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity + for: 10m + labels: + severity: warning + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.03 + for: 10m + labels: + severity: critical + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 10m + labels: + severity: warning + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 3e6552c1653ac7e2733526f1c920a302f83a1332..6493ff748f1dd598c5023ed6e6840970e8d2998e 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -8,7 +8,7 @@ metadata: data: alertmanager.rules.yaml: |+ groups: - - name: ./alertmanager.rules + - name: alertmanager.rules rules: - alert: AlertmanagerConfigInconsistent expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) @@ -20,7 +20,6 @@ data: annotations: description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. - summary: Alertmanager configurations are inconsistent - alert: AlertmanagerDownOrMissing expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 @@ -30,8 +29,7 @@ data: annotations: description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. 
- summary: Alertmanager down or not discovered - - alert: FailedReload + - alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful == 0 for: 10m labels: @@ -39,7 +37,6 @@ data: annotations: description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - summary: Alertmanager configuration reload has failed etcd3.rules.yaml: |+ groups: - name: ./etcd3.rules @@ -166,7 +163,7 @@ data: summary: high commit durations general.rules.yaml: |+ groups: - - name: ./general.rules + - name: general.rules rules: - alert: TargetDown expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 @@ -174,7 +171,7 @@ data: labels: severity: warning annotations: - description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + description: '{{ $value }}% of {{ $labels.job }} targets are down.' summary: Targets are down - alert: DeadMansSwitch expr: vector(1) @@ -184,61 +181,29 @@ data: description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional. summary: Alerting DeadMansSwitch - - alert: TooManyOpenFileDescriptors - expr: 100 * (process_open_fds / process_max_fds) > 95 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' - summary: too many open file descriptors - - record: instance:fd_utilization + - record: fd_utilization expr: process_open_fds / process_max_fds - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 for: 10m labels: severity: warning annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next 4 hours' summary: file descriptors soon exhausted - alert: FdExhaustionClose - expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + expr: predict_linear(fd_utilization[10m], 3600) > 1 for: 10m labels: severity: critical annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ - $labels.instance }}) instance will exhaust in file/socket descriptors soon' + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next hour' summary: file descriptors soon exhausted - kube-apiserver.rules.yaml: |+ - groups: - - name: ./kube-apiserver.rules - rules: - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape API server(s), or all API servers have - disappeared from service discovery. - summary: API server unreachable - - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) - by (le)) / 1e+06 > 1 - for: 10m - labels: - severity: warning - annotations: - description: 99th percentile Latency for {{ $labels.verb }} requests to the - kube-apiserver is higher than 1s. 
- summary: Kubernetes apiserver latency is high kube-controller-manager.rules.yaml: |+ groups: - - name: ./kube-controller-manager.rules + - name: kube-controller-manager.rules rules: - alert: K8SControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) @@ -252,8 +217,53 @@ data: summary: Controller manager is down kube-scheduler.rules.yaml: |+ groups: - - name: ./kube-scheduler.rules + - name: kube-scheduler.rules rules: + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" - alert: K8SSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 5m @@ -264,9 +274,65 @@ data: to nodes. 
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler summary: Scheduler is down + kube-state-metrics.rules.yaml: |+ + groups: + - name: kube-state-metrics.rules + rules: + - alert: DeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 15m + labels: + severity: warning + annotations: + description: Observed deployment generation does not match expected one for + deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DeploymentReplicasNotUpdated + expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) + or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) + unless (kube_deployment_spec_paused == 1) + for: 15m + labels: + severity: warning + annotations: + description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + description: Only {{$value}}% of desired pods scheduled and ready for daemon + set {{$labels.namespaces}}/{{$labels.daemonset}} + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: DaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. + summary: Daemonsets are not scheduled correctly + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}} + times within the last hour kubelet.rules.yaml: |+ groups: - - name: ./kubelet.rules + - name: kubelet.rules rules: - alert: K8SNodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 @@ -285,20 +351,17 @@ data: labels: severity: critical annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady - state).' - summary: Many Kubernetes nodes are Not Ready + description: '{{ $value }}% of Kubernetes nodes are not ready' - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 for: 1h labels: severity: warning annotations: description: Prometheus failed to scrape {{ $value }}% of kubelets.
- summary: Many Kubelets cannot be scraped - alert: K8SKubeletDown - expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) - > 0.1 + expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) + * 100 > 1 for: 1h labels: severity: critical @@ -308,159 +371,121 @@ data: summary: Many Kubelets cannot be scraped - alert: K8SKubeletTooManyPods expr: kubelet_running_pod_count > 100 + for: 10m labels: severity: warning annotations: description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 summary: Kubelet is close to pod limit - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not scheduled. - summary: Daemonsets are not scheduled correctly - - alert: K8SDaemonSetsNotRunning - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready - > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are not ready. - summary: Daemonsets are not ready - - alert: K8SDaemonSetsMissScheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: A number of daemonsets are running where they are not supposed - to run. - summary: Daemonsets are not scheduled correctly kubernetes.rules.yaml: |+ groups: - - name: ./kubernetes.rules + - name: kubernetes.rules rules: - - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:spec_cpu_shares - expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:cpu_usage:rate - expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_usage:bytes - expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_working_set:bytes - expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_rss:bytes - expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_cache:bytes - expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: 
cluster_namespace_controller_pod_container:disk_usage:bytes - expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate - expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster_namespace_controller_pod_container:memory_oom:rate - expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name, scope, type) - - record: cluster:memory_allocation:percent - expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) - / sum(machine_memory_bytes) BY (cluster) - - record: cluster:memory_used:percent - expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) - BY (cluster) - - record: cluster:cpu_allocation:percent - expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} - * ON(cluster, instance) machine_cpu_cores) BY (cluster) - - record: cluster:node_cpu_use:percent - expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) - BY (cluster) - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: pod_name:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (pod_name) + - record: pod_name:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: pod_name:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + BY (pod_name) + - record: pod_name:container_fs_usage_bytes:sum + expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: namespace:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) + - record: namespace:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) + - record: namespace:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) + BY (namespace) + - record: cluster:memory_usage:ratio + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:container_spec_cpu_shares:ratio + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 + / sum(machine_cpu_cores) + - record: cluster:container_cpu_usage:ratio + expr: rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]) + / sum(machine_cpu_cores) + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.99" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: 
apiserver_latency_seconds:quantile + expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.9" - - record: cluster_resource_verb:apiserver_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, - cluster, job, resource, verb)) / 1e+06 + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 labels: quantile: "0.5" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 - labels: - quantile: "0.99" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 1 + for: 10m labels: - quantile: "0.9" - - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 4 + for: 10m labels: - quantile: "0.5" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 2 + for: 10m labels: - quantile: "0.99" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: warning + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 5 + for: 10m labels: - quantile: "0.9" - - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) - BY (le, cluster)) / 1e+06 + severity: critical + annotations: + description: API server returns
errors for {{ $value }}% of requests + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 20m labels: - quantile: "0.5" + severity: critical + annotations: + description: No API servers are reachable or all have disappeared from service + discovery node.rules.yaml: |+ groups: - - name: ./node.rules + - name: node.rules rules: + - record: instance:node_cpu:rate:sum + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + BY (instance) + - record: instance:node_filesystem_usage:sum + expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - record: instance:node_cpu:ratio + expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu{mode!="idle"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m @@ -468,43 +493,65 @@ data: severity: warning annotations: description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery. - summary: node-exporter cannot be scraped - - alert: K8SNodeOutOfDisk - expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 - labels: - service: k8s - severity: critical - annotations: - description: '{{ $labels.node }} has run out of disk space.' - summary: Node ran out of disk space. - - alert: K8SNodeMemoryPressure - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == - 1 + or node-exporters have disappeared from discovery + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + for: 30m labels: - service: k8s severity: warning annotations: - description: '{{ $labels.node }} is under memory pressure.' - summary: Node is under memory pressure. - - alert: K8SNodeDiskPressure - expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + for: 10m labels: - service: k8s - severity: warning + severity: critical annotations: - description: '{{ $labels.node }} is under disk pressure.' - summary: Node is under disk pressure. + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) prometheus.rules.yaml: |+ groups: - - name: ./prometheus.rules + - name: prometheus.rules rules: - - alert: FailedReload + - alert: PrometheusConfigReloadFailed expr: prometheus_config_last_reload_successful == 0 for: 10m labels: severity: warning annotations: - description: Reloading Prometheus' configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}.
- summary: Prometheus configuration reload has failed + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull + expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity + for: 10m + labels: + severity: warning + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.03 + for: 10m + labels: + severity: critical + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 10m + labels: + severity: warning + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers