From 8f3b505049e1716758ea88acf41f25a836765fff Mon Sep 17 00:00:00 2001 From: Max Leonard Inden <IndenML@gmail.com> Date: Mon, 4 Jun 2018 17:13:23 +0200 Subject: [PATCH] kube-prometheus: Adjust to new Prometheus rule file CRD 89fc4e306972604eba2dcb961a6d29cc27a668ad introduced the new Prometheus rule file custom resource definition. This patch adjusts the kube-prometheus project to use the new custom resource definition. --- .../prometheus-operator.libsonnet | 4 +- .../prometheus/prometheus.libsonnet | 20 +- .../0prometheus-operator-clusterRole.yaml | 1 + .../0prometheus-operator-deployment.yaml | 2 +- manifests/prometheus-rules.yaml | 1171 +++++++++++------ 5 files changed, 809 insertions(+), 389 deletions(-) diff --git a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet index 11e9c0c0..7fce1836 100644 --- a/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/prometheus-operator/prometheus-operator.libsonnet @@ -7,7 +7,6 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; versions+:: { prometheusOperator: 'v0.19.0', configmapReloader: 'v0.0.1', - prometheusConfigReloader: 'v0.0.4', }, imageRepos+:: { @@ -52,6 +51,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; 'prometheuses/finalizers', 'alertmanagers/finalizers', 'servicemonitors', + 'rulefiles', ]) + policyRule.withVerbs(['*']); @@ -119,7 +119,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; container.withArgs([ '--kubelet-service=kube-system/kubelet', '--config-reloader-image=' + $._config.imageRepos.configmapReloader + ':' + $._config.versions.configmapReloader, - '--prometheus-config-reloader=' + $._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusConfigReloader, + '--prometheus-config-reloader=' + $._config.imageRepos.prometheusConfigReloader + ':' + $._config.versions.prometheusOperator, ]) + container.mixin.resources.withRequests({ cpu: '100m', memory: '50Mi' }) + container.mixin.resources.withLimits({ cpu: '200m', memory: '100Mi' }); diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index 09771ebe..f6503fed 100644 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -40,11 +40,21 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; service.mixin.metadata.withNamespace($._config.namespace) + service.mixin.metadata.withLabels({ prometheus: $._config.prometheus.name }), rules: - local configMap = k.core.v1.configMap; - - configMap.new('prometheus-' + $._config.prometheus.name + '-rules', ({ 'all.rules.yaml': std.manifestYamlDoc($._config.prometheus.rules) } + $._config.prometheus.renderedRules)) + - configMap.mixin.metadata.withLabels({ role: 'alert-rules', prometheus: $._config.prometheus.name }) + - configMap.mixin.metadata.withNamespace($._config.namespace), + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'RuleFile', + metadata: { + labels: { + prometheus: $._config.prometheus.name, + role: 'alert-rules', + }, + name: 'prometheus-' + $._config.prometheus.name + '-rules', + namespace: $._config.namespace, + }, + spec: { + groups: $._config.prometheus.rules.groups, + }, + }, roleBindingDefault: local roleBinding = k.rbac.v1.roleBinding; diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index 
94f5ce09..321859ca 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -17,6 +17,7 @@ rules: - prometheuses/finalizers - alertmanagers/finalizers - servicemonitors + - rulefiles verbs: - '*' - apiGroups: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 0105de22..e85bbe1f 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --kubelet-service=kube-system/kubelet - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.0.4 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.19.0 image: quay.io/coreos/prometheus-operator:v0.19.0 name: prometheus-operator ports: diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index d916ff29..8550d801 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -1,387 +1,796 @@ -apiVersion: v1 -data: - all.rules.yaml: "\"groups\": \n- \"name\": \"k8s.rules\"\n \"rules\": \n - \"expr\": - |\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\"}[5m])) - by (namespace)\n \"record\": \"namespace:container_cpu_usage_seconds_total:sum_rate\"\n - \ - \"expr\": |\n sum(container_memory_usage_bytes{job=\"kubelet\", image!=\"\"}) - by (namespace)\n \"record\": \"namespace:container_memory_usage_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", - image!=\"\"}[5m])) by (namespace, pod_name)\n * on (namespace, pod_name) - group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_cpu_usage_seconds_total:sum_rate\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(container_memory_usage_bytes{job=\"kubelet\",image!=\"\"}) - by (pod_name, namespace)\n * on (namespace, pod_name) group_left(label_name)\n - \ label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, \"pod_name\", - \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:container_memory_usage_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}) - by (namespace, pod)\n * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_memory_bytes:sum\"\n - \ - \"expr\": |\n sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"} - and on(pod) kube_pod_status_scheduled{condition=\"true\"}) by (namespace, pod)\n - \ * on (namespace, pod) group_left(label_name)\n label_replace(kube_pod_labels{job=\"kube-state-metrics\"}, - \"pod_name\", \"$1\", \"pod\", \"(.*)\")\n )\n \"record\": \"namespace_name:kube_pod_container_resource_requests_cpu_cores:sum\"\n- - \"name\": \"kube-scheduler.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": 
\"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n - \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job=\"kube-scheduler\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- - \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, - pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": - \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": - |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n - \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) - without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n - \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- - \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) - by (node))\"\n \"record\": \":kube_pod_info_node_count:\"\n - \"expr\": |\n - \ 
max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", - \"pod\", \"(.*)\")) by (node, namespace, pod)\n \"record\": \"node_namespace_pod:kube_pod_info:\"\n - \ - \"expr\": |\n count by (node) (sum by (node, cpu) (\n node_cpu{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ ))\n \"record\": \"node:node_num_cpu:sum\"\n - \"expr\": |\n 1 - - avg(rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m]))\n \"record\": - \":node_cpu_utilisation:avg1m\"\n - \"expr\": |\n 1 - avg by (node) (\n - \ rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[1m])\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:)\n \"record\": - \"node:node_cpu_utilisation:avg1m\"\n - \"expr\": |\n sum(node_load1{job=\"node-exporter\"})\n - \ /\n sum(node:node_num_cpu:sum)\n \"record\": \":node_cpu_saturation_load1:\"\n - \ - \"expr\": |\n sum by (node) (\n node_load1{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n /\n node:node_num_cpu:sum\n \"record\": \"node:node_cpu_saturation_load1:\"\n - \ - \"expr\": |\n 1 -\n sum(node_memory_MemFree{job=\"node-exporter\"} - + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n - \ /\n sum(node_memory_MemTotal{job=\"node-exporter\"})\n \"record\": - \":node_memory_utilisation:\"\n - \"expr\": |\n sum by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} - + node_memory_Cached{job=\"node-exporter\"} + node_memory_Buffers{job=\"node-exporter\"})\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_memory_bytes_available:sum\"\n - \"expr\": - |\n sum by (node) (\n node_memory_MemTotal{job=\"node-exporter\"}\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_memory_bytes_total:sum\"\n - \"expr\": |\n - \ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)\n - \ /\n scalar(sum(node:node_memory_bytes_total:sum))\n \"record\": - \"node:node_memory_utilisation:ratio\"\n - \"expr\": |\n 1e3 * sum(\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n - \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n )\n \"record\": - \":node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n 1 -\n sum - by (node) (\n (node_memory_MemFree{job=\"node-exporter\"} + node_memory_Cached{job=\"node-exporter\"} - + node_memory_Buffers{job=\"node-exporter\"})\n * on (namespace, pod) group_left(node)\n - \ node_namespace_pod:kube_pod_info:\n )\n /\n sum by (node) - (\n node_memory_MemTotal{job=\"node-exporter\"}\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": - \"node:node_memory_utilisation:\"\n - \"expr\": |\n 1 - (node:node_memory_bytes_available:sum - / node:node_memory_bytes_total:sum)\n \"record\": \"node:node_memory_utilisation_2:\"\n - \ - \"expr\": |\n 1e3 * sum by (node) (\n (rate(node_vmstat_pgpgin{job=\"node-exporter\"}[1m])\n - \ + rate(node_vmstat_pgpgout{job=\"node-exporter\"}[1m]))\n * on (namespace, - pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n )\n \"record\": - \"node:node_memory_swap_io_bytes:sum_rate\"\n - \"expr\": |\n avg(irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3)\n \"record\": \":node_disk_utilisation:avg_irate\"\n - \"expr\": |\n - \ avg by (node) (\n 
irate(node_disk_io_time_ms{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_disk_utilisation:avg_irate\"\n - \"expr\": - |\n avg(irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3)\n \"record\": \":node_disk_saturation:avg_irate\"\n - \"expr\": |\n - \ avg by (node) (\n irate(node_disk_io_time_weighted{job=\"node-exporter\",device=~\"(sd|xvd).+\"}[1m]) - / 1e3\n * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_disk_saturation:avg_irate\"\n - \"expr\": - |\n sum(irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m])) - +\n sum(irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ \"record\": \":node_net_utilisation:sum_irate\"\n - \"expr\": |\n sum - by (node) (\n (irate(node_network_receive_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]) - +\n irate(node_network_transmit_bytes{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_utilisation:sum_irate\"\n - \"expr\": - |\n sum(irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])) - +\n sum(irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ \"record\": \":node_net_saturation:sum_irate\"\n - \"expr\": |\n sum - by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) - +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n - \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n - \ \"rules\": \n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m])) - BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\": - \"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"})) - BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\": - \"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n - \ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\": - \"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m])) - WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, - cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\": - \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\": - \"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu) - BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\": - \"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\": - \n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\": - \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIDown\"\n - \ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus - target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n - \ \"for\": \"15m\"\n \"labels\": \n 
\"severity\": \"critical\"\n - - \"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\": - \"KubeControllerManager has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\": - \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeSchedulerDown\"\n - \ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from - Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\": - \"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\": - \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": - \n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n - \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n - \ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n - \ \"annotations\": \n \"message\": \"NodeExporter has disappeared from - Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus - has disappeared from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"} - == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\": - \"PrometheusOperator has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n - \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n - \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n - \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is restarting {{ printf \\\"%.2f\\\" $value }} / second\"\n \"expr\": |\n - \ rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) - > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubePodNotReady\"\n \"annotations\": \n \"message\": - \"{{ $labels.namespace }}/{{ $labels.pod }} is not ready.\"\n \"expr\": |\n - \ sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", - phase!~\"Running|Succeeded\"}) > 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": - \"critical\"\n - \"alert\": \"KubeDeploymentGenerationMismatch\"\n \"annotations\": - \n \"message\": \"Deployment {{ $labels.namespace }}/{{ $labels.deployment - }} generation mismatch\"\n \"expr\": |\n kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n - \ !=\n kube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeDeploymentReplicasMismatch\"\n \"annotations\": \n \"message\": - \"Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch\"\n - \ \"expr\": |\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n - \ !=\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeStatefulSetReplicasMismatch\"\n 
\"annotations\": \n \"message\": - \"StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica mismatch\"\n - \ \"expr\": |\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n - \ !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeStatefulSetGenerationMismatch\"\n \"annotations\": \n \"message\": - \"StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation mismatch\"\n - \ \"expr\": |\n kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n - \ !=\n kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n - \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"KubeDaemonSetRolloutStuck\"\n \"annotations\": \n \"message\": - \"Only {{$value}}% of desired pods scheduled and ready for daemon set {{$labels.namespace}}/{{$labels.daemonset}}\"\n - \ \"expr\": |\n kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n - \ /\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - * 100 < 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeDaemonSetNotScheduled\"\n \"annotations\": \n \"message\": - \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are - not scheduled.\"\n \"expr\": |\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n - \ -\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeDaemonSetMisScheduled\"\n \"annotations\": \n \"message\": - \"A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} are - running where they are not supposed to run.\"\n \"expr\": |\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- - \"name\": \"kubernetes-resources\"\n \"rules\": \n - \"alert\": \"KubeCPUOvercommit\"\n - \ \"annotations\": \n \"message\": \"Overcommited CPU resource requests - on Pods, cannot tolerate node failure.\"\n \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n - \ /\n sum(node:node_num_cpu:sum)\n >\n (count(node:node_num_cpu:sum)-1) - / count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": - \"Overcommited Memory resource requests on Pods, cannot tolerate node failure.\"\n - \ \"expr\": |\n sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n - \ /\n sum(node_memory_MemTotal)\n >\n (count(node:node_num_cpu:sum)-1)\n - \ /\n count(node:node_num_cpu:sum)\n \"for\": \"5m\"\n \"labels\": - \n \"severity\": \"warning\"\n - \"alert\": \"KubeCPUOvercommit\"\n \"annotations\": - \n \"message\": \"Overcommited CPU resource request quota on Namespaces.\"\n - \ \"expr\": |\n sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", - resource=\"requests.cpu\"})\n /\n sum(node:node_num_cpu:sum)\n > - 1.5\n \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeMemOvercommit\"\n \"annotations\": \n \"message\": - \"Overcommited Memory resource request quota on Namespaces.\"\n \"expr\": |\n - \ sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n - \ /\n 
sum(node_memory_MemTotal{job=\"node-exporter\"})\n > 1.5\n - \ \"for\": \"5m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": - \"KubeQuotaExceeded\"\n \"annotations\": \n \"message\": \"{{ printf \\\"%0.0f\\\" - $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}.\"\n - \ \"expr\": |\n 100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n - \ / ignoring(instance, job, type)\n kube_resourcequota{job=\"kube-state-metrics\", - type=\"hard\"}\n > 90\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n- \"name\": \"kubernetes-storage\"\n \"rules\": \n - \"alert\": - \"KubePersistentVolumeUsageCritical\"\n \"annotations\": \n \"message\": - \"The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace - {{ $labels.namespace }} has {{ printf \\\"%0.0f\\\" $value }}% free.\"\n \"expr\": - |\n 100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n - \ kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 3\n \"for\": - \"1m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubePersistentVolumeFullInFourDays\"\n - \ \"annotations\": \n \"message\": \"Based on recent sampling, the persistent - volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace - }} is expected to fill up within four days.\"\n \"expr\": |\n predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[1h], - 4 * 24 * 3600) < 0\n \"for\": \"5m\"\n \"labels\": \n \"severity\": - \"critical\"\n- \"name\": \"kubernetes-system\"\n \"rules\": \n - \"alert\": - \"KubeNodeNotReady\"\n \"annotations\": \n \"message\": \"{{ $labels.node - }} has been unready for more than an hour\"\n \"expr\": |\n kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} - == 0\n \"for\": \"1h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeVersionMismatch\"\n \"annotations\": \n \"message\": - \"There are {{ $value }} different versions of Kubernetes components running.\"\n - \ \"expr\": |\n count(count(kubernetes_build_info{job!=\"kube-dns\"}) by - (gitVersion)) > 1\n \"for\": \"1h\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": - \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing - {{ printf \\\"%0.0f\\\" $value }}% errors.'\"\n \"expr\": |\n sum(rate(rest_client_requests_total{code!~\"2..\"}[5m])) - by (instance, job) * 100\n /\n sum(rate(rest_client_requests_total[5m])) - by (instance, job)\n > 1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeClientErrors\"\n \"annotations\": \n \"message\": - \"Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing - {{ printf \\\"%0.0f\\\" $value }} errors / sec.'\"\n \"expr\": |\n sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m])) - by (instance, job) > 0.1\n \"for\": \"15m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeletTooManyPods\"\n \"annotations\": \n \"message\": - \"Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit - of 110.\"\n \"expr\": |\n kubelet_running_pod_count{job=\"kubelet\"} > - 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": - \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": 
|\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} - > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": - \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} - > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) - without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": - \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) - without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": - \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": - \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) - < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n - \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring - in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by - (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) - < 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n - \ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\": - \n \"description\": \"The configuration of the instances of the Alertmanager - cluster `{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration - out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"}) - BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, - \"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\": - \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n - \ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers - are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\": - \"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, - \"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT() - sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\": - \n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n - \ \"annotations\": \n \"description\": \"Reloading 
Alertmanager's configuration - has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\": - \"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"} - == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- - \"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\": - \n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n - \ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0) - BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n - \ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\": - \n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire - Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n - \ \"expr\": \"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n- - \"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\": - \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device - {{$labels.device}} on node {{$labels.instance}} is running full within the next - 24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk - is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h], - 3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": - \"device {{$labels.device}} on node {{$labels.instance}} is running full within - the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node - disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m], - 3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n- - \"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n - \ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration - has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading - Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"} - == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\": - \n \"description\": \"Prometheus' alert notification queue is running full - for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus' - alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m], - 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\": - \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusErrorSendingAlerts\"\n - \ \"annotations\": \n \"description\": \"Errors while sending alerts from - Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n - \ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\": - |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) - / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n - \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": - \"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\": - \"Errors while sending alerts from Prometheus 
{{$labels.namespace}}/{{ $labels.pod}} - to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while - sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) - / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n - \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - - \"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n - \ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is - not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected - to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"} - < 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures - over the last four hours.\"\n \"summary\": \"Prometheus has issues reloading - data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h]) - > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction - failures over the last four hours.\"\n \"summary\": \"Prometheus has issues - compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h]) - > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\": - \"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n - \ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\": - |\n tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\": - \"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n - \ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace - }}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus - isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m]) - <= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\": - \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate - timestamps but different values\"\n \"summary\": \"Prometheus has many samples - rejected\"\n \"expr\": |\n increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m]) - > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"" -kind: ConfigMap +apiVersion: monitoring.coreos.com/v1 +kind: RuleFile metadata: labels: prometheus: k8s role: alert-rules name: prometheus-k8s-rules namespace: monitoring +spec: + groups: + - name: k8s.rules + rules: + - expr: | + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + 
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" 
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - name: kube-apiserver.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: | + max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (node) (sum by (node, cpu) ( + node_cpu{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + 1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilisation:avg1m + - expr: | + 1 - avg by (node) ( + rate(node_cpu{job="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilisation:avg1m + - expr: | + sum(node_load1{job="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: | + sum by (node) ( + node_load1{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: | + 1 - + sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + / + sum(node_memory_MemTotal{job="node-exporter"}) + record: ':node_memory_utilisation:' + - expr: | + sum by (node) ( + (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: | + sum by (node) ( + node_memory_MemTotal{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:node_memory_utilisation:ratio + - expr: | + 1e3 * sum( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: | + 1 - + sum by (node) ( + (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + * on (namespace, pod) group_left(node) + 
node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilisation:' + - expr: | + 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilisation_2:' + - expr: | + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: | + avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + record: :node_disk_utilisation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilisation:avg_irate + - expr: | + avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + record: :node_disk_saturation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: | + sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + record: :node_net_utilisation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilisation:sum_irate + - expr: | + sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + record: :node_net_saturation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) + / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kubernetes-absent + rules: + - alert: AlertmanagerDown + annotations: + message: Alertmanager has 
disappeared from Prometheus target discovery. + expr: | + absent(up{job="alertmanager-main"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeControllerManagerDown + annotations: + message: KubeControllerManager has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeSchedulerDown + annotations: + message: KubeScheduler has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsDown + annotations: + message: KubeStateMetrics has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-state-metrics"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + - alert: NodeExporterDown + annotations: + message: NodeExporter has disappeared from Prometheus target discovery. + expr: | + absent(up{job="node-exporter"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + expr: | + absent(up{job="prometheus-k8s"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusOperatorDown + annotations: + message: PrometheusOperator has disappeared from Prometheus target discovery. + expr: | + absent(up{job="prometheus-operator"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} / second' + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.' 
+ expr: | + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation + mismatch + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica + mismatch + expr: | + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica + mismatch + expr: | + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} generation + mismatch + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{$value}}% of desired pods scheduled and ready for daemon set + {{$labels.namespace}}/{{$labels.daemonset}} + expr: | + kube_daemonset_status_number_ready{job="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} + are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} + are running where they are not supposed to run. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Overcommitted CPU resource requests on Pods, cannot tolerate node + failure. + expr: | + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Overcommitted Memory resource requests on Pods, cannot tolerate node + failure. + expr: | + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Overcommitted CPU resource request quota on Namespaces.
+ expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Overcommitted Memory resource request quota on Namespaces. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + / + sum(node_memory_MemTotal{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in + namespace {{ $labels.namespace }}.' + expr: | + 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + kube_resourcequota{job="kube-state-metrics", type="hard"} + > 90 + for: 15m + labels: + severity: warning + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The persistent volume claimed by {{ $labels.persistentvolumeclaim + }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}% + free. + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim + }} in namespace {{ $labels.namespace }} is expected to fill up within four + days. + expr: | + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{ $labels.node }} has been unready for more than an hour' + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{ $value }} different versions of Kubernetes components + running. + expr: | + count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }}% errors. + expr: | + sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100 + / + sum(rate(rest_client_requests_total[5m])) by (instance, job) + > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }} errors / sec. + expr: | + sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to + the limit of 110. + expr: | + kubelet_running_pod_count{job="kubelet"} > 100 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}}.
+ expr: |
+ cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has a 99th percentile latency of {{ $value }} seconds
+ for {{$labels.verb}} {{$labels.resource}}.
+ expr: |
+ cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is erroring for {{ $value }}% of requests.
+ expr: |
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is erroring for {{ $value }}% of requests.
+ expr: |
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: Kubernetes API certificate is expiring in less than 7 days.
+ expr: |
+ histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: Kubernetes API certificate is expiring in less than 1 day.
+ expr: |
+ histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+ labels:
+ severity: critical
+ - name: alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ annotations:
+ description: The configurations of the instances of the Alertmanager cluster
+ `{{$labels.service}}` are out of sync.
+ summary: Configuration out of sync
+ expr: |
+ count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: AlertmanagerDownOrMissing
+ annotations:
+ description: An unexpected number of Alertmanagers are scraped or Alertmanagers
+ disappeared from discovery.
+ summary: Alertmanager down or missing
+ expr: |
+ label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1
+ for: 5m
+ labels:
+ severity: warning
+ - alert: AlertmanagerFailedReload
+ annotations:
+ description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
+ }}/{{ $labels.pod}}.
+ summary: Alertmanager's configuration reload failed
+ expr: |
+ alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - name: general.rules
+ rules:
+ - alert: TargetDown
+ annotations:
+ description: '{{ $value }}% of {{ $labels.job }} targets are down.'
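+ # NOTE: in the expression below, count(up == 0) BY (job) only returns
+ # series for jobs with at least one down target, so healthy jobs drop out
+ # of the ratio entirely and the alert fires only once more than 10% of a
+ # job's targets are down.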
+ summary: Targets are down
+ expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ - alert: DeadMansSwitch
+ annotations:
+ description: This is a DeadMansSwitch meant to ensure that the entire Alerting
+ pipeline is functional.
+ summary: Alerting DeadMansSwitch
+ expr: vector(1)
+ labels:
+ severity: none
+ - name: kube-prometheus-node-alerting.rules
+ rules:
+ - alert: NodeDiskRunningFull
+ annotations:
+ description: device {{$labels.device}} on node {{$labels.instance}} is running
+ full within the next 24 hours (mounted at {{$labels.mountpoint}})
+ summary: Node disk is running full within 24 hours
+ expr: |
+ predict_linear(node_filesystem_free{job="node-exporter"}[6h], 3600 * 24) < 0
+ for: 30m
+ labels:
+ severity: warning
+ - alert: NodeDiskRunningFull
+ annotations:
+ description: device {{$labels.device}} on node {{$labels.instance}} is running
+ full within the next 2 hours (mounted at {{$labels.mountpoint}})
+ summary: Node disk is running full within 2 hours
+ expr: |
+ predict_linear(node_filesystem_free{job="node-exporter"}[30m], 3600 * 2) < 0
+ for: 10m
+ labels:
+ severity: critical
+ - name: prometheus.rules
+ rules:
+ - alert: PrometheusConfigReloadFailed
+ annotations:
+ description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+ summary: Reloading Prometheus' configuration failed
+ expr: |
+ prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusNotificationQueueRunningFull
+ annotations:
+ description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+ $labels.pod}}
+ summary: Prometheus' alert notification queue is running full
+ expr: |
+ predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+ $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+ summary: Errors while sending alerts from Prometheus
+ expr: |
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+ $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+ summary: Errors while sending alerts from Prometheus
+ expr: |
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
+ for: 10m
+ labels:
+ severity: critical
+ - alert: PrometheusNotConnectedToAlertmanagers
+ annotations:
+ description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+ to any Alertmanagers
+ summary: Prometheus is not connected to any Alertmanagers
+ expr: |
+ prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBReloadsFailing
+ annotations:
+ description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+ reload failures over the last two hours.'
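+ # NOTE: increase(...[2h]) > 0 in the expression below catches even a single
+ # reload failure within the window; the 12h "for" clause then requires the
+ # condition to hold continuously before the alert fires.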
+ summary: Prometheus has issues reloading data blocks from disk
+ expr: |
+ increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBCompactionsFailing
+ annotations:
+ description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+ compaction failures over the last two hours.'
+ summary: Prometheus has issues compacting sample blocks
+ expr: |
+ increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBWALCorruptions
+ annotations:
+ description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
+ log (WAL).'
+ summary: Prometheus write-ahead log is corrupted
+ expr: |
+ tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusNotIngestingSamples
+ annotations:
+ description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
+ samples.
+ summary: Prometheus isn't ingesting samples
+ expr: |
+ rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTargetScrapesDuplicate
+ annotations:
+ description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
+ due to duplicate timestamps but different values'
+ summary: Prometheus has many samples rejected
+ expr: |
+ increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
-- GitLab