diff --git a/experimental/thanos/prometheus-self.yaml b/experimental/thanos/prometheus-self.yaml
deleted file mode 100644
index e778905a2062224dd8f66d47cdb412c1ea760461..0000000000000000000000000000000000000000
--- a/experimental/thanos/prometheus-self.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: Prometheus
-metadata:
-  name: self
-  labels:
-    prometheus: self
-spec:
-  podMetadata:
-    labels:
-      thanos-peer: 'true'
-  replicas: 2
-  version: v2.2.1
-  serviceAccountName: prometheus-k8s
-  serviceMonitorSelector:
-    matchLabels:
-      app: prometheus
-  ruleSelector:
-    matchLabels:
-      role: prometheus-rulefiles
-      prometheus: k8s
-  resources:
-    requests:
-      # 2Gi is default, but won't schedule if you don't have a node with >2Gi
-      # memory. Modify based on your target and time-series count for
-      # production use. This value is mainly meant for demonstration/testing
-      # purposes.
-      memory: 400Mi
-  containers:
-  - name: thanos
-    image: improbable/thanos:latest
-    args:
-    - "sidecar"
-    - "--log.level=debug"
-    - "--cluster.peers=thanos-peers.default.svc:10900"
-    ports:
-    - name: http
-      containerPort: 10902
-    - name: grpc
-      containerPort: 10901
-    - name: cluster
-      containerPort: 10900
----
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: prometheus
-  labels:
-    app: prometheus
-spec:
-  selector:
-    matchLabels:
-      app: prometheus
-  endpoints:
-  - port: web
-    interval: 30s
----
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    app: prometheus
-    prometheus: self
-  name: prometheus-self
-spec:
-  type: NodePort
-  ports:
-  - name: web
-    nodePort: 30900
-    port: 9090
-    protocol: TCP
-    targetPort: web
-  selector:
-    prometheus: self
diff --git a/experimental/thanos/query.yaml b/experimental/thanos/query.yaml
deleted file mode 100644
index eb1d99ba2515b6e5efa20a51d43c3232aecd72b7..0000000000000000000000000000000000000000
--- a/experimental/thanos/query.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: thanos-query
-  labels:
-    app: thanos-query
-    thanos-peer: "true"
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: thanos-query
-      thanos-peer: "true"
-  template:
-    metadata:
-      labels:
-        app: thanos-query
-        thanos-peer: "true"
-    spec:
-      containers:
-      - name: thanos-query
-        image: improbable/thanos:latest
-        args:
-        - "query"
-        - "--log.level=debug"
-        - "--query.replica-label=prometheus_replica"
-        - "--cluster.peers=thanos-peers.default.svc:10900"
-        ports:
-        - name: http
-          containerPort: 10902
-        - name: grpc
-          containerPort: 10901
-        - name: cluster
-          containerPort: 10900
----
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    app: thanos-query
-  name: thanos-query
-spec:
-  type: NodePort
-  selector:
-    app: thanos-query
-  ports:
-  - port: 9090
-    protocol: TCP
-    targetPort: http
-    name: http-query
-    nodePort: 31111
\ No newline at end of file
diff --git a/experimental/thanos/thanos-peers-svc.yaml b/experimental/thanos/thanos-peers-svc.yaml
deleted file mode 100644
index afcfcfe43575b82541acb4c6c500dc95afdda5a1..0000000000000000000000000000000000000000
--- a/experimental/thanos/thanos-peers-svc.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: thanos-peers
-spec:
-  type: ClusterIP
-  clusterIP: None
-  ports:
-  - name: cluster
-    port: 10900
-    targetPort: cluster
-  selector:
-    # Useful endpoint for gathering all thanos components for common gossip cluster.
-    thanos-peer: "true"
\ No newline at end of file
diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet
index 27bc2398ed50889278070a4547df933ceed9a1fb..250e7bd7404a36c3d9319aa37764ce16db180f69 100644
--- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet
+++ b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet
@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
     namespace: 'default',

     versions+:: {
-      alertmanager: 'v0.14.0',
+      alertmanager: 'v0.15.0',
     },

     imageRepos+:: {
diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet
index 5fe1c074c8b71919ffff70cf4f7f48f9dbcd8692..c36f293b33e367478f11adcfa696b67b319c7f33 100644
--- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet
+++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet
@@ -5,8 +5,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
     namespace: 'default',

     versions+:: {
-      kubeStateMetrics: 'v1.3.0',
-      kubeRbacProxy: 'v0.3.0',
+      kubeStateMetrics: 'v1.3.1',
+      kubeRbacProxy: 'v0.3.1',
       addonResizer: '1.0',
     },

diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet
index a3e8d7a78e99cb493f76514977999e03c8e670a2..c51347a3ac7cba461e55aa33e8ebfc6a3dad552f 100644
--- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet
+++ b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet
@@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';

     versions+:: {
       nodeExporter: 'v0.15.2',
-      kubeRbacProxy: 'v0.3.0',
+      kubeRbacProxy: 'v0.3.1',
     },

     imageRepos+:: {
diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet
index f1758cf7577641f1db48bd54a86fefe01e092481..e84986f52f000b0487d7e1e2e22a80845896deb2 100644
--- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet
+++ b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet
@@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
     namespace: 'default',

     versions+:: {
-      prometheus: 'v2.2.1',
+      prometheus: 'v2.3.1',
     },

     imageRepos+:: {
diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
index f4d73c22d07ca10771c022d8f2b1b1e63e7de9dc..923344f203fa01312c2c91b3d5166a14af62a48f 100644
--- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
+++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
@@ -2672,6 +2672,77 @@ spec:
             phase:
               description: Phase represents the current phase of PersistentVolumeClaim.
               type: string
+          thanos:
+            description: ThanosSpec defines parameters for a Prometheus server within
+              a Thanos deployment.
+            properties:
+              baseImage:
+                description: Thanos base image if other than default.
+                type: string
+              gcs:
+                description: ThanosGCSSpec defines parameters for use of Google Cloud
+                  Storage (GCS) with Thanos.
+                properties:
+                  bucket:
+                    description: Google Cloud Storage bucket name for stored blocks.
+                      If empty it won't store any block inside Google Cloud Storage.
+                    type: string
+              peers:
+                description: Peers is a DNS name for Thanos to discover peers through.
+                type: string
+              s3:
+                description: ThanosS3Spec defines parameters for use of AWS Simple
+                  Storage Service (S3) with Thanos.
+                  (S3 compatible services apply as well)
+                properties:
+                  accessKey:
+                    description: SecretKeySelector selects a key of a Secret.
+                    properties:
+                      key:
+                        description: The key of the secret to select from. Must
+                          be a valid secret key.
+                        type: string
+                      name:
+                        description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must
+                          be defined
+                        type: boolean
+                    required:
+                    - key
+                  bucket:
+                    description: S3-Compatible API bucket name for stored blocks.
+                    type: string
+                  endpoint:
+                    description: S3-Compatible API endpoint for stored blocks.
+                    type: string
+                  insecure:
+                    description: Whether to use an insecure connection with an S3-Compatible
+                      API.
+                    type: boolean
+                  secretKey:
+                    description: SecretKeySelector selects a key of a Secret.
+                    properties:
+                      key:
+                        description: The key of the secret to select from. Must
+                          be a valid secret key.
+                        type: string
+                      name:
+                        description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must
+                          be defined
+                        type: boolean
+                    required:
+                    - key
+                  signatureVersion2:
+                    description: Whether to use S3 Signature Version 2; otherwise
+                      Signature Version 4 will be used.
+                    type: boolean
+              version:
+                description: Version describes the version of Thanos to use.
+                type: string
           tolerations:
             description: If specified, the pod's tolerations.
             items:
diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml
index 2a8daa8d920e1dca445d895dc6a21c747c8dd613..bdc115b9ff4bbdcd9a187f431f76fa6c243dccdb 100644
--- a/manifests/alertmanager-alertmanager.yaml
+++ b/manifests/alertmanager-alertmanager.yaml
@@ -11,4 +11,4 @@ spec:
     beta.kubernetes.io/os: linux
   replicas: 3
   serviceAccountName: alertmanager-main
-  version: v0.14.0
+  version: v0.15.0
diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index 6f04bb1f3ac6830be14538f9d6829a7708750ef4..1c9738367437d5586d4edb09f2493878d9109237 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -64,7 +64,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -150,7 +150,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -248,7 +248,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -334,7 +334,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -432,7 +432,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -518,7 +518,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -616,7 +616,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -702,7 +702,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -800,7 +800,7 @@ items:
                             "format": "time_series",
                             "intervalFactor": 2,
                             "legendFormat": "{{node}}",
-                            "legendLink": "/dashboard/file/k8s-node-rsrc-use.json",
+                            "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use",
                             "step": 10
                         }
                     ],
@@ -909,6 +909,7 @@ items:
             },
             "timezone": "utc",
             "title": "K8s / USE Method / Cluster",
+            "uid": "a6e7d1362e1ddbb79db21d5bb40d7137",
             "version": 0
         }
   kind: ConfigMap
@@ -1851,6 +1852,7 @@ items:
             },
             "timezone": "utc",
             "title": "K8s / USE Method / Node",
+            "uid": "4ac4f123aae0ff6dbaf4f4f66120033b",
             "version": 0
         }
   kind: ConfigMap
@@ -2468,7 +2470,7 @@ items:
                         "decimals": 2,
                         "link": true,
                         "linkTooltip": "Drill down",
-                        "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell",
+                        "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell",
                         "pattern": "namespace",
                         "thresholds": [

@@ -2828,7 +2830,7 @@ items:
                         "decimals": 2,
                         "link": true,
                         "linkTooltip": "Drill down",
-                        "linkUrl": "/dashboard/file/k8s-resources-namespace.json?var-datasource=$datasource&var-namespace=$__cell",
+                        "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell",
                        "pattern": "namespace",
                        "thresholds": [

@@ -3000,6 +3002,7 @@ items:
             },
             "timezone": "utc",
             "title": "K8s / Compute Resources / Cluster",
+            "uid": "efa86fd1d0c121a26444b636a3f509a8",
             "version": 0
         }
   kind: ConfigMap
@@ -3269,7 +3272,7 @@ items:
                        "decimals": 2,
                        "link": true,
                        "linkTooltip": "Drill down",
-                        "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
+                        "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
                        "pattern": "pod",
                        "thresholds": [

@@ -3629,7 +3632,7 @@ items:
                        "decimals": 2,
                        "link": true,
                        "linkTooltip": "Drill down",
-                        "linkUrl": "/dashboard/file/k8s-resources-pod.json?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
+                        "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell",
                        "pattern": "pod",
                        "thresholds": [

@@ -3828,6 +3831,7 @@ items:
             },
             "timezone": "utc",
             "title": "K8s / Compute Resources / Namespace",
+            "uid": "85a562078cdf77779eaa1add43ccec1e",
             "version": 0
         }
   kind: ConfigMap
@@ -4683,6 +4687,7 @@ items:
             },
             "timezone": "utc",
             "title": "K8s / Compute Resources / Pod",
+            "uid": "6581e46e4e5c7ba40a07646395ef7b23",
             "version": 0
         }
   kind: ConfigMap
@@ -5609,6 +5614,7 @@ items:
             },
             "timezone": "browser",
             "title": "Nodes",
+            "uid": "fa49a4706d07a042595b664c87fb33ea",
             "version": 0
         }
   kind: ConfigMap
@@ -6098,6 +6104,7 @@ items:
             },
             "timezone": "browser",
             "title": "Pods",
+            "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea",
"ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } kind: ConfigMap @@ -6950,6 +6957,7 @@ items: }, "timezone": "browser", "title": "StatefulSets", + "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } kind: ConfigMap diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index fb2a8b5f4ab9179fa66817ce7ede7a18fe11aabe..c7bb25c66f6025bce9edcf290623c0e95a305c62 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -19,7 +19,7 @@ spec: - args: - --secure-listen-address=:8443 - --upstream=http://127.0.0.1:8081/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy-main ports: - containerPort: 8443 @@ -34,7 +34,7 @@ spec: - args: - --secure-listen-address=:9443 - --upstream=http://127.0.0.1:8082/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy-self ports: - containerPort: 9443 @@ -51,7 +51,7 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.3.0 + image: quay.io/coreos/kube-state-metrics:v1.3.1 name: kube-state-metrics resources: limits: diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 92182e05d897f8d16bc5aeb36d781313165e0c4b..f7c9ebb53e6b07a59482e82d5de06f207c077198 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -38,7 +38,7 @@ spec: - args: - --secure-listen-address=:9100 - --upstream=http://127.0.0.1:9101/ - image: quay.io/coreos/kube-rbac-proxy:v0.3.0 + image: quay.io/coreos/kube-rbac-proxy:v0.3.1 name: kube-rbac-proxy ports: - containerPort: 9100 diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index b7fe9f257afb25bd40a713512432da98581b277e..9a7448b6c04c82a3a87fe653c5496cc4cb83a6fb 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -27,4 +27,4 @@ spec: matchExpressions: - key: k8s-app operator: Exists - version: v2.2.1 + version: v2.3.1 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index cca1c7354ad320302a4f0e99e72cc2859a3dfbf7..75d5f36e4d79166b2092d8cc82a4d1cdcaa2e78a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -202,21 +202,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) 
           node_namespace_pod:kube_pod_info:
         )
@@ -268,6 +268,7 @@ spec:
     - alert: AlertmanagerDown
       annotations:
         message: Alertmanager has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
       expr: |
         absent(up{job="alertmanager-main"} == 1)
       for: 15m
@@ -276,6 +277,7 @@ spec:
     - alert: KubeAPIDown
       annotations:
         message: KubeAPI has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
       expr: |
         absent(up{job="apiserver"} == 1)
       for: 15m
@@ -284,6 +286,7 @@ spec:
     - alert: KubeControllerManagerDown
       annotations:
         message: KubeControllerManager has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
       expr: |
         absent(up{job="kube-controller-manager"} == 1)
       for: 15m
@@ -292,6 +295,7 @@ spec:
     - alert: KubeSchedulerDown
       annotations:
         message: KubeScheduler has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
       expr: |
         absent(up{job="kube-scheduler"} == 1)
       for: 15m
@@ -300,6 +304,7 @@ spec:
     - alert: KubeStateMetricsDown
       annotations:
         message: KubeStateMetrics has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
       expr: |
         absent(up{job="kube-state-metrics"} == 1)
       for: 15m
@@ -308,6 +313,7 @@ spec:
     - alert: KubeletDown
       annotations:
         message: Kubelet has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
       expr: |
         absent(up{job="kubelet"} == 1)
       for: 15m
@@ -316,6 +322,7 @@ spec:
     - alert: NodeExporterDown
       annotations:
         message: NodeExporter has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
       expr: |
         absent(up{job="node-exporter"} == 1)
       for: 15m
@@ -324,6 +331,7 @@ spec:
     - alert: PrometheusDown
       annotations:
         message: Prometheus has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
       expr: |
         absent(up{job="prometheus-k8s"} == 1)
       for: 15m
@@ -332,6 +340,7 @@ spec:
     - alert: PrometheusOperatorDown
       annotations:
         message: PrometheusOperator has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
       expr: |
         absent(up{job="prometheus-operator"} == 1)
       for: 15m
@@ -343,6 +352,7 @@ spec:
       annotations:
         message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
           }}) is restarting {{ printf "%.2f" $value }} / second'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
       expr: |
         rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
       for: 1h
@@ -351,6 +361,7 @@ spec:
     - alert: KubePodNotReady
       annotations:
         message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
         sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0
       for: 1h
@@ -360,6 +371,7 @@ spec:
       annotations:
         message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
       expr: |
         kube_deployment_status_observed_generation{job="kube-state-metrics"}
           !=
@@ -371,6 +383,7 @@ spec:
       annotations:
         message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
       expr: |
         kube_deployment_spec_replicas{job="kube-state-metrics"}
           !=
@@ -382,6 +395,7 @@ spec:
       annotations:
         message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
       expr: |
         kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
           !=
@@ -393,6 +407,7 @@ spec:
       annotations:
         message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} generation
           mismatch
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
       expr: |
         kube_statefulset_status_observed_generation{job="kube-state-metrics"}
           !=
@@ -404,6 +419,7 @@ spec:
       annotations:
         message: Only {{$value}}% of desired pods scheduled and ready for daemon
           set {{$labels.namespace}}/{{$labels.daemonset}}
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
       expr: |
         kube_daemonset_status_number_ready{job="kube-state-metrics"}
           /
@@ -415,6 +431,7 @@ spec:
       annotations:
         message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
           are not scheduled.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
       expr: |
         kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
           -
@@ -426,17 +443,48 @@ spec:
       annotations:
         message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
           are running where they are not supposed to run.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
       expr: |
         kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
       for: 10m
       labels:
         severity: warning
+    - alert: KubeCronJobRunning
+      annotations:
+        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking
+          more than 1h to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
+      expr: |
+        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
+      for: 1h
+      labels:
+        severity: warning
+    - alert: KubeJobCompletion
+      annotations:
+        message: Job {{ $labels.namespace }}/{{ $labels.job }} is taking more than
+          1h to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
+      expr: |
+        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
+      for: 1h
+      labels:
+        severity: warning
+    - alert: KubeJobFailed
+      annotations:
+        message: Job {{ $labels.namespace }}/{{ $labels.job }} failed to complete.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
+      expr: |
+        kube_job_status_failed{job="kube-state-metrics"} > 0
+      for: 1h
+      labels:
+        severity: warning
   - name: kubernetes-resources
     rules:
     - alert: KubeCPUOvercommit
       annotations:
         message: Overcommitted CPU resource requests on Pods, cannot tolerate node
           failure.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
           /
@@ -450,6 +498,7 @@ spec:
       annotations:
         message: Overcommitted Memory resource requests on Pods, cannot tolerate
           node failure.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
           /
@@ -464,6 +513,7 @@ spec:
     - alert: KubeCPUOvercommit
       annotations:
         message: Overcommitted CPU resource request quota on Namespaces.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
           /
@@ -475,6 +525,7 @@ spec:
     - alert: KubeMemOvercommit
       annotations:
         message: Overcommitted Memory resource request quota on Namespaces.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
           /
@@ -487,6 +538,7 @@ spec:
       annotations:
         message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in
           namespace {{ $labels.namespace }}.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
       expr: |
         100 * kube_resourcequota{job="kube-state-metrics", type="used"}
           / ignoring(instance, job, type)
@@ -502,6 +554,7 @@ spec:
       annotations:
         message: The persistent volume claimed by {{ $labels.persistentvolumeclaim
           }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}%
           free.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
       expr: |
         100 * kubelet_volume_stats_available_bytes{job="kubelet"}
           /
@@ -515,6 +568,7 @@ spec:
       annotations:
         message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim
           }} in namespace {{ $labels.namespace }} is expected to fill up within four
           days.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
       expr: |
         predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0
       for: 5m
@@ -525,6 +579,7 @@ spec:
     - alert: KubeNodeNotReady
       annotations:
         message: '{{ $labels.node }} has been unready for more than an hour'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
       for: 1h
@@ -534,6 +589,7 @@ spec:
       annotations:
         message: There are {{ $value }} different versions of Kubernetes components
           running.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
       expr: |
         count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
       for: 1h
@@ -543,6 +599,7 @@ spec:
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
           }}' is experiencing {{ printf "%0.0f" $value }}% errors.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100
           /
@@ -555,6 +612,7 @@ spec:
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
           }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
       for: 15m
@@ -564,6 +622,7 @@ spec:
       annotations:
         message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to
           the limit of 110.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
         kubelet_running_pod_count{job="kubelet"} > 100
       for: 15m
@@ -573,6 +632,7 @@ spec:
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
           for {{$labels.verb}} {{$labels.resource}}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
         cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
       for: 10m
@@ -582,6 +642,7 @@ spec:
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
           for {{$labels.verb}} {{$labels.resource}}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
         cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
       for: 10m
@@ -590,6 +651,7 @@ spec:
     - alert: KubeAPIErrorsHigh
       annotations:
         message: API server is erroring for {{ $value }}% of requests.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
           /
@@ -600,6 +662,7 @@ spec:
     - alert: KubeAPIErrorsHigh
       annotations:
         message: API server is erroring for {{ $value }}% of requests.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
           /
@@ -610,6 +673,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: Kubernetes API certificate is expiring in less than 7 days.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
       labels:
@@ -617,6 +681,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: Kubernetes API certificate is expiring in less than 1 day.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       labels:
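
Usage sketch (illustrative only, not part of the patch): the CRD hunk above adds a `thanos` section to the Prometheus resource, replacing the hand-rolled sidecar container deleted from experimental/thanos/. A minimal Prometheus manifest exercising those fields might look like the following; the field names (`version`, `peers`, `s3.*`) come from the CRD, while the namespace, the peers Service DNS name, the Thanos version, and the bucket/endpoint/Secret values are hypothetical placeholders.

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring            # hypothetical namespace
spec:
  replicas: 2
  version: v2.3.1                  # Prometheus version bumped in this patch
  serviceAccountName: prometheus-k8s
  thanos:
    version: v0.1.0                # hypothetical Thanos release (CRD field `version`)
    peers: thanos-peers.monitoring.svc   # DNS name for gossip peer discovery (CRD field `peers`)
    s3:
      bucket: thanos-blocks        # hypothetical bucket for uploaded TSDB blocks
      endpoint: s3.us-east-1.amazonaws.com
      accessKey:                   # SecretKeySelector into a hypothetical Secret
        name: thanos-s3-credentials
        key: access-key
      secretKey:
        name: thanos-s3-credentials
        key: secret-key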