From 1b7c8cdf2199b4f6f070a280b6aa7248772dd291 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk <fbranczyk@gmail.com> Date: Mon, 16 Oct 2017 15:11:53 +0200 Subject: [PATCH] *: bump Prometheus to v2.0.0-rc.1 --- assets/prometheus/rules/alertmanager.rules | 36 - .../prometheus/rules/alertmanager.rules.yaml | 33 + assets/prometheus/rules/etcd3.rules | 177 --- assets/prometheus/rules/etcd3.rules.yaml | 123 ++ assets/prometheus/rules/general.rules | 63 - assets/prometheus/rules/general.rules.yaml | 48 + assets/prometheus/rules/kube-apiserver.rules | 28 - .../rules/kube-apiserver.rules.yaml | 22 + .../rules/kube-controller-manager.rules | 11 - .../rules/kube-controller-manager.rules.yaml | 13 + assets/prometheus/rules/kube-scheduler.rules | 11 - .../rules/kube-scheduler.rules.yaml | 13 + assets/prometheus/rules/kubelet.rules | 60 - assets/prometheus/rules/kubelet.rules.yaml | 49 + assets/prometheus/rules/kubernetes.rules | 171 --- assets/prometheus/rules/kubernetes.rules.yaml | 115 ++ assets/prometheus/rules/node.rules | 43 - assets/prometheus/rules/node.rules.yaml | 37 + assets/prometheus/rules/prometheus.rules | 10 - assets/prometheus/rules/prometheus.rules.yaml | 12 + hack/scripts/generate-rules-configmap.sh | 2 +- .../prometheus/prometheus-k8s-rules.yaml | 1095 +++++++---------- manifests/prometheus/prometheus-k8s.yaml | 2 +- 23 files changed, 942 insertions(+), 1232 deletions(-) delete mode 100644 assets/prometheus/rules/alertmanager.rules create mode 100644 assets/prometheus/rules/alertmanager.rules.yaml delete mode 100644 assets/prometheus/rules/etcd3.rules create mode 100644 assets/prometheus/rules/etcd3.rules.yaml delete mode 100644 assets/prometheus/rules/general.rules create mode 100644 assets/prometheus/rules/general.rules.yaml delete mode 100644 assets/prometheus/rules/kube-apiserver.rules create mode 100644 assets/prometheus/rules/kube-apiserver.rules.yaml delete mode 100644 assets/prometheus/rules/kube-controller-manager.rules create mode 100644 assets/prometheus/rules/kube-controller-manager.rules.yaml delete mode 100644 assets/prometheus/rules/kube-scheduler.rules create mode 100644 assets/prometheus/rules/kube-scheduler.rules.yaml delete mode 100644 assets/prometheus/rules/kubelet.rules create mode 100644 assets/prometheus/rules/kubelet.rules.yaml delete mode 100644 assets/prometheus/rules/kubernetes.rules create mode 100644 assets/prometheus/rules/kubernetes.rules.yaml delete mode 100644 assets/prometheus/rules/node.rules create mode 100644 assets/prometheus/rules/node.rules.yaml delete mode 100644 assets/prometheus/rules/prometheus.rules create mode 100644 assets/prometheus/rules/prometheus.rules.yaml diff --git a/assets/prometheus/rules/alertmanager.rules b/assets/prometheus/rules/alertmanager.rules deleted file mode 100644 index 71bdc687..00000000 --- a/assets/prometheus/rules/alertmanager.rules +++ /dev/null @@ -1,36 +0,0 @@ -ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." 
- } - -ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - -ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml new file mode 100644 index 00000000..8f65c5da --- /dev/null +++ b/assets/prometheus/rules/alertmanager.rules.yaml @@ -0,0 +1,33 @@ +groups: +- name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed diff --git a/assets/prometheus/rules/etcd3.rules b/assets/prometheus/rules/etcd3.rules deleted file mode 100644 index 1b1621e4..00000000 --- a/assets/prometheus/rules/etcd3.rules +++ /dev/null @@ -1,177 +0,0 @@ -# general cluster availability - -# alert if another failed member will result in an unavailable cluster -ALERT InsufficientMembers -IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) -FOR 3m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", -} - -# etcd leader alerts -# ================== - -# alert if any etcd instance has no leader -ALERT NoLeader -IF etcd_server_has_leader{job="etcd"} == 0 -FOR 1m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", -} - -# alert if there are lots of leader changes -ALERT HighNumberOfLeaderChanges -IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", -} - -# gRPC request alerts -# =================== - -# alert if more than 1% of gRPC method calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of gRPC method calls have failed within the last 5 minutes -ALERT HighNumberOfFailedGRPCRequests -IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of gRPC method calls take more than 150ms -ALERT GRPCRequestsSlow -IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", -} - -# HTTP requests alerts -# ==================== - -# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes -ALERT 
HighNumberOfFailedHTTPRequests -IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 -FOR 5m -LABELS { - severity = "critical" -} -ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", -} - -# alert if the 99th percentile of HTTP requests take more than 150ms -ALERT HTTPRequestsSlow -IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", -} - -# etcd member communication alerts -# ================================ - -# alert if 99th percentile of round trips take 150ms -ALERT EtcdMemberCommunicationSlow -IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", -} - -# etcd proposal alerts -# ==================== - -# alert if there are several failed proposals within an hour -ALERT HighNumberOfFailedProposals -IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", -} - -# etcd disk io latency alerts -# =========================== - -# alert if 99th percentile of fsync durations is higher than 500ms -ALERT HighFsyncDurations -IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", -} - -# alert if 99th percentile of commit durations is higher than 250ms -ALERT HighCommitDurations -IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 -FOR 10m -LABELS { - severity = "warning" -} -ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", -} diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml new file mode 100644 index 00000000..ade2ed62 --- /dev/null +++ b/assets/prometheus/rules/etcd3.rules.yaml @@ -0,0 +1,123 @@ +groups: +- name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value 
}} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, 
rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations diff --git a/assets/prometheus/rules/general.rules b/assets/prometheus/rules/general.rules deleted file mode 100644 index 3500d689..00000000 --- a/assets/prometheus/rules/general.rules +++ /dev/null @@ -1,63 +0,0 @@ -### Up Alerting ### - -Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." - } - -### Dead man's switch ### - -ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - -### File descriptor alerts ### - -ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - -instance:fd_utilization = process_open_fds / process_max_fds - -# alert if file descriptors are likely to exhaust within the next 4 hours -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - -# alert if file descriptors are likely to exhaust within the next hour -ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml new file mode 100644 index 00000000..355e12f3 --- /dev/null +++ b/assets/prometheus/rules/general.rules.yaml @@ -0,0 +1,48 @@ +groups: +- name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' 
+ summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/kube-apiserver.rules b/assets/prometheus/rules/kube-apiserver.rules deleted file mode 100644 index 04b4a6de..00000000 --- a/assets/prometheus/rules/kube-apiserver.rules +++ /dev/null @@ -1,28 +0,0 @@ -ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - -# Some verbs excluded because they are expected to be long-lasting: -# WATCHLIST is long-poll, CONNECT is `kubectl exec`. -# -# apiserver_request_latencies' unit is microseconds -ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml new file mode 100644 index 00000000..55ebe025 --- /dev/null +++ b/assets/prometheus/rules/kube-apiserver.rules.yaml @@ -0,0 +1,22 @@ +groups: +- name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high diff --git a/assets/prometheus/rules/kube-controller-manager.rules b/assets/prometheus/rules/kube-controller-manager.rules deleted file mode 100644 index 3157cd12..00000000 --- a/assets/prometheus/rules/kube-controller-manager.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml new file mode 100644 index 00000000..f23bbde3 --- /dev/null +++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down diff --git a/assets/prometheus/rules/kube-scheduler.rules b/assets/prometheus/rules/kube-scheduler.rules deleted file mode 100644 index ee86017a..00000000 --- a/assets/prometheus/rules/kube-scheduler.rules +++ /dev/null @@ -1,11 +0,0 @@ -ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml new file mode 100644 index 00000000..0383b3b1 --- /dev/null +++ b/assets/prometheus/rules/kube-scheduler.rules.yaml @@ -0,0 +1,13 @@ +groups: +- name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down diff --git a/assets/prometheus/rules/kubelet.rules b/assets/prometheus/rules/kubelet.rules deleted file mode 100644 index 0d47d9d7..00000000 --- a/assets/prometheus/rules/kubelet.rules +++ /dev/null @@ -1,60 +0,0 @@ -ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - -ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - -ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - -ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - -ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml new file mode 100644 index 00000000..1aa5f84c --- /dev/null +++ b/assets/prometheus/rules/kubelet.rules.yaml @@ -0,0 +1,49 @@ +groups: +- name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules deleted file mode 100644 index 084d11e5..00000000 --- a/assets/prometheus/rules/kubernetes.rules +++ /dev/null @@ -1,171 +0,0 @@ -# NOTE: These rules were kindly contributed by the SoundCloud engineering team. - -### Container resources ### - -cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -cluster_namespace_controller_pod_container:memory_oom:rate = - sum by 
(cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - -### Cluster resources ### - -cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - -cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - -cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - -### API latency ### - -# Raw metrics are in microseconds. Convert to seconds. -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 -cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - -### Scheduling latency ### - -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 -cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml new file mode 100644 index 00000000..ab5ccf06 --- /dev/null +++ b/assets/prometheus/rules/kubernetes.rules.yaml @@ -0,0 +1,115 @@ +groups: +- name: ./kubernetes.rules + rules: + 
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, 
sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" diff --git a/assets/prometheus/rules/node.rules b/assets/prometheus/rules/node.rules deleted file mode 100644 index 4f768671..00000000 --- a/assets/prometheus/rules/node.rules +++ /dev/null @@ -1,43 +0,0 @@ -ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - -ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - -ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = 
"{{ $labels.node }} is under memory pressure.", - } - -ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml new file mode 100644 index 00000000..9c1641ca --- /dev/null +++ b/assets/prometheus/rules/node.rules.yaml @@ -0,0 +1,37 @@ +groups: +- name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. + - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' + summary: Node is under disk pressure. diff --git a/assets/prometheus/rules/prometheus.rules b/assets/prometheus/rules/prometheus.rules deleted file mode 100644 index 05c278f1..00000000 --- a/assets/prometheus/rules/prometheus.rules +++ /dev/null @@ -1,10 +0,0 @@ -ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml new file mode 100644 index 00000000..6ed0cd68 --- /dev/null +++ b/assets/prometheus/rules/prometheus.rules.yaml @@ -0,0 +1,12 @@ +groups: +- name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Prometheus configuration reload has failed diff --git a/hack/scripts/generate-rules-configmap.sh b/hack/scripts/generate-rules-configmap.sh index b8e00fef..9eb2efc8 100755 --- a/hack/scripts/generate-rules-configmap.sh +++ b/hack/scripts/generate-rules-configmap.sh @@ -11,7 +11,7 @@ metadata: data: EOF -for f in assets/prometheus/rules/*.rules +for f in assets/prometheus/rules/*.rules.yaml do echo " $(basename $f): |+" cat $f | sed "s/^/ /g" diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index de3d7787..041c127b 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -6,623 +6,478 @@ metadata: role: prometheus-rulefiles prometheus: k8s data: - alertmanager.rules: |+ - ALERT AlertmanagerConfigInconsistent - IF count_values by (service) ("config_hash", alertmanager_config_hash) - / on(service) group_left - label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "Alertmanager configurations are inconsistent", - description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync." - } - - ALERT AlertmanagerDownOrMissing - IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") - / on(job) group_right - sum by(job) (up) != 1 - FOR 5m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager down or not discovered", - description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery." - } - - ALERT FailedReload - IF alertmanager_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Alertmanager configuration reload has failed", - description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." 
- } - etcd3.rules: |+ - # general cluster availability - - # alert if another failed member will result in an unavailable cluster - ALERT InsufficientMembers - IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - FOR 3m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd cluster insufficient members", - description = "If one more etcd member goes down the cluster will be unavailable", - } - - # etcd leader alerts - # ================== - - # alert if any etcd instance has no leader - ALERT NoLeader - IF etcd_server_has_leader{job="etcd"} == 0 - FOR 1m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "etcd member has no leader", - description = "etcd member {{ $labels.instance }} has no leader", - } - - # alert if there are lots of leader changes - ALERT HighNumberOfLeaderChanges - IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of leader changes within the etcd cluster are happening", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour", - } - - # gRPC request alerts - # =================== - - # alert if more than 1% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of gRPC method calls have failed within the last 5 minutes - ALERT HighNumberOfFailedGRPCRequests - IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) - / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05 - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of gRPC requests are failing", - description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of gRPC method calls take more than 150ms - ALERT GRPCRequestsSlow - IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "slow gRPC requests", - description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow", - } - - # HTTP requests alerts - # ==================== - - # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes - ALERT HighNumberOfFailedHTTPRequests - IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m])) - / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05 - FOR 5m - 
LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "a high number of HTTP requests are failing", - description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - } - - # alert if the 99th percentile of HTTP requests take more than 150ms - ALERT HTTPRequestsSlow - IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "slow HTTP requests", - description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow", - } - - # etcd member communication alerts - # ================================ - - # alert if 99th percentile of round trips take 150ms - ALERT EtcdMemberCommunicationSlow - IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "etcd member communication is slow", - description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow", - } - - # etcd proposal alerts - # ==================== - - # alert if there are several failed proposals within an hour - ALERT HighNumberOfFailedProposals - IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "a high number of proposals within the etcd cluster are failing", - description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour", - } - - # etcd disk io latency alerts - # =========================== - - # alert if 99th percentile of fsync durations is higher than 500ms - ALERT HighFsyncDurations - IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high fsync durations", - description = "etcd instance {{ $labels.instance }} fync durations are high", - } - - # alert if 99th percentile of commit durations is higher than 250ms - ALERT HighCommitDurations - IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "high commit durations", - description = "etcd instance {{ $labels.instance }} commit durations are high", - } - general.rules: |+ - ### Up Alerting ### - - Alert TargetDown - IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Targets are down", - description = "{{ $value }}% or more of {{ $labels.job }} targets are down." 
- } - - ### Dead man's switch ### - - ALERT DeadMansSwitch - IF vector(1) - LABELS { - severity = "none", - } - ANNOTATIONS { - summary = "Alerting DeadMansSwitch", - description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.", - } - - ### File descriptor alerts ### - - ALERT TooManyOpenFileDescriptors - IF 100 * (process_open_fds / process_max_fds) > 95 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "too many open file descriptors", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.", - } - - instance:fd_utilization = process_open_fds / process_max_fds - - # alert if file descriptors are likely to exhaust within the next 4 hours - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - - # alert if file descriptors are likely to exhaust within the next hour - ALERT FdExhaustionClose - IF predict_linear(instance:fd_utilization[10m], 3600) > 1 - FOR 10m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "file descriptors soon exhausted", - description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon", - } - kube-apiserver.rules: |+ - ALERT K8SApiserverDown - IF absent(up{job="apiserver"} == 1) - FOR 5m - LABELS { - severity = "critical" - } - ANNOTATIONS { - summary = "API server unreachable", - description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.", - } - - # Some verbs excluded because they are expected to be long-lasting: - # WATCHLIST is long-poll, CONNECT is `kubectl exec`. - # - # apiserver_request_latencies' unit is microseconds - ALERT K8SApiServerLatency - IF histogram_quantile( - 0.99, - sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) - ) / 1e6 > 1.0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Kubernetes apiserver latency is high", - description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.", - } - kube-controller-manager.rules: |+ - ALERT K8SControllerManagerDown - IF absent(up{job="kube-controller-manager"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Controller manager is down", - description = "There is no running K8S controller manager. 
Deployments and replication controllers are not making progress.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager", - } - kubelet.rules: |+ - ALERT K8SNodeNotReady - IF kube_node_status_condition{condition="Ready", status="true"} == 0 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Node status is NotReady", - description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour", - } - - ALERT K8SManyNodesNotReady - IF - count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1 - AND - ( - count(kube_node_status_condition{condition="Ready", status="true"} == 0) - / - count(kube_node_status_condition{condition="Ready", status="true"}) - ) > 0.2 - FOR 1m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubernetes nodes are Not Ready", - description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).", - } - - ALERT K8SKubeletDown - IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - FOR 1h - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets.", - } - - ALERT K8SKubeletDown - IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - FOR 1h - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Many Kubelets cannot be scraped", - description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.", - } - - ALERT K8SKubeletTooManyPods - IF kubelet_running_pod_count > 100 - LABELS { - severity = "warning", - } - ANNOTATIONS { - summary = "Kubelet is close to pod limit", - description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110", - } - kubernetes.rules: |+ - # NOTE: These rules were kindly contributed by the SoundCloud engineering team. 
- - ### Container resources ### - - cluster_namespace_controller_pod_container:spec_memory_limit_bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:spec_cpu_shares = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_spec_cpu_shares{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:cpu_usage:rate = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - irate( - container_cpu_usage_seconds_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_working_set:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_working_set_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_rss:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_rss{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_cache:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_memory_cache{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:disk_usage:bytes = - sum by (cluster,namespace,controller,pod_name,container_name) ( - label_replace( - container_disk_usage_bytes{container_name!=""}, - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_pagefaults:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failures_total{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - cluster_namespace_controller_pod_container:memory_oom:rate = - sum by (cluster,namespace,controller,pod_name,container_name,scope,type) ( - label_replace( - irate( - container_memory_failcnt{container_name!=""}[5m] - ), - "controller", "$1", - "pod_name", "^(.*)-[a-z0-9]+" - ) - ) - - ### Cluster resources ### - - cluster:memory_allocation:percent = - 100 * sum by (cluster) ( - container_spec_memory_limit_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:memory_used:percent = - 100 * sum by (cluster) ( - container_memory_usage_bytes{pod_name!=""} - ) / sum by (cluster) ( - machine_memory_bytes - ) - - cluster:cpu_allocation:percent = - 100 * sum by (cluster) ( - container_spec_cpu_shares{pod_name!=""} - ) / sum by (cluster) ( - container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores - ) - - cluster:node_cpu_use:percent = - 100 * sum by (cluster) ( - rate(node_cpu{mode!="idle"}[5m]) - ) / sum by (cluster) ( - machine_cpu_cores - ) - - ### API latency ### - - # Raw metrics are in microseconds. Convert to seconds. 
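As the comment above says, apiserver_request_latencies_bucket and the scheduler latency histograms recorded below are exposed in microseconds, so each quantile rule divides by 1e6 to publish seconds; a 99th-percentile value of 250000 microseconds, for instance, becomes 0.25 s. The K8SApiServerLatency alert earlier in this patch applies the same conversion before comparing against its 1 s threshold. A minimal sketch of that conversion, with the expression copied from the alert:

    # 99th percentile request latency, converted from microseconds to seconds,
    # compared against a 1 s threshold
    histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0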
- cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile( - 0.99, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile( - 0.9, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile( - 0.5, - sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket) - ) / 1e6 - - ### Scheduling latency ### - - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6 - - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} = - histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} = - histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = - histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 - kube-scheduler.rules: |+ - ALERT K8SSchedulerDown - IF absent(up{job="kube-scheduler"} == 1) - FOR 5m - LABELS { - severity = "critical", - } - ANNOTATIONS { - summary = "Scheduler is down", - description = "There is no running K8S scheduler. 
New pods are not being assigned to nodes.", - runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler", - } - node.rules: |+ - ALERT NodeExporterDown - IF absent(up{job="node-exporter"} == 1) - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "node-exporter cannot be scraped", - description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.", - } - - ALERT K8SNodeOutOfDisk - IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1 - LABELS { - service = "k8s", - severity = "critical" - } - ANNOTATIONS { - summary = "Node ran out of disk space.", - description = "{{ $labels.node }} has run out of disk space.", - } - - ALERT K8SNodeMemoryPressure - IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under memory pressure.", - description = "{{ $labels.node }} is under memory pressure.", - } - - ALERT K8SNodeDiskPressure - IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1 - LABELS { - service = "k8s", - severity = "warning" - } - ANNOTATIONS { - summary = "Node is under disk pressure.", - description = "{{ $labels.node }} is under disk pressure.", - } - prometheus.rules: |+ - ALERT FailedReload - IF prometheus_config_last_reload_successful == 0 - FOR 10m - LABELS { - severity = "warning" - } - ANNOTATIONS { - summary = "Prometheus configuration reload has failed", - description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." - } + alertmanager.rules.yaml: |+ + groups: + - name: ./alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed + etcd3.rules.yaml: |+ + groups: + - name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ 
$labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + general.rules.yaml: |+ + groups: + - name: ./general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - alert: TooManyOpenFileDescriptors + expr: 100 * (process_open_fds / process_max_fds) > 95 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.' + summary: too many open file descriptors + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ + $labels.instance }}) instance will exhaust in file/socket descriptors soon' + summary: file descriptors soon exhausted + kube-apiserver.rules.yaml: |+ + groups: + - name: ./kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have + disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}) + WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the + kube-apiserver is higher than 1s. 
+ summary: Kubernetes apiserver latency is high + kube-controller-manager.rules.yaml: |+ + groups: + - name: ./kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kube-scheduler.rules.yaml: |+ + groups: + - name: ./kube-scheduler.rules + rules: + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down + kubelet.rules.yaml: |+ + groups: + - name: ./kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady + state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) + > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes.rules.yaml: |+ + groups: + - name: ./kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) + / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY 
(cluster) / sum(container_spec_cpu_shares{id="/"} + * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) + BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, + cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + node.rules.yaml: |+ + groups: + - name: ./node.rules + rules: + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery. + summary: node-exporter cannot be scraped + - alert: K8SNodeOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + labels: + service: k8s + severity: critical + annotations: + description: '{{ $labels.node }} has run out of disk space.' + summary: Node ran out of disk space. 
+ - alert: K8SNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under memory pressure.' + summary: Node is under memory pressure. + - alert: K8SNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + labels: + service: k8s + severity: warning + annotations: + description: '{{ $labels.node }} is under disk pressure.' + summary: Node is under disk pressure. + prometheus.rules.yaml: |+ + groups: + - name: ./prometheus.rules + rules: + - alert: FailedReload + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + summary: Prometheus configuration reload has failed diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index e936de46..168daa34 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -6,7 +6,7 @@ metadata: prometheus: k8s spec: replicas: 2 - version: v1.7.2 + version: v2.0.0-rc.1 serviceAccountName: prometheus-k8s serviceMonitorSelector: matchExpressions: -- GitLab
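The version bump above moves the prometheus-k8s custom resource to v2.0.0-rc.1, which reads only the new YAML rule-group format added in this patch; the removed *.rules files in the 1.x syntax would no longer be accepted. After rolling this out, one way to confirm the converted groups loaded and evaluate cleanly is to query Prometheus' own health metrics. A minimal sketch, assuming the rule-evaluation failure counter keeps this name in 2.0.0-rc.1 (the reload metric is the one already alerted on in prometheus.rules.yaml):

    # 1 while the last configuration/rule reload succeeded
    prometheus_config_last_reload_successful == 1

    # should report no failures once the converted groups evaluate cleanly
    # (metric name assumed for this release; verify on the running server)
    rate(prometheus_rule_evaluation_failures_total[5m]) == 0

The converted files can also be linted offline with the promtool binary shipped alongside this release, assuming its check rules subcommand is already available in the release candidate.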