diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml
index 8f65c5da6fb45d0342221ab29d0e7778603c7685..fdfdfd0f546ebd3365a0429acca6c46a3acbf700 100644
--- a/assets/prometheus/rules/alertmanager.rules.yaml
+++ b/assets/prometheus/rules/alertmanager.rules.yaml
@@ -1,5 +1,5 @@
 groups:
-- name: ./alertmanager.rules
+- name: alertmanager.rules
   rules:
   - alert: AlertmanagerConfigInconsistent
     expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
@@ -11,7 +11,6 @@ groups:
     annotations:
       description: The configuration of the instances of the Alertmanager cluster
         `{{$labels.service}}` are out of sync.
-      summary: Alertmanager configurations are inconsistent
   - alert: AlertmanagerDownOrMissing
     expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
       "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -21,8 +20,7 @@ groups:
     annotations:
       description: An unexpected number of Alertmanagers are scraped or Alertmanagers
         disappeared from discovery.
-      summary: Alertmanager down or not discovered
-  - alert: FailedReload
+  - alert: AlertmanagerFailedReload
     expr: alertmanager_config_last_reload_successful == 0
     for: 10m
     labels:
@@ -30,4 +28,3 @@ groups:
     annotations:
       description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
         }}/{{ $labels.pod}}.
-      summary: Alertmanager configuration reload has failed
diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml
index 355e12f3ac36b4b1f7217b08c934b70c260ad06a..84ce6b47fc9013df48a2eb23e958205e83f42bba 100644
--- a/assets/prometheus/rules/general.rules.yaml
+++ b/assets/prometheus/rules/general.rules.yaml
@@ -1,5 +1,5 @@
 groups:
-- name: ./general.rules
+- name: general.rules
   rules:
   - alert: TargetDown
     expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
@@ -7,7 +7,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
+      description: '{{ $value }}% of {{ $labels.job }} targets are down.'
       summary: Targets are down
   - alert: DeadMansSwitch
     expr: vector(1)
@@ -17,32 +17,23 @@ groups:
       description: This is a DeadMansSwitch meant to ensure that the entire Alerting
         pipeline is functional.
       summary: Alerting DeadMansSwitch
-  - alert: TooManyOpenFileDescriptors
-    expr: 100 * (process_open_fds / process_max_fds) > 95
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-        $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
-      summary: too many open file descriptors
-  - record: instance:fd_utilization
+  - record: fd_utilization
     expr: process_open_fds / process_max_fds
   - alert: FdExhaustionClose
-    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+    expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
     for: 10m
     labels:
       severity: warning
     annotations:
-      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+        will exhaust its file/socket descriptors within the next 4 hours'
       summary: file descriptors soon exhausted
   - alert: FdExhaustionClose
-    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+    expr: predict_linear(fd_utilization[10m], 3600) > 1
     for: 10m
     labels:
       severity: critical
     annotations:
-      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+        will exhaust its file/socket descriptors within the next hour'
       summary: file descriptors soon exhausted
diff --git a/assets/prometheus/rules/kube-apiserver.rules.yaml b/assets/prometheus/rules/kube-apiserver.rules.yaml
deleted file mode 100644
index 50982b0546567437b07f88eb93638e5b10f07583..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kube-apiserver.rules.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-groups:
-- name: ./kube-apiserver.rules
-  rules:
-  - alert: K8SApiserverDown
-    expr: absent(up{job="apiserver"} == 1)
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      description: Prometheus failed to scrape API server(s), or all API servers have
-        disappeared from service discovery.
-      summary: API server unreachable
-  - alert: K8SApiServerLatency
-    expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m]))
-       by (le)) / 1e+06 > 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: 99th percentile Latency for {{ $labels.verb }} requests to the
-        kube-apiserver is higher than 1s.
-      summary: Kubernetes apiserver latency is high
diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml
index f23bbde3852bf376866c87d1b3ece76d54cd999b..4ea82ed1c24988c97454cf5bcb435ec4511009c7 100644
--- a/assets/prometheus/rules/kube-controller-manager.rules.yaml
+++ b/assets/prometheus/rules/kube-controller-manager.rules.yaml
@@ -1,5 +1,5 @@
 groups:
-- name: ./kube-controller-manager.rules
+- name: kube-controller-manager.rules
   rules:
   - alert: K8SControllerManagerDown
     expr: absent(up{job="kube-controller-manager"} == 1)
diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml
index 0383b3b18a1a23d63ead066265080ba1e5dd2691..8f0c01fd2e959c9aeea573245adf368c7be18cb6 100644
--- a/assets/prometheus/rules/kube-scheduler.rules.yaml
+++ b/assets/prometheus/rules/kube-scheduler.rules.yaml
@@ -1,6 +1,51 @@
 groups:
-- name: ./kube-scheduler.rules
+- name: kube-scheduler.rules
   rules:
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
   - alert: K8SSchedulerDown
     expr: absent(up{job="kube-scheduler"} == 1)
     for: 5m
diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..32b99fa20a194f91d0e9913ceec2039a284f4953
--- /dev/null
+++ b/assets/prometheus/rules/kube-state-metrics.rules.yaml
@@ -0,0 +1,55 @@
+groups:
+- name: kube-state-metrics.rules
+  rules:
+  - alert: DeploymentGenerationMismatch
+    expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Observed deployment generation does not match expected one for
+        deployment {{$labels.namespaces}}/{{$labels.deployment}}
+  - alert: DeploymentReplicasNotUpdated
+    expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+      or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+      unless (kube_deployment_spec_paused == 1)
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+  - alert: DaemonSetRolloutStuck
+    expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+      * 100 < 100
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Only {{$value}}% of desired pods scheduled and ready for daemon
+        set {{$labels.namespaces}}/{{$labels.daemonset}}
+  - alert: K8SDaemonSetsNotScheduled
+    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+      > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: A number of daemonsets are not scheduled.
+      summary: Daemonsets are not scheduled correctly
+  - alert: DaemonSetsMissScheduled
+    expr: kube_daemonset_status_number_misscheduled > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: A number of daemonsets are running where they are not supposed
+        to run.
+      summary: Daemonsets are not scheduled correctly
+  - alert: PodFrequentlyRestarting
+    expr: increase(kube_pod_container_status_restarts[1h]) > 5
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
+        times within the last hour
diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml
index 03ea03da60568f563e4d0e31c34a11528a83b886..a1fc93cbaebeb65134e8e35f8a3c8b29cb390dd5 100644
--- a/assets/prometheus/rules/kubelet.rules.yaml
+++ b/assets/prometheus/rules/kubelet.rules.yaml
@@ -1,5 +1,5 @@
 groups:
-- name: ./kubelet.rules
+- name: kubelet.rules
   rules:
   - alert: K8SNodeNotReady
     expr: kube_node_status_condition{condition="Ready",status="true"} == 0
@@ -18,20 +18,17 @@ groups:
     labels:
       severity: critical
     annotations:
-      description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
-        state).'
-      summary: Many Kubernetes nodes are Not Ready
+      description: '{{ $value }}% of Kubernetes nodes are not ready'
   - alert: K8SKubeletDown
-    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
+    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
     for: 1h
     labels:
       severity: warning
     annotations:
       description: Prometheus failed to scrape {{ $value }}% of kubelets.
-      summary: Many Kubelets cannot be scraped
   - alert: K8SKubeletDown
-    expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
-      > 0.1
+    expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
+      * 100 > 1
     for: 1h
     labels:
       severity: critical
@@ -41,36 +38,10 @@ groups:
       summary: Many Kubelets cannot be scraped
   - alert: K8SKubeletTooManyPods
     expr: kubelet_running_pod_count > 100
+    for: 10m
     labels:
       severity: warning
     annotations:
       description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
         to the limit of 110
       summary: Kubelet is close to pod limit
-  - alert: K8SDaemonSetsNotScheduled
-    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
-      > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: A number of daemonsets are not scheduled.
-      summary: Daemonsets are not scheduled correctly
-  - alert: K8SDaemonSetsNotRunning
-    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready
-      > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: A number of daemonsets are not ready.
-      summary: Daemonsets are not ready
-  - alert: K8SDaemonSetsMissScheduled
-    expr: kube_daemonset_status_number_misscheduled > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: A number of daemonsets are running where they are not supposed
-        to run.
-      summary: Daemonsets are not scheduled correctly
diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml
index ab5ccf061ad7f4b72fdfa42a7bf1032ffb030cdf..f13d0088a34aa3a177ca303dac5d8afdae2d8ebf 100644
--- a/assets/prometheus/rules/kubernetes.rules.yaml
+++ b/assets/prometheus/rules/kubernetes.rules.yaml
@@ -1,115 +1,86 @@
 groups:
-- name: ./kubernetes.rules
+- name: kubernetes.rules
   rules:
-  - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
-    expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
-      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-      controller, pod_name, container_name)
-  - record: cluster_namespace_controller_pod_container:spec_cpu_shares
-    expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
-      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-      container_name)
-  - record: cluster_namespace_controller_pod_container:cpu_usage:rate
-    expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
-      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-      controller, pod_name, container_name)
-  - record: cluster_namespace_controller_pod_container:memory_usage:bytes
-    expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
-      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-      container_name)
-  - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
-    expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
-      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-      controller, pod_name, container_name)
-  - record: cluster_namespace_controller_pod_container:memory_rss:bytes
-    expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
-      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-      container_name)
-  - record: cluster_namespace_controller_pod_container:memory_cache:bytes
-    expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
-      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-      container_name)
-  - record: cluster_namespace_controller_pod_container:disk_usage:bytes
-    expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
-      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-      container_name)
-  - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
-    expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
-      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-      controller, pod_name, container_name, scope, type)
-  - record: cluster_namespace_controller_pod_container:memory_oom:rate
-    expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
-      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-      controller, pod_name, container_name, scope, type)
-  - record: cluster:memory_allocation:percent
-    expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
-      / sum(machine_memory_bytes) BY (cluster)
-  - record: cluster:memory_used:percent
-    expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
-      BY (cluster)
-  - record: cluster:cpu_allocation:percent
-    expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
-      * ON(cluster, instance) machine_cpu_cores) BY (cluster)
-  - record: cluster:node_cpu_use:percent
-    expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
-      BY (cluster)
-  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
-      cluster, job, resource, verb)) / 1e+06
+  - record: pod_name:container_memory_usage_bytes:sum
+    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+      (pod_name)
+  - record: pod_name:container_spec_cpu_shares:sum
+    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+  - record: pod_name:container_cpu_usage:sum
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+      BY (pod_name)
+  - record: pod_name:container_fs_usage_bytes:sum
+    expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+  - record: namespace:container_memory_usage_bytes:sum
+    expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+  - record: namespace:container_spec_cpu_shares:sum
+    expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+  - record: namespace:container_cpu_usage:sum
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+      BY (namespace)
+  - record: cluster:memory_usage:ratio
+    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+      (cluster) / sum(machine_memory_bytes) BY (cluster)
+  - record: cluster:container_spec_cpu_shares:ratio
+    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+      / sum(machine_cpu_cores)
+  - record: cluster:container_cpu_usage:ratio
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+      / sum(machine_cpu_cores)
+  - record: apiserver_latency_seconds:quantile
+    expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
     labels:
       quantile: "0.99"
-  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-    expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
-      cluster, job, resource, verb)) / 1e+06
+  - record: apiserver_latency_seconds:quantile
+    expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
     labels:
       quantile: "0.9"
-  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-    expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
-      cluster, job, resource, verb)) / 1e+06
+  - record: apiserver_latency_seconds:quantile
+    expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
     labels:
       quantile: "0.5"
-  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
+  - alert: APIServerLatencyHigh
+    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+      > 1
+    for: 10m
     labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
+      severity: warning
+    annotations:
+      description: the API server has a 99th percentile latency of {{ $value }} seconds
+        for {{$labels.verb}} {{$labels.resource}}
+  - alert: APIServerLatencyHigh
+    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+      > 4
+    for: 10m
     labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
+      severity: critical
+    annotations:
+      description: the API server has a 99th percentile latency of {{ $value }} seconds
+        for {{$labels.verb}} {{$labels.resource}}
+  - alert: APIServerErrorsHigh
+    expr: sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count[5m]))
+      * 100 > 2
+    for: 10m
     labels:
-      quantile: "0.5"
-  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
+      severity: warning
+    annotations:
+      description: API server returns errors for {{ $value }}% of requests
+  - alert: APIServerErrorsHigh
+    expr: sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count[5m]))
+      * 100 > 5
+    for: 10m
     labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
+      severity: critical
+    annotations:
+      description: API server returns errors for {{ $value }}% of requests
+  - alert: K8SApiserverDown
+    expr: absent(up{job="apiserver"} == 1)
+    for: 20m
     labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.5"
-  - record: cluster:scheduler_binding_latency:quantile_seconds
-    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_binding_latency:quantile_seconds
-    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_binding_latency:quantile_seconds
-    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.5"
+      severity: critical
+    annotations:
+      description: No API servers are reachable or all have disappeared from service
+        discovery
diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml
index 9c1641caa9422b035d4bc397cb7652f1e0728a9d..0e7e1bbd3b58a994989882e7ebea589acf1a2953 100644
--- a/assets/prometheus/rules/node.rules.yaml
+++ b/assets/prometheus/rules/node.rules.yaml
@@ -1,6 +1,23 @@
 groups:
-- name: ./node.rules
+- name: node.rules
   rules:
+  - record: instance:node_cpu:rate:sum
+    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+      BY (instance)
+  - record: instance:node_filesystem_usage:sum
+    expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+      BY (instance)
+  - record: instance:node_network_receive_bytes:rate:sum
+    expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+  - record: instance:node_network_transmit_bytes:rate:sum
+    expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+  - record: instance:node_cpu:ratio
+    expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+      GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+  - record: cluster:node_cpu:sum_rate5m
+    expr: sum(rate(node_cpu{mode!="idle"}[5m]))
+  - record: cluster:node_cpu:ratio
+    expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
   - alert: NodeExporterDown
     expr: absent(up{job="node-exporter"} == 1)
     for: 10m
@@ -8,30 +25,20 @@ groups:
       severity: warning
     annotations:
       description: Prometheus could not scrape a node-exporter for more than 10m,
-        or node-exporters have disappeared from discovery.
-      summary: node-exporter cannot be scraped
-  - alert: K8SNodeOutOfDisk
-    expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+        or node-exporters have disappeared from discovery
+  - alert: NodeDiskRunningFull
+    expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
+    for: 30m
     labels:
-      service: k8s
-      severity: critical
-    annotations:
-      description: '{{ $labels.node }} has run out of disk space.'
-      summary: Node ran out of disk space.
-  - alert: K8SNodeMemoryPressure
-    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
-      1
-    labels:
-      service: k8s
       severity: warning
     annotations:
-      description: '{{ $labels.node }} is under memory pressure.'
-      summary: Node is under memory pressure.
-  - alert: K8SNodeDiskPressure
-    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+      description: device {{$labels.device}} on node {{$labels.instance}} is running
+        full within the next 24 hours (mounted at {{$labels.mountpoint}})
+  - alert: NodeDiskRunningFull
+    expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
+    for: 10m
     labels:
-      service: k8s
-      severity: warning
+      severity: critical
     annotations:
-      description: '{{ $labels.node }} is under disk pressure.'
-      summary: Node is under disk pressure.
+      description: device {{$labels.device}} on node {{$labels.instance}} is running
+        full within the next 2 hours (mounted at {{$labels.mountpoint}})
diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml
index 6ed0cd68629f0d2224db09d9c8cf970ac093310c..df51d0106af150aa1b67180ef83c75571fb2b85b 100644
--- a/assets/prometheus/rules/prometheus.rules.yaml
+++ b/assets/prometheus/rules/prometheus.rules.yaml
@@ -1,12 +1,44 @@
 groups:
-- name: ./prometheus.rules
+- name: prometheus.rules
   rules:
-  - alert: FailedReload
+  - alert: PrometheusConfigReloadFailed
     expr: prometheus_config_last_reload_successful == 0
     for: 10m
     labels:
       severity: warning
     annotations:
-      description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
-        }}/{{ $labels.pod}}.
-      summary: Prometheus configuration reload has failed
+      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+  - alert: PrometheusNotificationQueueRunningFull
+    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+        $labels.pod}}
+  - alert: PrometheusErrorSendingAlerts
+    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+      > 0.01
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+        $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+  - alert: PrometheusErrorSendingAlerts
+    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+      > 0.03
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+        $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+  - alert: PrometheusNotConnectedToAlertmanagers
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+        to any Alertmanagers
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index 3e6552c1653ac7e2733526f1c920a302f83a1332..6493ff748f1dd598c5023ed6e6840970e8d2998e 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -8,7 +8,7 @@ metadata:
 data:
   alertmanager.rules.yaml: |+
     groups:
-    - name: ./alertmanager.rules
+    - name: alertmanager.rules
       rules:
       - alert: AlertmanagerConfigInconsistent
         expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
@@ -20,7 +20,6 @@ data:
         annotations:
           description: The configuration of the instances of the Alertmanager cluster
             `{{$labels.service}}` are out of sync.
-          summary: Alertmanager configurations are inconsistent
       - alert: AlertmanagerDownOrMissing
         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
           "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -30,8 +29,7 @@ data:
         annotations:
           description: An unexpected number of Alertmanagers are scraped or Alertmanagers
             disappeared from discovery.
-          summary: Alertmanager down or not discovered
-      - alert: FailedReload
+      - alert: AlertmanagerFailedReload
         expr: alertmanager_config_last_reload_successful == 0
         for: 10m
         labels:
@@ -39,7 +37,6 @@ data:
         annotations:
           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
             }}/{{ $labels.pod}}.
-          summary: Alertmanager configuration reload has failed
   etcd3.rules.yaml: |+
     groups:
     - name: ./etcd3.rules
@@ -166,7 +163,7 @@ data:
           summary: high commit durations
   general.rules.yaml: |+
     groups:
-    - name: ./general.rules
+    - name: general.rules
       rules:
       - alert: TargetDown
         expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
@@ -174,7 +171,7 @@ data:
         labels:
           severity: warning
         annotations:
-          description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
+          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
           summary: Targets are down
       - alert: DeadMansSwitch
         expr: vector(1)
@@ -184,61 +181,29 @@ data:
           description: This is a DeadMansSwitch meant to ensure that the entire Alerting
             pipeline is functional.
           summary: Alerting DeadMansSwitch
-      - alert: TooManyOpenFileDescriptors
-        expr: 100 * (process_open_fds / process_max_fds) > 95
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
-          summary: too many open file descriptors
-      - record: instance:fd_utilization
+      - record: fd_utilization
         expr: process_open_fds / process_max_fds
       - alert: FdExhaustionClose
-        expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
         for: 10m
         labels:
           severity: warning
         annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust its file/socket descriptors within the next 4 hours'
           summary: file descriptors soon exhausted
       - alert: FdExhaustionClose
-        expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+        expr: predict_linear(fd_utilization[10m], 3600) > 1
         for: 10m
         labels:
           severity: critical
         annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust its file/socket descriptors within the next hour'
           summary: file descriptors soon exhausted
-  kube-apiserver.rules.yaml: |+
-    groups:
-    - name: ./kube-apiserver.rules
-      rules:
-      - alert: K8SApiserverDown
-        expr: absent(up{job="apiserver"} == 1)
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          description: Prometheus failed to scrape API server(s), or all API servers have
-            disappeared from service discovery.
-          summary: API server unreachable
-      - alert: K8SApiServerLatency
-        expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m]))
-           by (le)) / 1e+06 > 1
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: 99th percentile Latency for {{ $labels.verb }} requests to the
-            kube-apiserver is higher than 1s.
-          summary: Kubernetes apiserver latency is high
   kube-controller-manager.rules.yaml: |+
     groups:
-    - name: ./kube-controller-manager.rules
+    - name: kube-controller-manager.rules
       rules:
       - alert: K8SControllerManagerDown
         expr: absent(up{job="kube-controller-manager"} == 1)
@@ -252,8 +217,53 @@ data:
           summary: Controller manager is down
   kube-scheduler.rules.yaml: |+
     groups:
-    - name: ./kube-scheduler.rules
+    - name: kube-scheduler.rules
       rules:
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
       - alert: K8SSchedulerDown
         expr: absent(up{job="kube-scheduler"} == 1)
         for: 5m
@@ -264,9 +274,65 @@ data:
             to nodes.
           runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
           summary: Scheduler is down
+  kube-state-metrics.rules.yaml: |+
+    groups:
+    - name: kube-state-metrics.rules
+      rules:
+      - alert: DeploymentGenerationMismatch
+        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Observed deployment generation does not match expected one for
+            deployment {{$labels.namespaces}}/{{$labels.deployment}}
+      - alert: DeploymentReplicasNotUpdated
+        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+          unless (kube_deployment_spec_paused == 1)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+      - alert: DaemonSetRolloutStuck
+        expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+          * 100 < 100
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Only {{$value}}% of desired pods scheduled and ready for daemon
+            set {{$labels.namespaces}}/{{$labels.daemonset}}
+      - alert: K8SDaemonSetsNotScheduled
+        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+          > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonsets are not scheduled.
+          summary: Daemonsets are not scheduled correctly
+      - alert: DaemonSetsMissScheduled
+        expr: kube_daemonset_status_number_misscheduled > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonsets are running where they are not supposed
+            to run.
+          summary: Daemonsets are not scheduled correctly
+      - alert: PodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts[1h]) > 5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
+            times within the last hour
   kubelet.rules.yaml: |+
     groups:
-    - name: ./kubelet.rules
+    - name: kubelet.rules
       rules:
       - alert: K8SNodeNotReady
         expr: kube_node_status_condition{condition="Ready",status="true"} == 0
@@ -285,20 +351,17 @@ data:
         labels:
           severity: critical
         annotations:
-          description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
-            state).'
-          summary: Many Kubernetes nodes are Not Ready
+          description: '{{ $value }}% of Kubernetes nodes are not ready'
       - alert: K8SKubeletDown
-        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
+        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
         for: 1h
         labels:
           severity: warning
         annotations:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
-          summary: Many Kubelets cannot be scraped
       - alert: K8SKubeletDown
-        expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
-          > 0.1
+        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
+          * 100 > 1
         for: 1h
         labels:
           severity: critical
@@ -308,159 +371,121 @@ data:
           summary: Many Kubelets cannot be scraped
       - alert: K8SKubeletTooManyPods
         expr: kubelet_running_pod_count > 100
+        for: 10m
         labels:
           severity: warning
         annotations:
           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
             to the limit of 110
           summary: Kubelet is close to pod limit
-      - alert: K8SDaemonSetsNotScheduled
-        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
-          > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are not scheduled.
-          summary: Daemonsets are not scheduled correctly
-      - alert: K8SDaemonSetsNotRunning
-        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready
-          > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are not ready.
-          summary: Daemonsets are not ready
-      - alert: K8SDaemonSetsMissScheduled
-        expr: kube_daemonset_status_number_misscheduled > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are running where they are not supposed
-            to run.
-          summary: Daemonsets are not scheduled correctly
   kubernetes.rules.yaml: |+
     groups:
-    - name: ./kubernetes.rules
+    - name: kubernetes.rules
       rules:
-      - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
-        expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:spec_cpu_shares
-        expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:cpu_usage:rate
-        expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:memory_usage:bytes
-        expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
-        expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:memory_rss:bytes
-        expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_cache:bytes
-        expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:disk_usage:bytes
-        expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
-        expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name, scope, type)
-      - record: cluster_namespace_controller_pod_container:memory_oom:rate
-        expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name, scope, type)
-      - record: cluster:memory_allocation:percent
-        expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
-          / sum(machine_memory_bytes) BY (cluster)
-      - record: cluster:memory_used:percent
-        expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
-          BY (cluster)
-      - record: cluster:cpu_allocation:percent
-        expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
-          * ON(cluster, instance) machine_cpu_cores) BY (cluster)
-      - record: cluster:node_cpu_use:percent
-        expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
-          BY (cluster)
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: pod_name:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (pod_name)
+      - record: pod_name:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: pod_name:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          BY (pod_name)
+      - record: pod_name:container_fs_usage_bytes:sum
+        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: namespace:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+      - record: namespace:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+      - record: namespace:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+          BY (namespace)
+      - record: cluster:memory_usage:ratio
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (cluster) / sum(machine_memory_bytes) BY (cluster)
+      - record: cluster:container_spec_cpu_shares:ratio
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+          / sum(machine_cpu_cores)
+      - record: cluster:container_cpu_usage:ratio
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          / sum(machine_cpu_cores)
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.99"
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.9"
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.5"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.5"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 1
+        for: 10m
         labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
+          severity: warning
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 4
+        for: 10m
         labels:
-          quantile: "0.5"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
+          severity: critical
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+      - alert: APIServerErrorsHigh
+        expr: sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count[5m]))
+          * 100 > 2
+        for: 10m
         labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
+          severity: warning
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+      - alert: APIServerErrorsHigh
+        expr: sum(rate(apiserver_request_count{code=~"^(?:5..)$"}[5m])) / sum(rate(apiserver_request_count[5m]))
+          * 100 > 5
+        for: 10m
         labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
+          severity: critical
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+      - alert: K8SApiserverDown
+        expr: absent(up{job="apiserver"} == 1)
+        for: 20m
         labels:
-          quantile: "0.5"
+          severity: critical
+        annotations:
+          description: No API servers are reachable or all have disappeared from service
+            discovery
   node.rules.yaml: |+
     groups:
-    - name: ./node.rules
+    - name: node.rules
       rules:
+      - record: instance:node_cpu:rate:sum
+        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+          BY (instance)
+      - record: instance:node_filesystem_usage:sum
+        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+          BY (instance)
+      - record: instance:node_network_receive_bytes:rate:sum
+        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+      - record: instance:node_network_transmit_bytes:rate:sum
+        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+      - record: instance:node_cpu:ratio
+        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+      - record: cluster:node_cpu:sum_rate5m
+        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
+      - record: cluster:node_cpu:ratio
+        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
       - alert: NodeExporterDown
         expr: absent(up{job="node-exporter"} == 1)
         for: 10m
@@ -468,43 +493,65 @@ data:
           severity: warning
         annotations:
           description: Prometheus could not scrape a node-exporter for more than 10m,
-            or node-exporters have disappeared from discovery.
-          summary: node-exporter cannot be scraped
-      - alert: K8SNodeOutOfDisk
-        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
-        labels:
-          service: k8s
-          severity: critical
-        annotations:
-          description: '{{ $labels.node }} has run out of disk space.'
-          summary: Node ran out of disk space.
-      - alert: K8SNodeMemoryPressure
-        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
-          1
+            or node-exporters have disappeared from discovery
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
+        for: 30m
         labels:
-          service: k8s
           severity: warning
         annotations:
-          description: '{{ $labels.node }} is under memory pressure.'
-          summary: Node is under memory pressure.
-      - alert: K8SNodeDiskPressure
-        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 24 hours (mounted at {{$labels.mountpoint}})
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
+        for: 10m
         labels:
-          service: k8s
-          severity: warning
+          severity: critical
         annotations:
-          description: '{{ $labels.node }} is under disk pressure.'
-          summary: Node is under disk pressure.
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 2 hours (mounted at {{$labels.mountpoint}})
   prometheus.rules.yaml: |+
     groups:
-    - name: ./prometheus.rules
+    - name: prometheus.rules
       rules:
-      - alert: FailedReload
+      - alert: PrometheusConfigReloadFailed
         expr: prometheus_config_last_reload_successful == 0
         for: 10m
         labels:
           severity: warning
         annotations:
-          description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
-            }}/{{ $labels.pod}}.
-          summary: Prometheus configuration reload has failed
+          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+      - alert: PrometheusNotificationQueueRunningFull
+        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+            $labels.pod}}
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.01
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.03
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.alertmanager}}
+      - alert: PrometheusNotConnectedToAlertmanagers
+        expr: prometheus_notifications_alertmanagers_discovered < 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+            to any Alertmanagers