diff --git a/assets/prometheus/rules/kube-controller-manager.rules.yaml b/assets/prometheus/rules/kube-controller-manager.rules.yaml
deleted file mode 100644
index 4ea82ed1c24988c97454cf5bcb435ec4511009c7..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kube-controller-manager.rules.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-groups:
-- name: kube-controller-manager.rules
-  rules:
-  - alert: K8SControllerManagerDown
-    expr: absent(up{job="kube-controller-manager"} == 1)
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      description: There is no running K8S controller manager. Deployments and replication
-        controllers are not making progress.
-      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
-      summary: Controller manager is down
diff --git a/assets/prometheus/rules/kube-scheduler.rules.yaml b/assets/prometheus/rules/kube-scheduler.rules.yaml
deleted file mode 100644
index 8f0c01fd2e959c9aeea573245adf368c7be18cb6..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kube-scheduler.rules.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-groups:
-- name: kube-scheduler.rules
-  rules:
-  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
-    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
-    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
-    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.5"
-  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
-    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
-    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
-    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.5"
-  - record: cluster:scheduler_binding_latency_seconds:quantile
-    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.99"
-  - record: cluster:scheduler_binding_latency_seconds:quantile
-    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.9"
-  - record: cluster:scheduler_binding_latency_seconds:quantile
-    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-      BY (le, cluster)) / 1e+06
-    labels:
-      quantile: "0.5"
-  - alert: K8SSchedulerDown
-    expr: absent(up{job="kube-scheduler"} == 1)
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      description: There is no running K8S scheduler. New pods are not being assigned
-        to nodes.
-      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
-      summary: Scheduler is down
diff --git a/assets/prometheus/rules/kube-state-metrics.rules.yaml b/assets/prometheus/rules/kube-state-metrics.rules.yaml
deleted file mode 100644
index 4c7041fea0286347626de8eb128f9ec2f5d36276..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kube-state-metrics.rules.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-groups:
-- name: kube-state-metrics.rules
-  rules:
-  - alert: DeploymentGenerationMismatch
-    expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
-    for: 15m
-    labels:
-      severity: warning
-    annotations:
-      description: Observed deployment generation does not match expected one for
-        deployment {{$labels.namespace}}/{{$labels.deployment}}
-      summary: Deployment is outdated
-  - alert: DeploymentReplicasNotUpdated
-    expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
-      or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
-      unless (kube_deployment_spec_paused == 1)
-    for: 15m
-    labels:
-      severity: warning
-    annotations:
-      description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
-      summary: Deployment replicas are outdated
-  - alert: DaemonSetRolloutStuck
-    expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
-      * 100 < 100
-    for: 15m
-    labels:
-      severity: warning
-    annotations:
-      description: Only {{$value}}% of desired pods scheduled and ready for daemon
-        set {{$labels.namespace}}/{{$labels.daemonset}}
-      summary: DaemonSet is missing pods
-  - alert: K8SDaemonSetsNotScheduled
-    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
-      > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: A number of daemonsets are not scheduled.
-      summary: Daemonsets are not scheduled correctly
-  - alert: DaemonSetsMissScheduled
-    expr: kube_daemonset_status_number_misscheduled > 0
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: A number of daemonsets are running where they are not supposed
-        to run.
-      summary: Daemonsets are not scheduled correctly
-  - alert: PodFrequentlyRestarting
-    expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}}
-        times within the last hour
-      summary: Pod is restarting frequently
diff --git a/assets/prometheus/rules/kubelet.rules.yaml b/assets/prometheus/rules/kubelet.rules.yaml
deleted file mode 100644
index a416840482bf862de4a156f61b14b6659b2d2676..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kubelet.rules.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-groups:
-- name: kubelet.rules
-  rules:
-  - alert: K8SNodeNotReady
-    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
-    for: 1h
-    labels:
-      severity: warning
-    annotations:
-      description: The Kubelet on {{ $labels.node }} has not checked in with the API,
-        or has set itself to NotReady, for more than an hour
-      summary: Node status is NotReady
-  - alert: K8SManyNodesNotReady
-    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
-      > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
-      0) / count(kube_node_status_condition{condition="Ready",status="true"})) * 100 > 20
-    for: 1m
-    labels:
-      severity: critical
-    annotations:
-      description: '{{ $value }}% of Kubernetes nodes are not ready'
-  - alert: K8SKubeletDown
-    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
-    for: 1h
-    labels:
-      severity: warning
-    annotations:
-      description: Prometheus failed to scrape {{ $value }}% of kubelets.
-      summary: Prometheus failed to scrape
-  - alert: K8SKubeletDown
-    expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
-      * 100 > 10
-    for: 1h
-    labels:
-      severity: critical
-    annotations:
-      description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
-        have disappeared from service discovery.
-      summary: Many Kubelets cannot be scraped
-  - alert: K8SKubeletTooManyPods
-    expr: kubelet_running_pod_count > 100
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
-        to the limit of 110
-      summary: Kubelet is close to pod limit
diff --git a/assets/prometheus/rules/kubernetes.rules.yaml b/assets/prometheus/rules/kubernetes.rules.yaml
deleted file mode 100644
index 288841b7a1ddd87f146201ad7af32456451e9137..0000000000000000000000000000000000000000
--- a/assets/prometheus/rules/kubernetes.rules.yaml
+++ /dev/null
@@ -1,106 +0,0 @@
-groups:
-- name: kubernetes.rules
-  rules:
-  - record: pod_name:container_memory_usage_bytes:sum
-    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
-      (pod_name)
-  - record: pod_name:container_spec_cpu_shares:sum
-    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
-  - record: pod_name:container_cpu_usage:sum
-    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
-      BY (pod_name)
-  - record: pod_name:container_fs_usage_bytes:sum
-    expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
-  - record: namespace:container_memory_usage_bytes:sum
-    expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
-  - record: namespace:container_spec_cpu_shares:sum
-    expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
-  - record: namespace:container_cpu_usage:sum
-    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
-      BY (namespace)
-  - record: cluster:memory_usage:ratio
-    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
-      (cluster) / sum(machine_memory_bytes) BY (cluster)
-  - record: cluster:container_spec_cpu_shares:ratio
-    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
-      / sum(machine_cpu_cores)
-  - record: cluster:container_cpu_usage:ratio
-    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
-      / sum(machine_cpu_cores)
-  - record: apiserver_latency_seconds:quantile
-    expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
-      1e+06
-    labels:
-      quantile: "0.99"
-  - record: apiserver_latency:quantile_seconds
-    expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
-      1e+06
-    labels:
-      quantile: "0.9"
-  - record: apiserver_latency_seconds:quantile
-    expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
-      1e+06
-    labels:
-      quantile: "0.5"
-  - alert: APIServerLatencyHigh
-    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
-      > 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: the API server has a 99th percentile latency of {{ $value }} seconds
-        for {{$labels.verb}} {{$labels.resource}}
-      summary: API server high latency
-  - alert: APIServerLatencyHigh
-    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
-      > 4
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: the API server has a 99th percentile latency of {{ $value }} seconds
-        for {{$labels.verb}} {{$labels.resource}}
-      summary: API server high latency
-  - alert: APIServerErrorsHigh
-    expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
-      * 100 > 2
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      description: API server returns errors for {{ $value }}% of requests
-      summary: API server request errors
-  - alert: APIServerErrorsHigh
-    expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
-      * 100 > 5
-    for: 10m
-    labels:
-      severity: critical
-    annotations:
-      description: API server returns errors for {{ $value }}% of requests
-  - alert: K8SApiserverDown
-    expr: absent(up{job="apiserver"} == 1)
-    for: 20m
-    labels:
-      severity: critical
-    annotations:
-      description: No API servers are reachable or all have disappeared from service
-        discovery
-      summary: No API servers are reachable
-
-  - alert: K8sCertificateExpirationNotice
-    labels:
-      severity: warning
-    annotations:
-      description: Kubernetes API Certificate is expiring soon (less than 7 days)
-      summary: Kubernetes API Certificate is expiering soon
-    expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
-
-  - alert: K8sCertificateExpirationNotice
-    labels:
-      severity: critical
-    annotations:
-      description: Kubernetes API Certificate is expiring in less than 1 day
-      summary: Kubernetes API Certificate is expiering
-    expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0