diff --git a/jsonnet/kube-prometheus/components/alertmanager.libsonnet b/jsonnet/kube-prometheus/components/alertmanager.libsonnet
index 87b651a654127267aec91f57df5704e56ee2bd1b..bda39ec4c7747ed516b7ea3abb9380b6f4c8f683 100644
--- a/jsonnet/kube-prometheus/components/alertmanager.libsonnet
+++ b/jsonnet/kube-prometheus/components/alertmanager.libsonnet
@@ -64,7 +64,7 @@ local defaults = {
       alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}',
       alertmanagerClusterLabels: 'namespace,service',
       alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/alertmanager/%s',
     },
   },
 };
diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
index 475a3e5a266a7d5a0c7149032c12451fac285a1f..e7323aeab058179fd9461eca42d703c1ad437967 100644
--- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
+++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
@@ -17,7 +17,7 @@ local defaults = {
       kubeControllerManagerSelector: 'job="kube-controller-manager"',
       kubeApiserverSelector: 'job="apiserver"',
       podLabel: 'pod',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s',
       diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
       hostNetworkInterfaceSelector: 'device!~"veth.+"',
     },
diff --git a/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet
index 7c8d25683f025059010b108ea58abd1b93e75839..b80bd9396ad1c8e42c42f12594562fefdf625ebe 100644
--- a/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet
+++ b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet
@@ -29,7 +29,7 @@ local defaults = {
     ruleLabels: {},
     _config: {
       kubeStateMetricsSelector: 'job="' + defaults.name + '"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/%s',
     },
   },
 };
diff --git a/jsonnet/kube-prometheus/components/mixin/custom.libsonnet b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet
index f84236535f68e094136d2c5818d428636e4c5baa..c8f43b0340baf35faa5f64a6a7d7920c7cb1315d 100644
--- a/jsonnet/kube-prometheus/components/mixin/custom.libsonnet
+++ b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet
@@ -11,7 +11,7 @@ local defaults = {
     _config: {
       nodeExporterSelector: 'job="node-exporter"',
       hostNetworkInterfaceSelector: 'device!~"veth.+"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/general/%s',
     },
   },
 };
diff --git a/jsonnet/kube-prometheus/components/node-exporter.libsonnet b/jsonnet/kube-prometheus/components/node-exporter.libsonnet
index 4e5d116778de8006ecf28369eac1d3a4273122c5..5530d16aa9e0bc481da39624bc7b588a72d01a0a 100644
--- a/jsonnet/kube-prometheus/components/node-exporter.libsonnet
+++ b/jsonnet/kube-prometheus/components/node-exporter.libsonnet
@@ -30,7 +30,7 @@ local defaults = {
       nodeExporterSelector: 'job="' + defaults.name + '"',
       fsSpaceFillingUpCriticalThreshold: 15,
       diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/node/%s',
     },
   },
 };
diff --git a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet
index b8adeeebf60d607f0986080518796bd119607b05..b0a78e0625586e049e870b90530f095c8faf7368 100644
--- a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet
+++ b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet
@@ -31,7 +31,7 @@ local defaults = {
     },
     _config: {
       prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
     },
   },
 };
diff --git a/jsonnet/kube-prometheus/components/prometheus.libsonnet b/jsonnet/kube-prometheus/components/prometheus.libsonnet
index e4a439b7605e62d738066ce32be341be4f4085ea..5e1c9e33f5e0bcb0158d9b684a7a2d755ad0186d 100644
--- a/jsonnet/kube-prometheus/components/prometheus.libsonnet
+++ b/jsonnet/kube-prometheus/components/prometheus.libsonnet
@@ -36,7 +36,7 @@ local defaults = {
       prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"',
       prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
       thanosSelector: 'job="thanos-sidecar"',
-      runbookURLPattern: 'https://github.com/prometheus-operator/kube-prometheus/wiki/%s',
+      runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus/%s',
     },
   },
   thanos: {},
diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml
index d6321b957aa719b10b75c231d9c05812dfe83076..a7fd4cc3788489673800bb9e3f32a86509d5e673 100644
--- a/manifests/alertmanager-prometheusRule.yaml
+++ b/manifests/alertmanager-prometheusRule.yaml
@@ -17,7 +17,7 @@ spec:
     - alert: AlertmanagerFailedReload
       annotations:
         description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
         summary: Reloading an Alertmanager configuration has failed.
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
@@ -29,7 +29,7 @@ spec:
     - alert: AlertmanagerMembersInconsistent
       annotations:
         description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
         summary: A member of an Alertmanager cluster has not found all other cluster members.
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
@@ -43,7 +43,7 @@ spec:
     - alert: AlertmanagerFailedToSendAlerts
       annotations:
         description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
         summary: An Alertmanager instance failed to send notifications.
       expr: |
         (
@@ -58,7 +58,7 @@ spec:
     - alert: AlertmanagerClusterFailedToSendAlerts
       annotations:
         description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
         summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
       expr: |
         min by (namespace,service, integration) (
@@ -73,7 +73,7 @@ spec:
     - alert: AlertmanagerClusterFailedToSendAlerts
       annotations:
         description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
         summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
       expr: |
         min by (namespace,service, integration) (
@@ -88,7 +88,7 @@ spec:
     - alert: AlertmanagerConfigInconsistent
       annotations:
         description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
         summary: Alertmanager instances within the same cluster have different configurations.
       expr: |
         count by (namespace,service) (
@@ -101,7 +101,7 @@ spec:
     - alert: AlertmanagerClusterDown
       annotations:
         description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
         summary: Half or more of the Alertmanager instances within the same cluster are down.
       expr: |
         (
@@ -120,7 +120,7 @@ spec:
     - alert: AlertmanagerClusterCrashlooping
       annotations:
         description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
         summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
      expr: |
        (
diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml
index d203dd915bb876cae85231402d3f393668ca19ef..84c2da68457a8084dd85139e4f92aadaff0a2136 100644
--- a/manifests/kube-prometheus-prometheusRule.yaml
+++ b/manifests/kube-prometheus-prometheusRule.yaml
@@ -16,7 +16,7 @@ spec:
     - alert: TargetDown
       annotations:
         description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/targetdown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
         summary: One or more targets are unreachable.
       expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
       for: 10m
@@ -30,7 +30,7 @@ spec:
           and always fire against a receiver. There are integrations with various notification
           mechanisms that send a notification when this alert is not firing. For example the
           "DeadMansSnitch" integration in PagerDuty.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/watchdog
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
         summary: An alert that should always be firing to certify that Alertmanager is working properly.
       expr: vector(1)
       labels:
@@ -40,7 +40,7 @@ spec:
     - alert: NodeNetworkInterfaceFlapping
       annotations:
         description: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
         summary: Network interface is often changin it's status
       expr: |
         changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
diff --git a/manifests/kube-state-metrics-prometheusRule.yaml b/manifests/kube-state-metrics-prometheusRule.yaml
index 1b4904b3f2e2015cf21b639697d5638bf6c52180..237776f2d872ff237f4aac1dbcd826da8f03f6a3 100644
--- a/manifests/kube-state-metrics-prometheusRule.yaml
+++ b/manifests/kube-state-metrics-prometheusRule.yaml
@@ -17,7 +17,7 @@ spec:
     - alert: KubeStateMetricsListErrors
       annotations:
         description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
         summary: kube-state-metrics is experiencing errors in list operations.
       expr: |
         (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
@@ -30,7 +30,7 @@ spec:
     - alert: KubeStateMetricsWatchErrors
       annotations:
         description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
         summary: kube-state-metrics is experiencing errors in watch operations.
      expr: |
        (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
@@ -43,7 +43,7 @@ spec:
     - alert: KubeStateMetricsShardingMismatch
       annotations:
         description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricsshardingmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
         summary: kube-state-metrics sharding is misconfigured.
       expr: |
         stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
@@ -53,7 +53,7 @@ spec:
     - alert: KubeStateMetricsShardsMissing
       annotations:
         description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricsshardsmissing
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
         summary: kube-state-metrics shards are missing.
       expr: |
         2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml
index 60e6a85b050760d8fe85916b686a165432fae570..1ff25d1fe08270591e3e82ea3a2d0ac0ab813790 100644
--- a/manifests/kubernetes-prometheusRule.yaml
+++ b/manifests/kubernetes-prometheusRule.yaml
@@ -15,7 +15,7 @@ spec:
     - alert: KubePodCrashLooping
       annotations:
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
         summary: Pod is crash looping.
       expr: |
         increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0
@@ -27,7 +27,7 @@ spec:
     - alert: KubePodNotReady
       annotations:
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
         summary: Pod has been in a non-ready state for more than 15 minutes.
       expr: |
         sum by (namespace, pod) (
@@ -43,7 +43,7 @@ spec:
     - alert: KubeDeploymentGenerationMismatch
       annotations:
         description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
         summary: Deployment generation mismatch due to possible roll-back
       expr: |
         kube_deployment_status_observed_generation{job="kube-state-metrics"}
@@ -55,7 +55,7 @@ spec:
     - alert: KubeDeploymentReplicasMismatch
       annotations:
         description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
         summary: Deployment has not matched the expected number of replicas.
       expr: |
         (
@@ -73,7 +73,7 @@ spec:
     - alert: KubeStatefulSetReplicasMismatch
       annotations:
         description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
         summary: Deployment has not matched the expected number of replicas.
       expr: |
         (
@@ -91,7 +91,7 @@ spec:
     - alert: KubeStatefulSetGenerationMismatch
       annotations:
         description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
         summary: StatefulSet generation mismatch due to possible roll-back
       expr: |
         kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@@ -103,7 +103,7 @@ spec:
     - alert: KubeStatefulSetUpdateNotRolledOut
       annotations:
         description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
         summary: StatefulSet update has not been rolled out.
       expr: |
         (
@@ -129,7 +129,7 @@ spec:
     - alert: KubeDaemonSetRolloutStuck
       annotations:
         description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
         summary: DaemonSet rollout is stuck.
       expr: |
         (
@@ -161,7 +161,7 @@ spec:
     - alert: KubeContainerWaiting
       annotations:
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
         summary: Pod container waiting longer than 1 hour
       expr: |
         sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@@ -171,7 +171,7 @@ spec:
     - alert: KubeDaemonSetNotScheduled
       annotations:
         description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
         summary: DaemonSet pods are not scheduled.
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -183,7 +183,7 @@ spec:
     - alert: KubeDaemonSetMisScheduled
       annotations:
         description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
         summary: DaemonSet pods are misscheduled.
       expr: |
         kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -193,7 +193,7 @@ spec:
     - alert: KubeJobCompletion
       annotations:
         description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion
         summary: Job did not complete in time
       expr: |
         kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -203,7 +203,7 @@ spec:
     - alert: KubeJobFailed
       annotations:
         description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
         summary: Job failed to complete.
       expr: |
         kube_job_failed{job="kube-state-metrics"} > 0
@@ -213,7 +213,7 @@ spec:
     - alert: KubeHpaReplicasMismatch
       annotations:
         description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
         summary: HPA has not matched descired number of replicas.
       expr: |
         (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"}
@@ -235,7 +235,7 @@ spec:
     - alert: KubeHpaMaxedOut
       annotations:
         description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
         summary: HPA is running at max replicas
       expr: |
         kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
@@ -249,7 +249,7 @@ spec:
     - alert: KubeCPUOvercommit
       annotations:
         description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
         summary: Cluster has overcommitted CPU resource requests.
       expr: |
         sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
@@ -263,7 +263,7 @@ spec:
     - alert: KubeMemoryOvercommit
       annotations:
         description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
         summary: Cluster has overcommitted memory resource requests.
       expr: |
         sum(namespace_memory:kube_pod_container_resource_requests:sum{})
@@ -279,7 +279,7 @@ spec:
     - alert: KubeCPUQuotaOvercommit
       annotations:
         description: Cluster has overcommitted CPU resource requests for Namespaces.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
         summary: Cluster has overcommitted CPU resource requests.
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
@@ -292,7 +292,7 @@ spec:
     - alert: KubeMemoryQuotaOvercommit
       annotations:
         description: Cluster has overcommitted memory resource requests for Namespaces.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
         summary: Cluster has overcommitted memory resource requests.
       expr: |
         sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
@@ -305,7 +305,7 @@ spec:
     - alert: KubeQuotaAlmostFull
       annotations:
         description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
         summary: Namespace quota is going to be full.
       expr: |
         kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -318,7 +318,7 @@ spec:
     - alert: KubeQuotaFullyUsed
       annotations:
         description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
         summary: Namespace quota is fully used.
       expr: |
         kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -331,7 +331,7 @@ spec:
     - alert: KubeQuotaExceeded
       annotations:
         description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
         summary: Namespace quota has exceeded the limits.
       expr: |
         kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -344,7 +344,7 @@ spec:
     - alert: CPUThrottlingHigh
       annotations:
         description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
         summary: Processes experience elevated CPU throttling.
      expr: |
        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@@ -359,7 +359,7 @@ spec:
     - alert: KubePersistentVolumeFillingUp
       annotations:
         description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
         summary: PersistentVolume is filling up.
       expr: |
         (
@@ -375,7 +375,7 @@ spec:
     - alert: KubePersistentVolumeFillingUp
       annotations:
         description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
         summary: PersistentVolume is filling up.
       expr: |
         (
@@ -393,7 +393,7 @@ spec:
     - alert: KubePersistentVolumeErrors
       annotations:
         description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
         summary: PersistentVolume is having issues with provisioning.
       expr: |
         kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@@ -405,7 +405,7 @@ spec:
     - alert: KubeVersionMismatch
       annotations:
         description: There are {{ $value }} different semantic versions of Kubernetes components running.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
         summary: Different semantic versions of Kubernetes components running.
       expr: |
         count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
@@ -415,7 +415,7 @@ spec:
     - alert: KubeClientErrors
       annotations:
         description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
         summary: Kubernetes API server client is experiencing errors.
       expr: |
         (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@@ -430,7 +430,7 @@ spec:
     - alert: KubeAPIErrorBudgetBurn
       annotations:
         description: The API server is burning too much error budget.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
         summary: The API server is burning too much error budget.
       expr: |
         sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
@@ -444,7 +444,7 @@ spec:
     - alert: KubeAPIErrorBudgetBurn
       annotations:
         description: The API server is burning too much error budget.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
         summary: The API server is burning too much error budget.
       expr: |
         sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
@@ -458,7 +458,7 @@ spec:
     - alert: KubeAPIErrorBudgetBurn
       annotations:
         description: The API server is burning too much error budget.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
         summary: The API server is burning too much error budget.
       expr: |
         sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
@@ -472,7 +472,7 @@ spec:
     - alert: KubeAPIErrorBudgetBurn
       annotations:
         description: The API server is burning too much error budget.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
         summary: The API server is burning too much error budget.
       expr: |
         sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
@@ -488,7 +488,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -497,7 +497,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -506,7 +506,7 @@ spec:
     - alert: AggregatedAPIErrors
       annotations:
         description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors
         summary: An aggregated API has reported errors.
       expr: |
         sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
@@ -515,7 +515,7 @@ spec:
     - alert: AggregatedAPIDown
       annotations:
         description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown
         summary: An aggregated API is down.
       expr: |
         (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
@@ -525,7 +525,7 @@ spec:
     - alert: KubeAPIDown
       annotations:
         description: KubeAPI has disappeared from Prometheus target discovery.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
         summary: Target disappeared from Prometheus target discovery.
       expr: |
         absent(up{job="apiserver"} == 1)
@@ -535,7 +535,7 @@ spec:
     - alert: KubeAPITerminatedRequests
       annotations:
         description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
         summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
       expr: |
         sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
@@ -547,7 +547,7 @@ spec:
     - alert: KubeNodeNotReady
       annotations:
         description: '{{ $labels.node }} has been unready for more than 15 minutes.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
         summary: Node is not ready.
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -557,7 +557,7 @@ spec:
     - alert: KubeNodeUnreachable
       annotations:
         description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
         summary: Node is unreachable.
       expr: |
         (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
@@ -567,7 +567,7 @@ spec:
     - alert: KubeletTooManyPods
       annotations:
         description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
         summary: Kubelet is running at capacity.
       expr: |
         count by(node) (
@@ -583,7 +583,7 @@ spec:
     - alert: KubeNodeReadinessFlapping
       annotations:
         description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
         summary: Node readiness status is flapping.
      expr: |
        sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@@ -593,7 +593,7 @@ spec:
     - alert: KubeletPlegDurationHigh
       annotations:
         description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
         summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
       expr: |
         node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@@ -603,7 +603,7 @@ spec:
     - alert: KubeletPodStartUpLatencyHigh
       annotations:
         description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
         summary: Kubelet Pod startup latency is too high.
       expr: |
         histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
@@ -613,7 +613,7 @@ spec:
     - alert: KubeletClientCertificateExpiration
       annotations:
         description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
         summary: Kubelet client certificate is about to expire.
       expr: |
         kubelet_certificate_manager_client_ttl_seconds < 604800
@@ -622,7 +622,7 @@ spec:
     - alert: KubeletClientCertificateExpiration
       annotations:
         description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
         summary: Kubelet client certificate is about to expire.
       expr: |
         kubelet_certificate_manager_client_ttl_seconds < 86400
@@ -631,7 +631,7 @@ spec:
     - alert: KubeletServerCertificateExpiration
       annotations:
         description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
         summary: Kubelet server certificate is about to expire.
       expr: |
         kubelet_certificate_manager_server_ttl_seconds < 604800
@@ -640,7 +640,7 @@ spec:
     - alert: KubeletServerCertificateExpiration
       annotations:
         description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
         summary: Kubelet server certificate is about to expire.
      expr: |
        kubelet_certificate_manager_server_ttl_seconds < 86400
@@ -649,7 +649,7 @@ spec:
     - alert: KubeletClientCertificateRenewalErrors
       annotations:
         description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
         summary: Kubelet has failed to renew its client certificate.
       expr: |
         increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
@@ -659,7 +659,7 @@ spec:
     - alert: KubeletServerCertificateRenewalErrors
       annotations:
         description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
         summary: Kubelet has failed to renew its server certificate.
       expr: |
         increase(kubelet_server_expiration_renew_errors[5m]) > 0
@@ -669,7 +669,7 @@ spec:
     - alert: KubeletDown
       annotations:
         description: Kubelet has disappeared from Prometheus target discovery.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
         summary: Target disappeared from Prometheus target discovery.
       expr: |
         absent(up{job="kubelet", metrics_path="/metrics"} == 1)
@@ -681,7 +681,7 @@ spec:
     - alert: KubeSchedulerDown
       annotations:
         description: KubeScheduler has disappeared from Prometheus target discovery.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
         summary: Target disappeared from Prometheus target discovery.
       expr: |
         absent(up{job="kube-scheduler"} == 1)
@@ -693,7 +693,7 @@ spec:
     - alert: KubeControllerManagerDown
       annotations:
         description: KubeControllerManager has disappeared from Prometheus target discovery.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
         summary: Target disappeared from Prometheus target discovery.
       expr: |
         absent(up{job="kube-controller-manager"} == 1)
diff --git a/manifests/node-exporter-prometheusRule.yaml b/manifests/node-exporter-prometheusRule.yaml
index 295f35bc20c8b3dd3234f6da079a3ba34d040957..cd91f442bd227ebc73435b41716a566392110b32 100644
--- a/manifests/node-exporter-prometheusRule.yaml
+++ b/manifests/node-exporter-prometheusRule.yaml
@@ -17,7 +17,7 @@ spec:
     - alert: NodeFilesystemSpaceFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
         summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |
        (
@@ -33,7 +33,7 @@ spec:
     - alert: NodeFilesystemSpaceFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
         summary: Filesystem is predicted to run out of space within the next 4 hours.
       expr: |
         (
@@ -49,7 +49,7 @@ spec:
     - alert: NodeFilesystemAlmostOutOfSpace
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
         summary: Filesystem has less than 5% space left.
       expr: |
         (
@@ -63,7 +63,7 @@ spec:
     - alert: NodeFilesystemAlmostOutOfSpace
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
         summary: Filesystem has less than 3% space left.
       expr: |
         (
@@ -77,7 +77,7 @@ spec:
     - alert: NodeFilesystemFilesFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
         summary: Filesystem is predicted to run out of inodes within the next 24 hours.
       expr: |
         (
@@ -93,7 +93,7 @@ spec:
     - alert: NodeFilesystemFilesFillingUp
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
         summary: Filesystem is predicted to run out of inodes within the next 4 hours.
       expr: |
         (
@@ -109,7 +109,7 @@ spec:
     - alert: NodeFilesystemAlmostOutOfFiles
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
         summary: Filesystem has less than 5% inodes left.
       expr: |
         (
@@ -123,7 +123,7 @@ spec:
     - alert: NodeFilesystemAlmostOutOfFiles
       annotations:
         description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
         summary: Filesystem has less than 3% inodes left.
      expr: |
        (
@@ -137,7 +137,7 @@ spec:
     - alert: NodeNetworkReceiveErrs
       annotations:
         description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
         summary: Network interface is reporting many receive errors.
       expr: |
         rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
@@ -147,7 +147,7 @@ spec:
     - alert: NodeNetworkTransmitErrs
       annotations:
         description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
         summary: Network interface is reporting many transmit errors.
       expr: |
         rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
@@ -157,7 +157,7 @@ spec:
     - alert: NodeHighNumberConntrackEntriesUsed
       annotations:
         description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
         summary: Number of conntrack are getting close to the limit.
       expr: |
         (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
@@ -166,7 +166,7 @@ spec:
     - alert: NodeTextFileCollectorScrapeError
       annotations:
         description: Node Exporter text file collector failed to scrape.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
         summary: Node Exporter text file collector failed to scrape.
       expr: |
         node_textfile_scrape_error{job="node-exporter"} == 1
@@ -175,7 +175,7 @@ spec:
     - alert: NodeClockSkewDetected
       annotations:
         description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
         summary: Clock skew detected.
       expr: |
         (
@@ -195,7 +195,7 @@ spec:
     - alert: NodeClockNotSynchronising
       annotations:
         description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
         summary: Clock not synchronising.
       expr: |
         min_over_time(node_timex_sync_status[5m]) == 0
@@ -207,7 +207,7 @@ spec:
     - alert: NodeRAIDDegraded
       annotations:
         description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
         summary: RAID Array is degraded
       expr: |
         node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
@@ -217,7 +217,7 @@ spec:
     - alert: NodeRAIDDiskFailure
       annotations:
         description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
         summary: Failed device in RAID array
       expr: |
         node_md_disks{state="failed"} > 0
@@ -226,7 +226,7 @@ spec:
     - alert: NodeFileDescriptorLimit
       annotations:
         description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefiledescriptorlimit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
         summary: Kernel is predicted to exhaust file descriptors limit soon.
       expr: |
         (
@@ -238,7 +238,7 @@ spec:
     - alert: NodeFileDescriptorLimit
       annotations:
         description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefiledescriptorlimit
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
         summary: Kernel is predicted to exhaust file descriptors limit soon.
       expr: |
         (
diff --git a/manifests/prometheus-operator-prometheusRule.yaml b/manifests/prometheus-operator-prometheusRule.yaml
index 141af2eb7e191e591104da1a7d48e75c0d33704e..3d462c51f847baee7cb2bc86b3c5de388f226ba5 100644
--- a/manifests/prometheus-operator-prometheusRule.yaml
+++ b/manifests/prometheus-operator-prometheusRule.yaml
@@ -17,7 +17,7 @@ spec:
     - alert: PrometheusOperatorListErrors
       annotations:
         description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
         summary: Errors while performing list operations in controller.
       expr: |
         (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -27,7 +27,7 @@ spec:
     - alert: PrometheusOperatorWatchErrors
       annotations:
         description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
         summary: Errors while performing watch operations in controller.
      expr: |
        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
@@ -37,7 +37,7 @@ spec:
     - alert: PrometheusOperatorSyncFailed
       annotations:
         description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
         summary: Last controller reconciliation failed
       expr: |
         min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
@@ -47,7 +47,7 @@ spec:
     - alert: PrometheusOperatorReconcileErrors
       annotations:
         description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
         summary: Errors while reconciling controller.
       expr: |
         (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
@@ -57,7 +57,7 @@ spec:
     - alert: PrometheusOperatorNodeLookupErrors
       annotations:
         description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
         summary: Errors while reconciling Prometheus.
       expr: |
         rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
@@ -67,7 +67,7 @@ spec:
     - alert: PrometheusOperatorNotReady
       annotations:
         description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
-        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
         summary: Prometheus operator not ready
       expr: |
         min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
@@ -77,7 +77,7 @@ spec:
     - alert: PrometheusOperatorRejectedResources
       annotations:
         description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
- runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources summary: Resources rejected by Prometheus operator expr: | min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml index c9063ba01e1cbb1a65fad9a1c3c4bb88322f2b0e..5dd5b248894f899b7c72141bc3d8008d126dfbea 100644 --- a/manifests/prometheus-prometheusRule.yaml +++ b/manifests/prometheus-prometheusRule.yaml @@ -17,7 +17,7 @@ spec: - alert: PrometheusBadConfig annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig summary: Failed Prometheus configuration reload. expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -29,7 +29,7 @@ spec: - alert: PrometheusNotificationQueueRunningFull annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull summary: Prometheus alert notification queue predicted to run full in less than 30m. expr: | # Without min_over_time, failed scrapes could create false negatives, see @@ -45,7 +45,7 @@ spec: - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. expr: | ( @@ -61,7 +61,7 @@ spec: - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers summary: Prometheus is not connected to any Alertmanagers. expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -73,7 +73,7 @@ spec: - alert: PrometheusTSDBReloadsFailing annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. 
expr: | increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 @@ -83,7 +83,7 @@ spec: - alert: PrometheusTSDBCompactionsFailing annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: | increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 @@ -93,7 +93,7 @@ spec: - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples summary: Prometheus is not ingesting samples. expr: | ( @@ -111,7 +111,7 @@ spec: - alert: PrometheusDuplicateTimestamps annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. expr: | rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -121,7 +121,7 @@ spec: - alert: PrometheusOutOfOrderTimestamps annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: | rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -131,7 +131,7 @@ spec: - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures summary: Prometheus fails to send samples to remote storage. expr: | ( @@ -151,7 +151,7 @@ spec: - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind summary: Prometheus remote write is behind. 
expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -168,7 +168,7 @@ spec: - alert: PrometheusRemoteWriteDesiredShards annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -184,7 +184,7 @@ spec: - alert: PrometheusRuleFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures summary: Prometheus is failing rule evaluations. expr: | increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -194,7 +194,7 @@ spec: - alert: PrometheusMissingRuleEvaluations annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: | increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -204,7 +204,7 @@ spec: - alert: PrometheusTargetLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: | increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -214,7 +214,7 @@ spec: - alert: PrometheusLabelLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuslabellimithit + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. 
expr: | increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 @@ -224,7 +224,7 @@ spec: - alert: PrometheusTargetSyncFailure annotations: description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetsyncfailure + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure summary: Prometheus has failed to sync targets. expr: | increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 @@ -234,7 +234,7 @@ spec: - alert: PrometheusErrorSendingAlertsToAnyAlertmanager annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. expr: | min without (alertmanager) (