diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet
index 19568a24cb680b3da24463207d008fe019a226c6..1b2d94eb2c207e30fd9b92db70374971e15bc05f 100644
--- a/jsonnet/kube-prometheus/alerts/alerts.libsonnet
+++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet
@@ -1,4 +1,5 @@
 (import 'alertmanager.libsonnet') +
 (import 'general.libsonnet') +
 (import 'node.libsonnet') +
-(import 'prometheus.libsonnet')
+(import 'prometheus.libsonnet') +
+(import 'prometheus-operator.libsonnet')
diff --git a/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..f851caa0b11c99818e2f103efb6b3d72c3d3b868
--- /dev/null
+++ b/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet
@@ -0,0 +1,50 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'prometheus-operator',
+        rules: [
+          {
+            alert: 'PrometheusOperatorAlertmanagerReconcileErrors',
+            expr: |||
+              rate(prometheus_operator_alertmanager_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while reconciling Alertmanager in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+          {
+            alert: 'PrometheusOperatorPrometheusReconcileErrors',
+            expr: |||
+              rate(prometheus_operator_prometheus_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+          {
+            alert: 'PrometheusOperatorNodeLookupErrors',
+            expr: |||
+              rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Errors while looking up node addresses in {{ $labels.namespace }} Namespace.',
+            },
+            'for': '10m',
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/jsonnetfile.json b/jsonnetfile.json
index b4ebb0f2d8395fb3455d878435a9e3ef059bc496..619586b23b2ed4030f66bb5101abfff465f8463f 100644
--- a/jsonnetfile.json
+++ b/jsonnetfile.json
@@ -11,4 +11,4 @@
             "version": "."
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json
index 613b0ad802e9c3eac2e911abc54cdec263631817..e6904980d81a78d98645dd11df76875788ab8cd8 100644
--- a/jsonnetfile.lock.json
+++ b/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
                     "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
                 }
             },
-            "version": "ce4ab08d6791161267204d9a61588e64f1b57e05"
+            "version": "bffc85d6e76f6341d5370af68ea980030ab402e8"
         },
         {
             "name": "ksonnet",
@@ -28,7 +28,7 @@
                     "subdir": ""
                 }
             },
-            "version": "d445c4d98fdf88fd3c59bb34ca4b0f82536f878c"
+            "version": "c70814dcafce1b51357938e09ee1192998a95706"
         },
         {
             "name": "grafonnet",
@@ -78,7 +78,7 @@
                     "subdir": "Documentation/etcd-mixin"
                 }
             },
-            "version": "1df1ddff4361ed7f2c0f33571923511889a115ce"
+            "version": "001bbb97ccea05cb0d5f6e97c3939654244e8998"
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index e8f39619fc11829fa76567ca46ee22e1dec8d87a..af68467a0339c0892e62199711c4a641b3ab823b 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -4707,7 +4707,7 @@ items:
                           "step": 10
                       },
                       {
-                          "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container_name!=\"\"}) by (container)",
+                          "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)",
                           "format": "table",
                           "instant": true,
                           "intervalFactor": 2,
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index 7958b926fcffdf81bf2c4a953af6124d508c6852..221fa7260682a9e1b8ae72233238fca1034d6b2a 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -838,7 +838,7 @@ spec:
           the limit of 110.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 100
+        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
       for: 15m
       labels:
         severity: warning
@@ -914,8 +914,8 @@
         severity: critical
     - alert: AlertmanagerDownOrMissing
       annotations:
-        description: An unexpected number of Alertmanagers are scraped or Alertmanagers
-          disappeared from discovery.
+        description: An unexpected number of Alertmanagers were scraped or disappeared
+          from discovery.
         summary: Alertmanager down or missing
       expr: |
         label_replace(prometheus_operator_alertmanager_spec_replicas{job="prometheus-operator"}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{job="alertmanager-main"}) BY (job) != 1
@@ -936,7 +936,7 @@
     rules:
     - alert: TargetDown
      annotations:
-        description: '{{ $value }}% of {{ $labels.job }} targets are down.'
+        description: '{{ $value }}% of the {{ $labels.job }} targets are down.'
        summary: Targets are down
      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
      for: 10m
@@ -944,7 +944,7 @@
        severity: warning
    - alert: DeadMansSwitch
      annotations:
-        description: This is a DeadMansSwitch meant to ensure that the entire Alerting
+        description: This is a DeadMansSwitch meant to ensure that the entire alerting
          pipeline is functional.
        summary: Alerting DeadMansSwitch
      expr: vector(1)
@@ -955,7 +955,7 @@
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 24 hours.
+          }}/{{ $labels.pod }} will be full within the next 24 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
      for: 30m
@@ -964,7 +964,7 @@
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
-          }}/{{ $labels.pod }} is running full within the next 2 hours.
+          }}/{{ $labels.pod }} will be full within the next 2 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
      for: 10m
@@ -1071,3 +1071,30 @@
      for: 10m
      labels:
        severity: warning
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorAlertmanagerReconcileErrors
+      annotations:
+        message: Errors while reconciling Alertmanager in {{ $labels.namespace }}
+          Namespace.
+      expr: |
+        rate(prometheus_operator_alertmanager_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorPrometheusReconcileErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_prometheus_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        message: Errors while looking up node addresses in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
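
Note on the new alert rules: the %(prometheusOperatorSelector)s placeholders in
jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet are substituted from the
kube-prometheus $._config object, and the rendered rules in manifests/prometheus-rules.yaml
above show the selector resolving to job="prometheus-operator". A minimal sketch of how
that default could be supplied; the exact file and field layout here are assumptions and
not part of this diff, only the field name and rendered value are taken from it:

    {
      _config+:: {
        // Assumed location for the default; the generated manifests above
        // imply this value for the prometheus-operator alert expressions.
        prometheusOperatorSelector: 'job="prometheus-operator"',
      },
    }

Overriding this field in a downstream _config would change the job selector in all three
PrometheusOperator* alert expressions the next time the manifests are regenerated.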