diff --git a/README.md b/README.md index b035aceb33c7336de336da5de6acb0cdf501ce42..f6fc98606b9e7d930cf1f781af91106f0e6b171a 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ These are the available fields with their respective default values: namespace: "default", versions+:: { - alertmanager: "v0.15.2", + alertmanager: "v0.15.3", nodeExporter: "v0.16.0", kubeStateMetrics: "v1.3.1", kubeRbacProxy: "v0.3.1", @@ -377,9 +377,9 @@ $ jsonnet -J vendor -S --tla-str repository=internal-registry.com/organization s docker pull quay.io/coreos/addon-resizer:1.0 docker tag quay.io/coreos/addon-resizer:1.0 internal-registry.com/organization/addon-resizer:1.0 docker push internal-registry.com/organization/addon-resizer:1.0 -docker pull quay.io/prometheus/alertmanager:v0.15.2 -docker tag quay.io/prometheus/alertmanager:v0.15.2 internal-registry.com/organization/alertmanager:v0.15.2 -docker push internal-registry.com/organization/alertmanager:v0.15.2 +docker pull quay.io/prometheus/alertmanager:v0.15.3 +docker tag quay.io/prometheus/alertmanager:v0.15.3 internal-registry.com/organization/alertmanager:v0.15.3 +docker push internal-registry.com/organization/alertmanager:v0.15.3 ... ``` diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 2dace0bd114074050b19cac77b2b613d950e478b..8c6ef28cec60bfeecc3d80a26060234acec94cd2 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "5185231304f688cf127bf235a4dfdf9f4f9e7821" + "version": "c9350aab06b47bcf8410b597ba50b4addf21ee3d" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "1595151b85934d55ea6969a781039d66f82b22d5" + "version": "f7ca48cca5d9cadc9a2203b8c0b3bb3eb85f3294" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "1ed195577cd8a406d4811dd6818e939169b686a7" + "version": "d407225c5a2e087eb68843528aab2be0507c73b8" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "04e51ce1caeaa4c9aed4c446c9922388a13f6cb1" + "version": "90fbdbf08cf0d4bdc78ab52151041da36a7b0abc" }, { "name": "grafana", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "83304cfc808cf6303d48c45a696f169fae422e68" + "version": "ee9dcbca0d89dc563c9e6bc725fab0c6f21d689b" } ] } diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index e800beacfa64d897384068f6adda78a4ee570daa..2230ea9e30f62e1e4ee22e3084eeb3b59198cf9d 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -11,4 +11,4 @@ spec: beta.kubernetes.io/os: linux replicas: 3 serviceAccountName: alertmanager-main - version: v0.15.2 + version: v0.15.3 diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 31f2ffcf85ff401fd1240833a1bfde3884cbacd9..e4364aa65b258b43f90d1c78077580eb0d5e495a 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -5279,7 +5279,8 @@ items: "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "80, 90", @@ -5484,7 +5485,8 @@ items: "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "80, 90", @@ -6580,7 +6582,8 @@ items: "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6659,7 +6662,8 @@ items: "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6738,7 +6742,8 @@ items: "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6832,7 +6837,8 @@ items: "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6912,7 +6918,8 @@ items: "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -6992,7 +6999,8 @@ items: "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", @@ -7072,7 +7080,8 @@ items: "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "" + "legendFormat": "", + "refId": "A" } ], "thresholds": "", diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index f0e668a16b001addfdca63307dd50d98545d6787..6a5df8a3eae69a1ae81286ebde87d2da62f91aac 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -223,22 +223,22 @@ spec: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted_seconds_total_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_saturation:avg_irate - expr: | - max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_usage:' - expr: | - max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) + max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + @@ -829,6 +829,16 @@ spec: for: 5m labels: severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{ $labels.persistentvolume }} has status {{ + $labels.phase }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical - name: kubernetes-system rules: - alert: KubeNodeNotReady