diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 1f6d5c6983daabf84ead1c60f06dad099961aaa0..1e3eff81b8b168c7883acccb341e51b6a79d5f45 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -27,7 +27,7 @@ "subdir": "" } }, - "version": "d9b461b0692ddfff6c5d2a189443cfe4beefb3b2" + "version": "3da19e5a40fbb9f7a621958040472db918c4de9c" }, { "name": "grafonnet", @@ -47,7 +47,7 @@ "subdir": "grafana-builder" } }, - "version": "2b9b14d0d91adf8781e5b2c9b62dc8cb180a9886" + "version": "e59d64a96a73e65ba53ba7fe05c9598974cc4a52" }, { "name": "grafana", @@ -57,7 +57,7 @@ "subdir": "grafana" } }, - "version": "5df496bc1199b40bd066a8c228d94d9653173645" + "version": "567be6b15b7f3b747c48dc7b111c1860cab121c7" }, { "name": "prometheus-operator", @@ -77,7 +77,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "efd1fc634b58a629903990e605f2cb9d5633706d" + "version": "3ef2ad8e115449a7004b628a873e2629855ed468" }, { "name": "prometheus", @@ -87,7 +87,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "08c55c119f39093e18b2bb9cba5c5619dc4ea0e1" + "version": "b05b5f9a300b0209689c06d70f676291f23774c4" }, { "name": "node-mixin", @@ -97,7 +97,7 @@ "subdir": "docs/node-mixin" } }, - "version": "27b8c93a5afc21632239890c4558c7300cca17d2" + "version": "9f49fff79ef85fcebb69289622150e6d5346528b" }, { "name": "promgrafonnet", @@ -107,7 +107,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "d9b461b0692ddfff6c5d2a189443cfe4beefb3b2" + "version": "3da19e5a40fbb9f7a621958040472db918c4de9c" } ] } diff --git a/kustomization.yaml b/kustomization.yaml index a580ed8e6602627dd362095bc07356b0dc55d801..95f9dc3d5f9769a20e95640e8fd198e090071611 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -22,6 +22,7 @@ resources: - ./manifests/grafana-dashboardDefinitions.yaml - ./manifests/grafana-dashboardSources.yaml - ./manifests/grafana-deployment.yaml +- ./manifests/grafana-rawDashboardDefinitions.yaml - ./manifests/grafana-service.yaml - ./manifests/grafana-serviceAccount.yaml - ./manifests/grafana-serviceMonitor.yaml diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 6120794d77fa06d32c9f4f88c1c8be1cac277589..799c493c8427866a1fdeeb5a7751f2256c602db6 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3093,7 +3093,7 @@ items: "decimals": 0, "link": true, "linkTooltip": "Drill down to pods", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #A", "thresholds": [ @@ -3111,7 +3111,7 @@ items: "decimals": 0, "link": true, "linkTooltip": "Drill down to workloads", - "linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #B", "thresholds": [ @@ -3219,7 +3219,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down to pods", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -3512,7 +3512,7 @@ items: "decimals": 0, "link": true, "linkTooltip": "Drill down to pods", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #A", "thresholds": [ @@ -3530,7 +3530,7 @@ items: "decimals": 0, "link": true, "linkTooltip": "Drill down to workloads", - "linkUrl": "/d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", + "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", "pattern": "Value #B", "thresholds": [ @@ -3638,7 +3638,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down to pods", - "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", "pattern": "namespace", "thresholds": [ @@ -4130,7 +4130,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -4549,7 +4549,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -6067,7 +6067,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -6432,7 +6432,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -7005,7 +7005,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", + "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", "pattern": "workload", "thresholds": [ @@ -7415,7 +7415,7 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "/d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", + "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", "pattern": "workload", "thresholds": [ diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 769e655987a350f57ed552f57fcc3e66852a2c6d..eeb7af5aecf8172294b0f34439f4a6b368b1d9a0 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -242,7 +242,7 @@ spec: summary: Filesystem is predicted to run out of space within the next 24 hours. expr: | ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} < 0.4 + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 and @@ -260,7 +260,7 @@ spec: summary: Filesystem is predicted to run out of space within the next 4 hours. expr: | ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} < 0.2 + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 and @@ -308,7 +308,7 @@ spec: summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: | ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} < 0.4 + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 and predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 and @@ -326,7 +326,7 @@ spec: summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: | ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} < 0.2 + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 and predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 and @@ -573,13 +573,13 @@ spec: severity: critical - alert: KubeDaemonSetRolloutStuck annotations: - message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace - }}/{{ $labels.daemonset }} are scheduled and ready. + message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet + {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck expr: | kube_daemonset_status_number_ready{job="kube-state-metrics"} / - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00 for: 15m labels: severity: critical @@ -718,25 +718,28 @@ spec: severity: warning - alert: KubeQuotaExceeded annotations: - message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value - }}% of its {{ $labels.resource }} quota. + message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded expr: | - 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 90 + > 0.90 for: 15m labels: severity: warning - alert: CPUThrottlingHigh annotations: - message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace - }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + message: '{{ $value | humanizePercentage }} throttling of CPU in namespace + {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ + $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", - }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) - by (container, pod, namespace)\n > 25 \n" + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) for: 15m labels: severity: warning @@ -745,14 +748,14 @@ spec: - alert: KubePersistentVolumeUsageCritical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value - }}% free. + }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage + }} free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | - 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} - < 3 + < 0.03 for: 1m labels: severity: critical @@ -760,14 +763,14 @@ spec: annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four - days. Currently {{ printf "%0.2f" $value }}% is available. + days. Currently {{ $value | humanizePercentage }} is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | - 100 * ( + ( kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} - ) < 15 + ) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m @@ -807,23 +810,23 @@ spec: - alert: KubeClientErrors annotations: message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance - }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + }}' is experiencing {{ $value | humanizePercentage }} errors.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors expr: | (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) / sum(rate(rest_client_requests_total[5m])) by (instance, job)) - * 100 > 1 + > 0.01 for: 15m labels: severity: warning - alert: KubeletTooManyPods annotations: - message: Kubelet '{{ $labels.node }}' is running at {{ printf "%.4g" $value - }}% of its Pod capacity. + message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage + }} of its Pod capacity. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods expr: | - 100 * max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 95 + max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95 for: 15m labels: severity: warning @@ -849,47 +852,51 @@ spec: severity: critical - alert: KubeAPIErrorsHigh annotations: - message: API server is returning errors for {{ $value }}% of requests. + message: API server is returning errors for {{ $value | humanizePercentage + }} of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 + sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: - message: API server is returning errors for {{ $value }}% of requests. + message: API server is returning errors for {{ $value | humanizePercentage + }} of requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 + sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01 for: 10m labels: severity: warning - alert: KubeAPIErrorsHigh annotations: - message: API server is returning errors for {{ $value }}% of requests for - {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + message: API server is returning errors for {{ $value | humanizePercentage + }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource + }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 + sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10 for: 10m labels: severity: critical - alert: KubeAPIErrorsHigh annotations: - message: API server is returning errors for {{ $value }}% of requests for - {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. + message: API server is returning errors for {{ $value | humanizePercentage + }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource + }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh expr: | sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 + sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05 for: 10m labels: severity: warning