diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1f9a7a8897d878fce2ced68dfa8f0197bc7ca71c..31f2ffcf85ff401fd1240833a1bfde3884cbacd9 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -796,7 +796,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1920,7 +1920,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu{mode=\"idle\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2172,7 +2172,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemFreeCachedBuffers:sum) / sum(:node_memory_MemTotal:sum)", + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2256,7 +2256,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal:sum)", + "expr": 
"sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -2340,7 +2340,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal:sum)", + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5011,7 +5011,7 @@ items: "show": true }, { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -5064,7 +5064,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", + "expr": "avg by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m])) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cpu}}", @@ -5076,7 +5076,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "System load", + "title": "Usage Per Core", "tooltip": { "shared": true, "sort": 0, @@ -5168,7 +5168,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "avg (sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{ cpu }}", @@ -5276,7 +5276,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "avg(sum by (cpu) (irate(node_cpu{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", 
mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5352,28 +5352,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory used", "refId": "A" }, { - "expr": "max(node_memory_Buffers{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory buffers", "refId": "B" }, { - "expr": "max(node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory cached", "refId": "C" }, { - "expr": "max(node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"})", + "expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "memory free", @@ -5481,7 +5481,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(\n (\n (\n node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers{job=\"node-exporter\", 
instance=\"$instance\"}\n - node_memory_Cached{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "" @@ -5564,21 +5564,21 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_disk_bytes_read{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "read", "refId": "A" }, { - "expr": "max(rate(node_disk_bytes_written{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "written", "refId": "B" }, { - "expr": "max(rate(node_disk_io_time_ms{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "io time", @@ -5773,7 +5773,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_receive_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ 
-5864,7 +5864,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "max(rate(node_network_transmit_bytes{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -5958,7 +5958,7 @@ items: "options": [ ], - "query": "label_values(node_boot_time{job=\"node-exporter\"}, instance)", + "query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 11a2330711399ead5b138cb08e3651056e499021..f0e668a16b001addfdca63307dd50d98545d6787 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -122,17 +122,17 @@ spec: record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (node) (sum by (node, cpu) ( - node_cpu{job="node-exporter"} + node_cpu_seconds_total{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: )) record: node:node_num_cpu:sum - expr: | - 1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m])) + 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) record: :node_cpu_utilisation:avg1m - expr: | 1 - avg by (node) ( - rate(node_cpu{job="node-exporter",mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilisation:avg1m @@ -152,26 +152,26 @@ spec: record: 'node:node_cpu_saturation_load1:' - expr: | 1 - - sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + 
node_memory_Buffers_bytes{job="node-exporter"}) / - sum(node_memory_MemTotal{job="node-exporter"}) + sum(node_memory_MemTotal_bytes{job="node-exporter"}) record: ':node_memory_utilisation:' - expr: | - sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) - record: :node_memory_MemFreeCachedBuffers:sum + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum - expr: | - sum(node_memory_MemTotal{job="node-exporter"}) - record: :node_memory_MemTotal:sum + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum - expr: | sum by (node) ( - (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_memory_bytes_available:sum - expr: | sum by (node) ( - node_memory_MemTotal{job="node-exporter"} + node_memory_MemTotal_bytes{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -190,13 +190,13 @@ spec: - expr: | 1 - sum by (node) ( - (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) / sum by (node) ( - node_memory_MemTotal{job="node-exporter"} + node_memory_MemTotal_bytes{job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ 
-213,21 +213,21 @@ spec: ) record: node:node_memory_swap_io_bytes:sum_rate - expr: | - avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])) record: :node_disk_utilisation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_disk_utilisation:avg_irate - expr: | - avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m])) record: :node_disk_saturation:avg_irate - expr: | avg by (node) ( - irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -241,25 +241,25 @@ spec: max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | - sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) record: :node_net_utilisation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) + -
irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilisation:sum_irate - expr: | - sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -688,8 +688,8 @@ spec: severity: warning - alert: KubeCronJobRunning annotations: - message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking - more than 1h to complete. + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more + than 1h to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning expr: | time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 @@ -698,8 +698,8 @@ spec: severity: warning - alert: KubeJobCompletion annotations: - message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than - one hour to complete. + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than one hour to complete. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion expr: | kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 @@ -739,7 +739,7 @@ spec: expr: | sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) / - sum(node_memory_MemTotal) + sum(node_memory_MemTotal_bytes) > (count(node:node_num_cpu:sum)-1) / @@ -766,7 +766,7 @@ spec: expr: | sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) / - sum(node_memory_MemTotal{job="node-exporter"}) + sum(node_memory_MemTotal_bytes{job="node-exporter"}) > 1.5 for: 5m labels: @@ -801,7 +801,7 @@ spec: - alert: KubePersistentVolumeUsageCritical annotations: message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical expr: | @@ -816,14 +816,14 @@ spec: annotations: message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four - days. Currently {{ $value }} bytes are available. + days. Currently {{ printf "%0.2f" $value }}% is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays expr: | - ( - kubelet_volume_stats_used_bytes{job="kubelet"} + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} - ) > 0.85 + ) < 15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 for: 5m