Skip to content
Snippets Groups Projects
prometheus-rules.yaml 94.5 KiB
Newer Older
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
paulfantom's avatar
paulfantom committed
  - name: node-exporter.rules
    rules:
    - expr: |
        count without (cpu) (
          count without (mode) (
            node_cpu_seconds_total{job="node-exporter"}
          )
        )
      record: instance:node_num_cpu:sum
    - expr: |
        1 - avg without (cpu, mode) (
          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
        )
      record: instance:node_cpu_utilisation:rate1m
    - expr: |
        (
          node_load1{job="node-exporter"}
        /
          instance:node_num_cpu:sum{job="node-exporter"}
        )
      record: instance:node_load1_per_cpu:ratio
    - expr: |
        1 - (
          node_memory_MemAvailable_bytes{job="node-exporter"}
        /
          node_memory_MemTotal_bytes{job="node-exporter"}
        )
      record: instance:node_memory_utilisation:ratio
    - expr: |
        rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
      record: instance:node_vmstat_pgmajfault:rate1m
paulfantom's avatar
paulfantom committed
    - expr: |
Paul Gier's avatar
Paul Gier committed
        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
paulfantom's avatar
paulfantom committed
      record: instance_device:node_disk_io_time_seconds:rate1m
    - expr: |
Paul Gier's avatar
Paul Gier committed
        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
paulfantom's avatar
paulfantom committed
      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_drop_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_drop_excluding_lo:rate1m
Lili Cosic's avatar
Lili Cosic committed
  - name: kube-apiserver.rules
paulfantom's avatar
paulfantom committed
    rules:
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate1d
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate1h
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate2h
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate30m
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate3d
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate5m
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
            -
            (
              (
                sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
                or
                vector(0)
              )
              +
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
            )
          )
          +
          # errors
          sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
paulfantom's avatar
paulfantom committed
        )
Lili Cosic's avatar
Lili Cosic committed
        /
        sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: read
      record: apiserver_request:burnrate6h
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate1d
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate1h
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate2h
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate30m
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate3d
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate5m
paulfantom's avatar
paulfantom committed
    - expr: |
Lili Cosic's avatar
Lili Cosic committed
        (
          (
            # too slow
            sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
            -
            sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
          )
          +
          sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
        )
paulfantom's avatar
paulfantom committed
        /
Lili Cosic's avatar
Lili Cosic committed
        sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
paulfantom's avatar
paulfantom committed
      labels:
Lili Cosic's avatar
Lili Cosic committed
        verb: write
      record: apiserver_request:burnrate6h
    - expr: |
        sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
      labels:
        verb: read
      record: code_resource:apiserver_request_total:rate5m
    - expr: |
        sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
      labels:
        verb: write
      record: code_resource:apiserver_request_total:rate5m
    - expr: |
        histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
      labels:
        quantile: "0.99"
        verb: read
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
      labels:
        quantile: "0.99"
        verb: write
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
        /
        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
      record: cluster:apiserver_request_duration_seconds:mean5m
    - expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
  - interval: 3m
    name: kube-apiserver-availability.rules
    rules:
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        1 - (
          (
            # write too slow
            sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
            -
            sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
          ) +
          (
            # read too slow
            sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
            -
            (
              (
                sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
                or
                vector(0)
              )
              +
              sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
              +
Lili Cosic's avatar
Lili Cosic committed
              sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
            )
          ) +
          # errors
          sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
Lili Cosic's avatar
Lili Cosic committed
        )
        /
        sum(code:apiserver_request_total:increase30d)
      labels:
        verb: all
      record: apiserver_request:availability30d
    - expr: |
        1 - (
          sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
          -
          (
            # too slow
            (
              sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
              or
              vector(0)
            )
            +
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
            +
Lili Cosic's avatar
Lili Cosic committed
            sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
          )
          +
          # errors
          sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
Lili Cosic's avatar
Lili Cosic committed
        )
        /
        sum(code:apiserver_request_total:increase30d{verb="read"})
      labels:
        verb: read
      record: apiserver_request:availability30d
    - expr: |
        1 - (
          (
            # too slow
            sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
            -
            sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
          )
          +
          # errors
          sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
Lili Cosic's avatar
Lili Cosic committed
        )
        /
        sum(code:apiserver_request_total:increase30d{verb="write"})
      labels:
        verb: write
      record: apiserver_request:availability30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
Lili Cosic's avatar
Lili Cosic committed
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Lili Cosic's avatar
Lili Cosic committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Simon Pasquier's avatar
Simon Pasquier committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))
      record: code_verb:apiserver_request_total:increase30d
    - expr: |
        sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
      labels:
        verb: read
      record: code:apiserver_request_total:increase30d
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
    - expr: |
        sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
      labels:
        verb: write
      record: code:apiserver_request_total:increase30d
omerlh's avatar
omerlh committed
        sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
      record: namespace:container_cpu_usage_seconds_total:sum_rate
    - expr: |
paulfantom's avatar
paulfantom committed
        sum by (cluster, namespace, pod, container) (
omerlh's avatar
omerlh committed
          rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
paulfantom's avatar
paulfantom committed
        ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
          1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
paulfantom's avatar
paulfantom committed
        )
liuxu's avatar
liuxu committed
      record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
omerlh's avatar
omerlh committed
        container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
paulfantom's avatar
paulfantom committed
        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
          max by(namespace, pod, node) (kube_pod_info{node!=""})
paulfantom's avatar
paulfantom committed
        )
      record: node_namespace_pod_container:container_memory_working_set_bytes
    - expr: |
omerlh's avatar
omerlh committed
        container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
paulfantom's avatar
paulfantom committed
        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
          max by(namespace, pod, node) (kube_pod_info{node!=""})
paulfantom's avatar
paulfantom committed
        )
      record: node_namespace_pod_container:container_memory_rss
    - expr: |
omerlh's avatar
omerlh committed
        container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
paulfantom's avatar
paulfantom committed
        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
          max by(namespace, pod, node) (kube_pod_info{node!=""})
paulfantom's avatar
paulfantom committed
        )
      record: node_namespace_pod_container:container_memory_cache
    - expr: |
omerlh's avatar
omerlh committed
        container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
paulfantom's avatar
paulfantom committed
        * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
          max by(namespace, pod, node) (kube_pod_info{node!=""})
paulfantom's avatar
paulfantom committed
        )
      record: node_namespace_pod_container:container_memory_swap
omerlh's avatar
omerlh committed
        sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
Simon Pasquier's avatar
Simon Pasquier committed
        sum by (namespace) (
            sum by (namespace, pod) (
                max by (namespace, pod, container) (
                    kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
                ) * on(namespace, pod) group_left() max by (namespace, pod) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                )
            )
      record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
    - expr: |
Simon Pasquier's avatar
Simon Pasquier committed
        sum by (namespace) (
            sum by (namespace, pod) (
                max by (namespace, pod, container) (
                    kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
                ) * on(namespace, pod) group_left() max by (namespace, pod) (
                  kube_pod_status_phase{phase=~"Pending|Running"} == 1
                )
            )
        )
      record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
    - expr: |
paulfantom's avatar
paulfantom committed
        max by (cluster, namespace, workload, pod) (
          label_replace(
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
              "replicaset", "$1", "owner_name", "(.*)"
paulfantom's avatar
paulfantom committed
            ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
              1, max by (replicaset, namespace, owner_name) (
                kube_replicaset_owner{job="kube-state-metrics"}
              )
            ),
            "workload", "$1", "owner_name", "(.*)"
          )
paulfantom's avatar
paulfantom committed
        )
      labels:
        workload_type: deployment
      record: namespace_workload_pod:kube_pod_owner:relabel
    - expr: |
paulfantom's avatar
paulfantom committed
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
paulfantom's avatar
paulfantom committed
        )
      labels:
        workload_type: daemonset
      record: namespace_workload_pod:kube_pod_owner:relabel
    - expr: |
paulfantom's avatar
paulfantom committed
        max by (cluster, namespace, workload, pod) (
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
paulfantom's avatar
paulfantom committed
        )
      labels:
        workload_type: statefulset
      record: namespace_workload_pod:kube_pod_owner:relabel
  - name: kube-scheduler.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
        histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
        histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
        histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
paulfantom's avatar
paulfantom committed
    - expr: |
        sum(min(kube_pod_info{node!=""}) by (cluster, node))
      record: ':kube_pod_info_node_count:'
    - expr: |
paulfantom's avatar
paulfantom committed
        topk by(namespace, pod) (1,
          max by (node, namespace, pod) (
            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
paulfantom's avatar
paulfantom committed
        ))
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
paulfantom's avatar
paulfantom committed
        count by (cluster, node) (sum by (node, cpu) (
Dmitry Verkhoturov's avatar
Dmitry Verkhoturov committed
          node_cpu_seconds_total{job="node-exporter"}
        * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
        sum(
          node_memory_MemAvailable_bytes{job="node-exporter"} or
          (
            node_memory_Buffers_bytes{job="node-exporter"} +
            node_memory_Cached_bytes{job="node-exporter"} +
            node_memory_MemFree_bytes{job="node-exporter"} +
            node_memory_Slab_bytes{job="node-exporter"}
          )
paulfantom's avatar
paulfantom committed
        ) by (cluster)
Sergiusz Urbaniak's avatar
Sergiusz Urbaniak committed
      record: :node_memory_MemAvailable_bytes:sum
paulfantom's avatar
paulfantom committed
  - name: kubelet.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.99"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.9"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.5"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
        (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
        (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
        BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
        BY (instance, cpu))
      record: cluster:node_cpu:ratio
  - name: kube-prometheus-general.rules
    rules:
    - expr: count without(instance, pod, node) (up == 1)
      record: count:up1
    - expr: count without(instance, pod, node) (up == 0)
      record: count:up0
  - name: kube-state-metrics
    rules:
    - alert: KubeStateMetricsListErrors
      annotations:
        description: kube-state-metrics is experiencing errors at an elevated rate
          in list operations. This is likely causing it to not be able to expose metrics
          about Kubernetes objects correctly or at all.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
        summary: kube-state-metrics is experiencing errors in list operations.
      expr: |
        (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
          /
        sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
        > 0.01
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsWatchErrors
      annotations:
        description: kube-state-metrics is experiencing errors at an elevated rate
          in watch operations. This is likely causing it to not be able to expose
          metrics about Kubernetes objects correctly or at all.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
        summary: kube-state-metrics is experiencing errors in watch operations.
      expr: |
        (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
          /
        sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
        > 0.01
      for: 15m
      labels:
        severity: critical
paulfantom's avatar
paulfantom committed
  - name: node-exporter
    rules:
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available space left and is filling
          up.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 24 hours.
      expr: |
        (
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemSpaceFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available space left and is filling
          up fast.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
        summary: Filesystem is predicted to run out of space within the next 4 hours.
      expr: |
        (
iuri aranda's avatar
iuri aranda committed
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available space left.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
        summary: Filesystem has less than 5% space left.
      expr: |
        (
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfSpace
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available space left.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
        summary: Filesystem has less than 3% space left.
      expr: |
        (
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available inodes left and is filling
          up.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemFilesFillingUp
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available inodes left and is filling
          up fast.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
      expr: |
        (
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available inodes left.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 5% inodes left.
      expr: |
        (
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: warning
    - alert: NodeFilesystemAlmostOutOfFiles
      annotations:
        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
          has only {{ printf "%.2f" $value }}% available inodes left.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
        summary: Filesystem has less than 3% inodes left.
      expr: |
        (
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
paulfantom's avatar
paulfantom committed
        and
Lili Cosic's avatar
Lili Cosic committed
          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
paulfantom's avatar
paulfantom committed
        )
      for: 1h
      labels:
        severity: critical
    - alert: NodeNetworkReceiveErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
          {{ printf "%.0f" $value }} receive errors in the last two minutes.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
        summary: Network interface is reporting many receive errors.
      expr: |
        increase(node_network_receive_errs_total[2m]) > 10
      for: 1h
      labels:
        severity: warning
    - alert: NodeNetworkTransmitErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
          {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
        summary: Network interface is reporting many transmit errors.
      expr: |
        increase(node_network_transmit_errs_total[2m]) > 10
      for: 1h
      labels:
        severity: warning
paulfantom's avatar
paulfantom committed
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
        description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
paulfantom's avatar
paulfantom committed
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
        summary: Number of conntrack are getting close to the limit.
paulfantom's avatar
paulfantom committed
      expr: |
        (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: warning
    - alert: NodeTextFileCollectorScrapeError
      annotations:
        description: Node Exporter text file collector failed to scrape.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
        summary: Node Exporter text file collector failed to scrape.
      expr: |
        node_textfile_scrape_error{job="node-exporter"} == 1
      labels:
        severity: warning
paulfantom's avatar
paulfantom committed
    - alert: NodeClockSkewDetected
      annotations:
        message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
          Ensure NTP is configured correctly on this host.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
        summary: Clock skew detected.
      expr: |
        (