diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..6e12307c0e9ab16a21891abf2febb2d84cc7c65a --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,37 @@ +<!-- +WARNING: Not using this template will result in a longer review process and your change won't be visible in CHANGELOG. +--> + +## Description + +_Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request. +If it fixes a bug or resolves a feature request, be sure to link to that issue._ + + + +## Type of change + +_What type of changes does your code introduce to the kube-prometheus? Put an `x` in the box that apply._ + +- [ ] `CHANGE` (fix or feature that would cause existing functionality to not work as expected) +- [ ] `FEATURE` (non-breaking change which adds functionality) +- [ ] `BUGFIX` (non-breaking change which fixes an issue) +- [ ] `ENHANCEMENT` (non-breaking change which improves existing functionality) +- [ ] `NONE` (if none of the other choices apply. Example, tooling, build system, CI, docs, etc.) + +## Changelog entry + +_Please put a one-line changelog entry below. Later this will be copied to the changelog file._ + +<!-- +Your release note should be written in clear and straightforward sentences. Most often, users aren't familiar with +the technical details of your PR, so consider what they need to know when you write your release note. + +Some brief examples of release notes: +- Add metadataConfig field to the Prometheus CRD for configuring how remote-write sends metadata information. +- Generate correct scraping configuration for Probes with empty or unset module parameter. +--> + +```release-note + +``` diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 54a8e50c801de8b7cce9beff1b8125cf4828a9b3..e4b5c79e2547824f5577a268b5abe3a760c08ce4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,8 +3,8 @@ on: - push - pull_request env: - golang-version: '1.13' - kind-version: 'v0.9.0' + golang-version: '1.15' + kind-version: 'v0.11.1' jobs: generate: runs-on: ${{ matrix.os }} @@ -16,15 +16,35 @@ jobs: name: Generate steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - uses: actions/setup-go@v2 with: go-version: ${{ env.golang-version }} - - run: make --always-make generate && git diff --exit-code + - run: make --always-make generate validate && git diff --exit-code + lint: + runs-on: ubuntu-latest + name: Jsonnet linter + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: false + - run: make --always-make lint + fmt: + runs-on: ubuntu-latest + name: Jsonnet formatter + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: false + - run: make --always-make fmt && git diff --exit-code unit-tests: runs-on: ubuntu-latest name: Unit tests steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - run: make --always-make test e2e-tests: name: E2E tests @@ -32,21 +52,20 @@ jobs: strategy: matrix: kind-image: - - 'kindest/node:v1.19.0' + - 'kindest/node:v1.21.1' + - 'kindest/node:v1.22.0' steps: - uses: actions/checkout@v2 + with: + persist-credentials: false - name: Start KinD uses: engineerd/setup-kind@v0.5.0 with: version: ${{ env.kind-version }} image: ${{ matrix.kind-image }} + wait: 300s - name: Wait for cluster to finish bootstraping - run: | - until [ "$(kubectl get pods --all-namespaces --no-headers | grep -cEv '([0-9]+)/\1')" -eq 0 ]; do - sleep 5s - done - kubectl cluster-info - kubectl get pods -A + run: kubectl wait --for=condition=Ready pods --all --all-namespaces --timeout=300s - name: Create kube-prometheus stack run: | kubectl create -f manifests/setup diff --git a/.github/workflows/versions.yaml b/.github/workflows/versions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0230d4414b2c34e34b453b6b38a1d64364581c2f --- /dev/null +++ b/.github/workflows/versions.yaml @@ -0,0 +1,68 @@ +name: Upgrade to latest versions + +on: + workflow_dispatch: + schedule: + - cron: '37 7 * * 1' +jobs: + versions: + runs-on: ubuntu-latest + strategy: + matrix: + branch: + - 'release-0.6' + - 'release-0.7' + - 'release-0.8' + - 'release-0.9' + - 'main' + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ matrix.branch }} + - uses: actions/setup-go@v2 + with: + go-version: 1.16 + - name: Upgrade versions + run: | + export GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} + # Write to temporary file to make update atomic + scripts/generate-versions.sh > /tmp/versions.json + mv /tmp/versions.json jsonnet/kube-prometheus/versions.json + if: matrix.branch == 'main' + - name: Update jsonnet dependencies + run: | + make update + make generate + + # Reset jsonnetfile.lock.json if no dependencies were updated + changedFiles=$(git diff --name-only | grep -v 'jsonnetfile.lock.json' | wc -l) + if [[ "$changedFiles" -eq 0 ]]; then + git checkout -- jsonnetfile.lock.json; + fi + - name: Create Pull Request + uses: peter-evans/create-pull-request@v3 + with: + commit-message: "[bot] [${{ matrix.branch }}] Automated version update" + title: "[bot] [${{ matrix.branch }}] Automated version update" + body: | + ## Description + + This is an automated version and jsonnet dependencies update performed from CI. + + Configuration of the workflow is located in `.github/workflows/versions.yaml` + + ## Type of change + + - [x] `NONE` (if none of the other choices apply. Example, tooling, build system, CI, docs, etc.) + + ## Changelog entry + + ```release-note + + ``` + team-reviewers: kube-prometheus-reviewers + branch: automated-updates-${{ matrix.branch }} + delete-branch: true + # GITHUB_TOKEN cannot be used as it won't trigger CI in a created PR + # More in https://github.com/peter-evans/create-pull-request/issues/155 + token: ${{ secrets.PROM_OP_BOT_PAT }} diff --git a/.gitignore b/.gitignore index f334fb5652237c8c726b3c428eb457c177fd41e9..a82cecedb5f56c4cbf64a4c33bf18660775d101f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ minikube-manifests/ vendor/ ./auth .swp +crdschemas/ + +.gitpod/_output/ \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000000000000000000000000000000000000..936bc53aa570c57dacebf25cc541aa6322c954a8 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,47 @@ +image: gitpod/workspace-full +checkoutLocation: gitpod-k3s +tasks: + - init: | + make --always-make + export PATH="$(pwd)/tmp/bin:${PATH}" + cat > ${PWD}/.git/hooks/pre-commit <<EOF + #!/bin/bash + + echo "Checking jsonnet fmt" + make fmt > /dev/null 2>&1 + echo "Checking if manifests are correct" + make generate > /dev/null 2>&1 + + git diff --exit-code + if [[ \$? == 1 ]]; then + echo " + + This commit is being rejected because the YAML manifests are incorrect or jsonnet needs to be formatted." + echo "Please commit your changes again!" + exit 1 + fi + EOF + chmod +x ${PWD}/.git/hooks/pre-commit + - name: run kube-prometheus + command: | + .gitpod/prepare-k3s.sh + .gitpod/deploy-kube-prometheus.sh + - name: kernel dev environment + init: | + sudo apt update -y + sudo apt install qemu qemu-system-x86 linux-image-$(uname -r) libguestfs-tools sshpass netcat -y + sudo curl -o /usr/bin/kubectl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + sudo chmod +x /usr/bin/kubectl + .gitpod/prepare-rootfs.sh + command: | + .gitpod/qemu.sh +ports: + - port: 3000 + onOpen: open-browser + - port: 9090 + onOpen: open-browser + - port: 9093 + onOpen: open-browser +vscode: + extensions: + - heptio.jsonnet@0.1.0:woEDU5N62LRdgdz0g/I6sQ== \ No newline at end of file diff --git a/.gitpod/deploy-kube-prometheus.sh b/.gitpod/deploy-kube-prometheus.sh new file mode 100755 index 0000000000000000000000000000000000000000..fdd9c1d23e6b3a1fa4783541a84a1b705c4d5762 --- /dev/null +++ b/.gitpod/deploy-kube-prometheus.sh @@ -0,0 +1,16 @@ +kubectl apply -f manifests/setup + +# Safety wait for CRDs to be working +sleep 30 + +kubectl apply -f manifests/ + +kubectl rollout status -n monitoring daemonset node-exporter +kubectl rollout status -n monitoring statefulset alertmanager-main +kubectl rollout status -n monitoring statefulset prometheus-k8s +kubectl rollout status -n monitoring deployment grafana +kubectl rollout status -n monitoring deployment kube-state-metrics + +kubectl port-forward -n monitoring svc/grafana 3000 > /dev/null 2>&1 & +kubectl port-forward -n monitoring svc/alertmanager-main 9093 > /dev/null 2>&1 & +kubectl port-forward -n monitoring svc/prometheus-k8s 9090 > /dev/null 2>&1 & \ No newline at end of file diff --git a/.gitpod/prepare-k3s.sh b/.gitpod/prepare-k3s.sh new file mode 100755 index 0000000000000000000000000000000000000000..ccfd658abf69ec7b8a51260d9e2d5d4fc54450b8 --- /dev/null +++ b/.gitpod/prepare-k3s.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +rootfslock="${script_dirname}/_output/rootfs/rootfs-ready.lock" +k3sreadylock="${script_dirname}/_output/rootfs/k3s-ready.lock" + +if test -f "${k3sreadylock}"; then + exit 0 +fi + +cd $script_dirname + +function waitssh() { + while ! nc -z 127.0.0.1 2222; do + sleep 0.1 + done + ./ssh.sh "whoami" &>/dev/null + if [ $? -ne 0 ]; then + sleep 1 + waitssh + fi +} + +function waitrootfs() { + while ! test -f "${rootfslock}"; do + sleep 0.1 + done +} + +echo "🔥 Installing everything, this will be done only one time per workspace." + +echo "Waiting for the rootfs to become available, it can take a while, open the terminal #2 for progress" +waitrootfs +echo "✅ rootfs available" + +echo "Waiting for the ssh server to become available, it can take a while, after this k3s is getting installed" +waitssh +echo "✅ ssh server available" + +./ssh.sh "curl -sfL https://get.k3s.io | sh -" + +mkdir -p ~/.kube +./scp.sh root@127.0.0.1:/etc/rancher/k3s/k3s.yaml ~/.kube/config + +echo "✅ k3s server is ready" +touch "${k3sreadylock}" + +# safety wait for cluster availability +sleep 30s \ No newline at end of file diff --git a/.gitpod/prepare-rootfs.sh b/.gitpod/prepare-rootfs.sh new file mode 100755 index 0000000000000000000000000000000000000000..c67e9a77c2050e3bdb10c6923f7bff653ce0d9ae --- /dev/null +++ b/.gitpod/prepare-rootfs.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -euo pipefail + +img_url="https://cloud-images.ubuntu.com/hirsute/current/hirsute-server-cloudimg-amd64.tar.gz" + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +outdir="${script_dirname}/_output/rootfs" + +rm -Rf $outdir +mkdir -p $outdir + +curl -L -o "${outdir}/rootfs.tar.gz" $img_url + +cd $outdir + +tar -xvf rootfs.tar.gz + +qemu-img resize hirsute-server-cloudimg-amd64.img +20G + +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command 'resize2fs /dev/sda' + +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --root-password password:root + +netconf=" +network: + version: 2 + renderer: networkd + ethernets: + enp0s3: + dhcp4: yes +" + +# networking setup +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "echo '${netconf}' > /etc/netplan/01-net.yaml" + +# copy kernel modules +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --copy-in /lib/modules/$(uname -r):/lib/modules + +# ssh +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command 'apt remove openssh-server -y && apt install openssh-server -y' +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config" +sudo virt-customize -a hirsute-server-cloudimg-amd64.img --run-command "sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config" + +# mark as ready +touch rootfs-ready.lock + +echo "k3s development environment is ready" diff --git a/.gitpod/qemu.sh b/.gitpod/qemu.sh new file mode 100755 index 0000000000000000000000000000000000000000..f4256439806f86fbb9f7647ff02e765c9a008d7f --- /dev/null +++ b/.gitpod/qemu.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -xeuo pipefail + +script_dirname="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +outdir="${script_dirname}/_output" + +sudo qemu-system-x86_64 -kernel "/boot/vmlinuz" \ +-boot c -m 3073M -hda "${outdir}/rootfs/hirsute-server-cloudimg-amd64.img" \ +-net user \ +-smp 8 \ +-append "root=/dev/sda rw console=ttyS0,115200 acpi=off nokaslr" \ +-nic user,hostfwd=tcp::2222-:22,hostfwd=tcp::6443-:6443 \ +-serial mon:stdio -display none \ No newline at end of file diff --git a/.gitpod/scp.sh b/.gitpod/scp.sh new file mode 100755 index 0000000000000000000000000000000000000000..2295c3c0bbdd15b6a3cb47264a8e08b6de076a1e --- /dev/null +++ b/.gitpod/scp.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sshpass -p 'root' scp -o StrictHostKeychecking=no -P 2222 $@ \ No newline at end of file diff --git a/.gitpod/ssh.sh b/.gitpod/ssh.sh new file mode 100755 index 0000000000000000000000000000000000000000..b4d2ca8c15ecd9ebab19fbaf14e43a24cb9e551b --- /dev/null +++ b/.gitpod/ssh.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sshpass -p 'root' ssh -o StrictHostKeychecking=no -p 2222 root@127.0.0.1 "$@" \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..db7a11400e1719f9a9a0bc7b7cb5b80a259fb8e7 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,44 @@ +## release-0.9 / 2021-08-19 + +* [CHANGE] Test against Kubernetes 1.21 and 1,22. #1161 #1337 +* [CHANGE] Drop cAdvisor metrics without (pod, namespace) label pairs. #1250 +* [CHANGE] Excluded deprecated `etcd_object_counts` metric. #1337 +* [FEATURE] Add PodDisruptionBudget to prometheus-adapter. #1136 +* [FEATURE] Add support for feature flags in Prometheus. #1129 +* [FEATURE] Add env parameter for grafana component. #1171 +* [FEATURE] Add gitpod deployment of kube-prometheus on k3s. #1211 +* [FEATURE] Add resource requests and limits to prometheus-adapter container. #1282 +* [FEATURE] Add PodMonitor for kube-proxy. #1230 +* [FEATURE] Turn AWS VPC CNI into a control plane add-on. #1307 +* [ENHANCEMENT] Export anti-affinity addon. #1114 +* [ENHANCEMENT] Allow changing configmap-reloader, grafana, and kube-rbac-proxy images in $.values.common.images. #1123 #1124 #1125 +* [ENHANCEMENT] Add automated version upgrader. #1166 +* [ENHANCEMENT] Improve all-namespace addon. #1131 +* [ENHANCEMENT] Add example of running without grafana deployment. #1201 +* [ENHANCEMENT] Import managed-cluster addon for the EKS platform. #1205 +* [ENHANCEMENT] Automatically update jsonnet dependencies. #1220 +* [ENHANCEMENT] Adapt kube-prometheus to changes to ovn veth interfaces names. #1224 +* [ENHANCEMENT] Add example release-0.3 to release-0.8 migration to docs. #1235 +* [ENHANCEMENT] Consolidate intervals used in prometheus-adapter CPU queries. #1231 +* [ENHANCEMENT] Create dashboardDefinitions if rawDashboards or folderDashboards are specified. #1255 +* [ENHANCEMENT] Relabel instance with node name for CNI DaemonSet on EKS. #1259 +* [ENHANCEMENT] Update doc on Prometheus rule updates since release 0.8. #1253 +* [ENHANCEMENT] Point runbooks to https://runbooks.prometheus-operator.dev. #1267 +* [ENHANCEMENT] Allow setting of kubeRbacProxyMainResources in kube-state-metrics. #1257 +* [ENHANCEMENT] Automate release branch updates. #1293 #1303 +* [ENHANCEMENT] Create Thanos Sidecar rules separately from Prometheus ones. #1308 +* [ENHANCEMENT] Allow using newer jsonnet-bundler dependency resolution when using windows addon. #1310 +* [ENHANCEMENT] Prometheus ruleSelector defaults to all rules. +* [BUGFIX] Fix kube-state-metrics metric denylist regex pattern. #1146 +* [BUGFIX] Fix missing resource config in blackbox exporter. #1148 +* [BUGFIX] Fix adding private repository. #1169 +* [BUGFIX] Fix kops selectors for scheduler, controllerManager and kube-dns. #1164 +* [BUGFIX] Fix scheduler and controller selectors for Kubespray. #1142 +* [BUGFIX] Fix label selector for coredns ServiceMonitor. #1200 +* [BUGFIX] Fix name for blackbox-exporter PodSecurityPolicy. #1213 +* [BUGFIX] Fix ingress path rules for networking.k8s.io/v1. #1212 +* [BUGFIX] Disable insecure cypher suites for prometheus-adapter. #1216 +* [BUGFIX] Fix CNI metrics relabelings on EKS. #1277 +* [BUGFIX] Fix node-exporter ignore list for OVN. #1283 +* [BUGFIX] Revert back to awscni_total_ip_addresses-based alert on EKS. #1292 +* [BUGFIX] Allow passing `thanos: {}` to prometheus configuration. #1325 diff --git a/Makefile b/Makefile index 9411829ec37df4e8e26a940af04024b1b60fa7b3..e2c265cdd3e229c00fd03014c0950bcb7d1a1dea 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,15 @@ SHELL=/bin/bash -o pipefail -export GO111MODULE=on - BIN_DIR?=$(shell pwd)/tmp/bin EMBEDMD_BIN=$(BIN_DIR)/embedmd JB_BIN=$(BIN_DIR)/jb GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml JSONNET_BIN=$(BIN_DIR)/jsonnet +JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt -TOOLING=$(EMBEDMD_BIN) $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) +KUBECONFORM_BIN=$(BIN_DIR)/kubeconform +TOOLING=$(EMBEDMD_BIN) $(JB_BIN) $(GOJSONTOYAML_BIN) $(JSONNET_BIN) $(JSONNETLINT_BIN) $(JSONNETFMT_BIN) $(KUBECONFORM_BIN) JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s @@ -26,22 +26,47 @@ generate: manifests **.md **.md: $(EMBEDMD_BIN) $(shell find examples) build.sh example.jsonnet $(EMBEDMD_BIN) -w `find . -name "*.md" | grep -v vendor` -manifests: examples/kustomize.jsonnet $(GOJSONTOYAML_BIN) vendor build.sh +manifests: examples/kustomize.jsonnet $(GOJSONTOYAML_BIN) vendor ./build.sh $< vendor: $(JB_BIN) jsonnetfile.json jsonnetfile.lock.json rm -rf vendor $(JB_BIN) install +crdschemas: vendor + ./scripts/generate-schemas.sh + +.PHONY: update +update: $(JB_BIN) + $(JB_BIN) update + +.PHONY: validate +validate: validate-1.21 validate-1.22 + +validate-1.21: + KUBE_VERSION=1.21.1 $(MAKE) kubeconform + +validate-1.22: + KUBE_VERSION=1.22.0 $(MAKE) kubeconform + +.PHONY: kubeconform +kubeconform: crdschemas manifests $(KUBECONFORM_BIN) + $(KUBECONFORM_BIN) -kubernetes-version $(KUBE_VERSION) -schema-location 'default' -schema-location 'crdschemas/{{ .ResourceKind }}.json' -skip CustomResourceDefinition manifests/ + .PHONY: fmt fmt: $(JSONNETFMT_BIN) find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i +.PHONY: lint +lint: $(JSONNETLINT_BIN) vendor + find jsonnet/ -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor + .PHONY: test test: $(JB_BIN) $(JB_BIN) install - ./test.sh + ./scripts/test.sh .PHONY: test-e2e test-e2e: @@ -52,4 +77,4 @@ $(BIN_DIR): $(TOOLING): $(BIN_DIR) @echo Installing tools from scripts/tools.go - @cat scripts/tools.go | grep _ | awk -F'"' '{print $$2}' | GOBIN=$(BIN_DIR) xargs -tI % go install % + @cd scripts && cat tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go build -modfile=go.mod -o $(BIN_DIR) % diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 23a0ada2fbb5635786d4bb9672c421010369d767..0000000000000000000000000000000000000000 --- a/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -CoreOS Project -Copyright 2018 CoreOS, Inc - -This product includes software developed at CoreOS, Inc. -(http://www.coreos.com/). diff --git a/OWNERS b/OWNERS deleted file mode 100644 index a4d521339bf26811a04939cc84fbf0720d01e3d3..0000000000000000000000000000000000000000 --- a/OWNERS +++ /dev/null @@ -1,14 +0,0 @@ -reviewers: - - brancz - - metalmatze - - mxinden - - s-urbaniak - - squat - - paulfantom -approvers: - - brancz - - metalmatze - - mxinden - - s-urbaniak - - squat - - paulfantom diff --git a/README.md b/README.md index e0ded79ceabb9d3498ae52013f64f6f93aa7c6f2..a130aaca1d223ff391f456ebbee7c9aa08c99642 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # kube-prometheus +[](https://github.com/prometheus-operator/kube-prometheus/actions) +[](http://slack.k8s.io/) +[](https://gitpod.io/#https://github.com/prometheus-operator/kube-prometheus) + > Note that everything is experimental and may change significantly at any time. This repository collects Kubernetes manifests, [Grafana](http://grafana.com/) dashboards, and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with [Prometheus](https://prometheus.io/) using the Prometheus Operator. @@ -18,9 +22,14 @@ Components included in this package: This stack is meant for cluster monitoring, so it is pre-configured to collect metrics from all Kubernetes components. In addition to that it delivers a default set of dashboards and alerting rules. Many of the useful dashboards and alerts come from the [kubernetes-mixin project](https://github.com/kubernetes-monitoring/kubernetes-mixin), similar to this project it provides composable jsonnet as a library for users to customize to their needs. +## Warning + +If you are migrating from `release-0.7` branch or earlier please read [what changed and how to migrate in our guide](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/migration-guide.md). + ## Table of contents - [kube-prometheus](#kube-prometheus) + - [Warning](#warning) - [Table of contents](#table-of-contents) - [Prerequisites](#prerequisites) - [minikube](#minikube) @@ -53,13 +62,17 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m - [Stripping container resource limits](#stripping-container-resource-limits) - [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards) - [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress) + - [Setting up a blackbox exporter](#setting-up-a-blackbox-exporter) - [Minikube Example](#minikube-example) + - [Continuous Delivery](#continuous-delivery) - [Troubleshooting](#troubleshooting) - [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics) - [Authentication problem](#authentication-problem) - [Authorization problem](#authorization-problem) - [kube-state-metrics resource usage](#kube-state-metrics-resource-usage) + - [Error retrieving kube-proxy metrics](#error-retrieving-kube-proxy-metrics) - [Contributing](#contributing) + - [License](#license) ## Prerequisites @@ -78,7 +91,7 @@ This adapter is an Extension API Server and Kubernetes needs to be have this fea To try out this stack, start [minikube](https://github.com/kubernetes/minikube) with the following command: ```shell -$ minikube delete && minikube start --kubernetes-version=v1.19.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 +$ minikube delete && minikube start --kubernetes-version=v1.20.0 --memory=6g --bootstrapper=kubeadm --extra-config=kubelet.authentication-token-webhook=true --extra-config=kubelet.authorization-mode=Webhook --extra-config=scheduler.address=0.0.0.0 --extra-config=controller-manager.address=0.0.0.0 ``` The kube-prometheus stack includes a resource metrics API server, so the metrics-server addon is not necessary. Ensure the metrics-server addon is disabled on minikube: @@ -93,19 +106,17 @@ $ minikube addons disable metrics-server The following versions are supported and work as we test against these versions in their respective branches. But note that other versions might work! -| kube-prometheus stack | Kubernetes 1.14 | Kubernetes 1.15 | Kubernetes 1.16 | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | -|-----------------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| `release-0.3` | ✔ | ✔ | ✔ | ✔ | ✗ | ✗ | -| `release-0.4` | ✗ | ✗ | ✔ (v1.16.5+) | ✔ | ✗ | ✗ | -| `release-0.5` | ✗ | ✗ | ✗ | ✗ | ✔ | ✗ | -| `release-0.6` | ✗ | ✗ | ✗ | ✗ | ✔ | ✔ | -| `HEAD` | ✗ | ✗ | ✗ | ✗ | x | ✔ | - -Note: Due to [two](https://github.com/kubernetes/kubernetes/issues/83778) [bugs](https://github.com/kubernetes/kubernetes/issues/86359) in Kubernetes v1.16.1, and prior to Kubernetes v1.16.5 the kube-prometheus release-0.4 branch only supports v1.16.5 and higher. The `extension-apiserver-authentication-reader` role in the kube-system namespace can be manually edited to include list and watch permissions in order to workaround the second issue with Kubernetes v1.16.2 through v1.16.4. +| kube-prometheus stack | Kubernetes 1.18 | Kubernetes 1.19 | Kubernetes 1.20 | Kubernetes 1.21 | Kubernetes 1.22 | +|------------------------------------------------------------------------------------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| [`release-0.6`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.6) | ✗ | ✔ | ✗ | ✗ | ✗ | +| [`release-0.7`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.7) | ✗ | ✔ | ✔ | ✗ | ✗ | +| [`release-0.8`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.8) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [`release-0.9`](https://github.com/prometheus-operator/kube-prometheus/tree/release-0.9) | ✗ | ✗ | ✗ | ✔ | ✔ | +| [`HEAD`](https://github.com/prometheus-operator/kube-prometheus/tree/main) | ✗ | ✗ | ✗ | ✔ | ✔ | ## Quickstart ->Note: For versions before Kubernetes v1.19.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. +>Note: For versions before Kubernetes v1.21.z refer to the [Kubernetes compatibility matrix](#kubernetes-compatibility-matrix) in order to choose a compatible branch. This project is intended to be used as a library (i.e. the intent is not for you to create your own modified copy of this repository). @@ -113,7 +124,7 @@ Though for a quickstart a compiled version of the Kubernetes [manifests](manifes * Create the monitoring stack using the config in the `manifests` directory: ```shell -# Create the namespace and CRDs, and then wait for them to be availble before creating the remaining resources +# Create the namespace and CRDs, and then wait for them to be available before creating the remaining resources kubectl create -f manifests/setup until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done kubectl create -f manifests/ @@ -174,12 +185,15 @@ Install this library in your own project with [jsonnet-bundler](https://github.c $ mkdir my-kube-prometheus; cd my-kube-prometheus $ jb init # Creates the initial/empty `jsonnetfile.json` # Install the kube-prometheus dependency -$ jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.4 # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` +$ jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.7 # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` + +$ wget https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.7/example.jsonnet -O example.jsonnet +$ wget https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.7/build.sh -O build.sh ``` > `jb` can be installed with `go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb` -> An e.g. of how to install a given version of this library: `jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.4` +> An e.g. of how to install a given version of this library: `jb install github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus@release-0.7` In order to update the kube-prometheus dependency, simply use the jsonnet-bundler update functionality: ```shell @@ -190,7 +204,7 @@ $ jb update e.g. of how to compile the manifests: `./build.sh example.jsonnet` -> before compiling, install `gojsontoyaml` tool with `go get github.com/brancz/gojsontoyaml` +> before compiling, install `gojsontoyaml` tool with `go get github.com/brancz/gojsontoyaml` and `jsonnet` with `go get github.com/google/go-jsonnet/cmd/jsonnet` Here's [example.jsonnet](example.jsonnet): @@ -199,33 +213,39 @@ Here's [example.jsonnet](example.jsonnet): [embedmd]:# (example.jsonnet) ```jsonnet local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` And here's the [build.sh](build.sh) script (which uses `vendor/` to render all manifests in a json structure of `{filename: manifest-content}`): @@ -266,7 +286,7 @@ The previous steps (compilation) has created a bunch of manifest files in the ma Now simply use `kubectl` to install Prometheus and Grafana as per your configuration: ```shell -# Update the namespace and CRDs, and then wait for them to be availble before creating the remaining resources +# Update the namespace and CRDs, and then wait for them to be available before creating the remaining resources $ kubectl apply -f manifests/setup $ kubectl apply -f manifests/ ``` @@ -308,74 +328,22 @@ Once updated, just follow the instructions under "Compiling" and "Apply the kube Jsonnet has the concept of hidden fields. These are fields, that are not going to be rendered in a result. This is used to configure the kube-prometheus components in jsonnet. In the example jsonnet code of the above [Customizing Kube-Prometheus section](#customizing-kube-prometheus), you can see an example of this, where the `namespace` is being configured to be `monitoring`. In order to not override the whole object, use the `+::` construct of jsonnet, to merge objects, this way you can override individual settings, but retain all other settings and defaults. -These are the available fields with their respective default values: -``` -{ - _config+:: { - namespace: "default", - - versions+:: { - alertmanager: "v0.17.0", - nodeExporter: "v0.18.1", - kubeStateMetrics: "v1.5.0", - kubeRbacProxy: "v0.4.1", - prometheusOperator: "v0.30.0", - prometheus: "v2.10.0", - }, +The available fields and their default values can be seen in [main.libsonnet](jsonnet/kube-prometheus/main.libsonnet). Note that many of the fields get their default values from variables, and for example the version numbers are imported from [versions.json](jsonnet/kube-prometheus/versions.json). - imageRepos+:: { - prometheus: "quay.io/prometheus/prometheus", - alertmanager: "quay.io/prometheus/alertmanager", - kubeStateMetrics: "quay.io/coreos/kube-state-metrics", - kubeRbacProxy: "quay.io/coreos/kube-rbac-proxy", - nodeExporter: "quay.io/prometheus/node-exporter", - prometheusOperator: "quay.io/prometheus-operator/prometheus-operator", - }, - - prometheus+:: { - names: 'k8s', - replicas: 2, - rules: {}, - }, +Configuration is mainly done in the `values` map. You can see this being used in the `example.jsonnet` to set the namespace to `monitoring`. This is done in the `common` field, which all other components take their default value from. See for example how Alertmanager is configured in `main.libsonnet`: - alertmanager+:: { +``` + alertmanager: { name: 'main', - config: ||| - global: - resolve_timeout: 5m - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'null' - routes: - - match: - alertname: Watchdog - receiver: 'null' - receivers: - - name: 'null' - |||, - replicas: 3, - }, - - kubeStateMetrics+:: { - collectors: '', // empty string gets a default set - scrapeInterval: '30s', - scrapeTimeout: '30s', - - baseCPU: '100m', - baseMemory: '150Mi', - }, - - nodeExporter+:: { - port: 9100, + // Use the namespace specified under values.common by default. + namespace: $.values.common.namespace, + version: $.values.common.versions.alertmanager, + image: $.values.common.images.alertmanager, + mixin+: { ruleLabels: $.values.common.ruleLabels }, }, - }, -} ``` -The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same top level `_config` field. For example to allow anonymous access to grafana, add the following `_config` section: +The grafana definition is located in a different project (https://github.com/brancz/kubernetes-grafana), but needed configuration can be customized from the same top level `values` field. For example to allow anonymous access to grafana, add the following `values` section: ``` grafana+:: { config: { // http://docs.grafana.org/installation/configuration/ @@ -392,57 +360,28 @@ Jsonnet is a turing complete language, any logic can be reflected in it. It also ### Cluster Creation Tools -A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For [kubeadm](examples/jsonnet-snippets/kubeadm.jsonnet), [bootkube](examples/jsonnet-snippets/bootkube.jsonnet), [kops](examples/jsonnet-snippets/kops.jsonnet) and [kubespray](examples/jsonnet-snippets/kubespray.jsonnet) clusters there are mixins available to easily configure these: - -kubeadm: - -[embedmd]:# (examples/jsonnet-snippets/kubeadm.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') -``` - -bootkube: - -[embedmd]:# (examples/jsonnet-snippets/bootkube.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') -``` +A common example is that not all Kubernetes clusters are created exactly the same way, meaning the configuration to monitor them may be slightly different. For the following clusters there are mixins available to easily configure them: -kops: - -[embedmd]:# (examples/jsonnet-snippets/kops.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') -``` - -kops with CoreDNS: - -If your kops cluster is using CoreDNS, there is an additional mixin to import. - -[embedmd]:# (examples/jsonnet-snippets/kops-coredns.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') -``` - -kubespray: - -[embedmd]:# (examples/jsonnet-snippets/kubespray.jsonnet) -```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') -``` +* aws +* bootkube +* eks +* gke +* kops-coredns +* kubeadm +* kubespray -kube-aws: +These mixins are selectable via the `platform` field of kubePrometheus: -[embedmd]:# (examples/jsonnet-snippets/kube-aws.jsonnet) +[embedmd]:# (examples/jsonnet-snippets/platform.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +{ + values+:: { + common+: { + platform: 'example-platform', + }, + }, +} ``` ### Internal Registry @@ -468,10 +407,12 @@ Then to generate manifests with `internal-registry.com/organization`, use the `w [embedmd]:# (examples/internal-registry.jsonnet) ```jsonnet -local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local mixin = import 'kube-prometheus/addons/config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, } + mixin.withImageRepository('internal-registry.com/organization'); @@ -490,8 +431,8 @@ Another mixin that may be useful for exploring the stack is to expose the UIs of [embedmd]:# (examples/jsonnet-snippets/node-ports.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +(import 'kube-prometheus/addons/node-ports.libsonnet') ``` ### Prometheus Object Name @@ -500,7 +441,7 @@ To give another customization example, the name of the `Prometheus` object provi [embedmd]:# (examples/prometheus-name-override.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { prometheus+: { prometheus+: { metadata+: { @@ -517,7 +458,7 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github [embedmd]:# (examples/ksonnet-example.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { nodeExporter+: { daemonset+: { metadata+: { @@ -530,12 +471,12 @@ Standard Kubernetes manifests are all written using [ksonnet-lib](https://github ### Alertmanager configuration -The Alertmanager configuration is located in the `_config.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field. +The Alertmanager configuration is located in the `values.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field. [embedmd]:# (examples/alertmanager-config.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: ||| global: @@ -562,8 +503,8 @@ In the above example the configuration has been inlined, but can just as well be [embedmd]:# (examples/alertmanager-config-external.jsonnet) ```jsonnet -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: importstr 'alertmanager-config.yaml', }, @@ -573,15 +514,17 @@ In the above example the configuration has been inlined, but can just as well be ### Adding additional namespaces to monitor -In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$._config.namespace`. This is specified in `$._config.prometheus.namespaces`, to add new namespaces to monitor, simply append the additional namespaces: +In order to monitor additional namespaces, the Prometheus server requires the appropriate `Role` and `RoleBinding` to be able to discover targets from that namespace. By default the Prometheus server is limited to the three namespaces it requires: default, kube-system and the namespace you configure the stack to run in via `$.values.namespace`. This is specified in `$.values.prometheus.namespaces`, to add new namespaces to monitor, simply append the additional namespaces: [embedmd]:# (examples/additional-namespaces.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, - prometheus+:: { + prometheus+: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, @@ -606,14 +549,16 @@ You can define ServiceMonitor resources in your `jsonnet` spec. See the snippet [embedmd]:# (examples/additional-namespaces-servicemonitor.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, prometheus+:: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, - prometheus+:: { + exampleApplication: { serviceMonitorMyNamespace: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -645,7 +590,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` > NOTE: make sure your service resources have the right labels (eg. `'app': 'myapp'`) applied. Prometheus uses kubernetes labels to discover resources inside the namespaces. @@ -656,12 +602,13 @@ In case you want to monitor all namespaces in a cluster, you can add the followi [embedmd]:# (examples/all-namespaces.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-all-namespaces.libsonnet') + { - _config+:: { - namespace: 'monitoring', - - prometheus+:: { +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { namespaces: [], }, }, @@ -689,11 +636,26 @@ In order to configure a static etcd cluster to scrape there is a simple [kube-pr ### Pod Anti-Affinity To prevent `Prometheus` and `Alertmanager` instances from being deployed onto the same node when -possible, one can include the [kube-prometheus-anti-affinity.libsonnet](jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet) mixin: +possible, one can include the [kube-prometheus-anti-affinity.libsonnet](jsonnet/kube-prometheus/addons/anti-affinity.libsonnet) mixin: +[embedmd]:# (examples/anti-affinity.jsonnet) ```jsonnet -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/anti-affinity.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` ### Stripping container resource limits @@ -703,10 +665,12 @@ To do that, one can import the following mixin [embedmd]:# (examples/strip-limits.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-strip-limits.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/strip-limits.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; @@ -727,6 +691,36 @@ See [developing Prometheus rules and Grafana dashboards](docs/developing-prometh See [exposing Prometheus/Alertmanager/Grafana](docs/exposing-prometheus-alertmanager-grafana-ingress.md) guide. +### Setting up a blackbox exporter + +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + // ... all necessary mixins ... + { + values+:: { + // ... configuration for other features ... + blackboxExporter+:: { + modules+:: { + tls_connect: { + prober: 'tcp', + tcp: { + tls: true + } + } + } + } + } + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +// ... other rendering blocks ... +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +``` + +Then describe the actual blackbox checks you want to run using `Probe` resources. Specify `blackbox-exporter.<namespace>.svc.cluster.local:9115` as the `spec.prober.url` field of the `Probe` resource. + +See the [blackbox exporter guide](docs/blackbox-exporter.md) for the list of configurable options and a complete example. + ## Minikube Example To use an easy to reproduce example, see [minikube.jsonnet](examples/minikube.jsonnet), which uses the minikube setup as demonstrated in [Prerequisites](#prerequisites). Because we would like easy access to our Prometheus, Alertmanager and Grafana UIs, `minikube.jsonnet` exposes the services as NodePort type services. @@ -737,6 +731,8 @@ Working examples of use with continuous delivery tools are found in examples/con ## Troubleshooting +See the general [guidelines](docs/community-support.md) for getting support from the community. + ### Error retrieving kubelet metrics Should the Prometheus `/targets` page show kubelet targets, but not able to successfully scrape the metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets. @@ -762,7 +758,7 @@ resources. One driver for more resource needs, is a high number of namespaces. There may be others. kube-state-metrics resource allocation is managed by -[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny) +[addon-resizer](https://github.com/kubernetes/autoscaler/tree/main/addon-resizer/nanny) You can control it's parameters by setting variables in the config. They default to: @@ -775,6 +771,13 @@ config. They default to: } ``` +### Error retrieving kube-proxy metrics +By default, kubeadm will configure kube-proxy to listen on 127.0.0.1 for metrics. Because of this prometheus would not be able to scrape these metrics. This would have to be changed to 0.0.0.0 in one of the following two places: + +1. Before cluster initialization, the config file passed to kubeadm init should have KubeProxyConfiguration manifest with the field metricsBindAddress set to 0.0.0.0:10249 +2. If the k8s cluster is already up and running, we'll have to modify the configmap kube-proxy in the namespace kube-system and set the metricsBindAddress field. After this kube-proxy daemonset would have to be restarted with +`kubectl -n kube-system rollout restart daemonset kube-proxy` + ## Contributing All `.yaml` files in the `/manifests` folder are generated via @@ -787,3 +790,7 @@ the following process: 3. Update the pinned kube-prometheus dependency in `jsonnetfile.lock.json`: `jb update` 3. Generate dependent `*.yaml` files: `make generate` 4. Commit the generated changes. + +## License + +Apache License 2.0, see [LICENSE](https://github.com/prometheus-operator/kube-prometheus/blob/main/LICENSE). diff --git a/code-of-conduct.md b/code-of-conduct.md index a234f3609d09ad5dedda772ad1da05dd0ee91a46..d1adc78033dad3908328a96aa725fcd2333b20dc 100644 --- a/code-of-conduct.md +++ b/code-of-conduct.md @@ -1,4 +1,4 @@ -## CoreOS Community Code of Conduct +## Community Code of Conduct ### Contributor Code of Conduct @@ -33,29 +33,9 @@ This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting a project maintainer, Brandon Philips -<brandon.philips@coreos.com>, and/or Rithu John <rithu.john@coreos.com>. +reported by contacting a project maintainer listed in +https://github.com/prometheus-operator/prometheus-operator/blob/master/MAINTAINERS.md. This Code of Conduct is adapted from the Contributor Covenant (http://contributor-covenant.org), version 1.2.0, available at http://contributor-covenant.org/version/1/2/0/ - -### CoreOS Events Code of Conduct - -CoreOS events are working conferences intended for professional networking and -collaboration in the CoreOS community. Attendees are expected to behave -according to professional standards and in accordance with their employer’s -policies on appropriate workplace behavior. - -While at CoreOS events or related social networking opportunities, attendees -should not engage in discriminatory or offensive speech or actions including -but not limited to gender, sexuality, race, age, disability, or religion. -Speakers should be especially aware of these concerns. - -CoreOS does not condone any statements by speakers contrary to these standards. -CoreOS reserves the right to deny entrance and/or eject from an event (without -refund) any individual found to be engaging in discriminatory or offensive -speech or actions. - -Please bring any concerns to the immediate attention of designated on-site -staff, Brandon Philips <brandon.philips@coreos.com>, and/or Rithu John <rithu.john@coreos.com>. diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md index fb559d7845700770e462bc1c9cc0cb99e877a50c..1cd8b146471674287f0c6690e59b93829a7b1f9a 100644 --- a/docs/EKS-cni-support.md +++ b/docs/EKS-cni-support.md @@ -7,23 +7,31 @@ One fatal issue that can occur is that you run out of IP addresses in your eks c You can monitor the `awscni` using kube-promethus with : [embedmd]:# (../examples/eks-cni-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + kubePrometheus+: { + platform: 'eks', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + kubernetesControlPlane+: { + prometheusRuleEksCNI+: { + spec+: { + groups+: [ { - record: 'aws_eks_available_ip', - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + }, + ], }, ], }, - ], + }, }, }; diff --git a/docs/blackbox-exporter.md b/docs/blackbox-exporter.md new file mode 100644 index 0000000000000000000000000000000000000000..e6a5272512aec4881a81b10307a5ce4fd1bf4bef --- /dev/null +++ b/docs/blackbox-exporter.md @@ -0,0 +1,97 @@ +--- +title: "Blackbox Exporter" +description: "Generated API docs for the Prometheus Operator" +lead: "This Document documents the types introduced by the Prometheus Operator to be consumed by users." +date: 2021-03-08T08:49:31+00:00 +lastmod: 2021-03-08T08:49:31+00:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 630 +toc: true +--- + +# Setting up a blackbox exporter + +The `prometheus-operator` defines a `Probe` resource type that can be used to describe blackbox checks. To execute these, a separate component called [`blackbox_exporter`](https://github.com/prometheus/blackbox_exporter) has to be deployed, which can be scraped to retrieve the results of these checks. You can use `kube-prometheus` to set up such a blackbox exporter within your Kubernetes cluster. + +## Adding blackbox exporter manifests to an existing `kube-prometheus` configuration + +1. Override blackbox-related configuration parameters as needed. +2. Add the following to the list of renderers to render the blackbox exporter manifests: +``` +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } +``` + +## Configuration parameters influencing the blackbox exporter + +* `_config.namespace`: the namespace where the various generated resources (`ConfigMap`, `Deployment`, `Service`, `ServiceAccount` and `ServiceMonitor`) will reside. This does not affect where you can place `Probe` objects; that is determined by the configuration of the `Prometheus` resource. This option is shared with other `kube-prometheus` components; defaults to `default`. +* `_config.imageRepos.blackboxExporter`: the name of the blackbox exporter image to deploy. Defaults to `quay.io/prometheus/blackbox-exporter`. +* `_config.versions.blackboxExporter`: the tag of the blackbox exporter image to deploy. Defaults to the version `kube-prometheus` was tested with. +* `_config.imageRepos.configmapReloader`: the name of the ConfigMap reloader image to deploy. Defaults to `jimmidyson/configmap-reload`. +* `_config.versions.configmapReloader`: the tag of the ConfigMap reloader image to deploy. Defaults to the version `kube-prometheus` was tested with. +* `_config.resources.blackbox-exporter.requests`: the requested resources; this is used for each container. Defaults to `10m` CPU and `20Mi` RAM. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for details. +* `_config.resources.blackbox-exporter.limits`: the resource limits; this is used for each container. Defaults to `20m` CPU and `40Mi` RAM. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for details. +* `_config.blackboxExporter.port`: the exposed HTTPS port of the exporter. This is what Prometheus can scrape for metrics related to the blackbox exporter itself. Defaults to `9115`. +* `_config.blackboxExporter.internalPort`: the internal plaintext port of the exporter. Prometheus scrapes configured via `Probe` objects cannot access the HTTPS port right now, so you have to specify this port in the `url` field. Defaults to `19115`. +* `_config.blackboxExporter.replicas`: the number of exporter replicas to be deployed. Defaults to `1`. +* `_config.blackboxExporter.matchLabels`: map of the labels to be used to select resources belonging to the instance deployed. Defaults to `{ 'app.kubernetes.io/name': 'blackbox-exporter' }` +* `_config.blackboxExporter.assignLabels`: map of the labels applied to components of the instance deployed. Defaults to all the labels included in the `matchLabels` option, and additionally `app.kubernetes.io/version` is set to the version of the blackbox exporter. +* `_config.blackboxExporter.modules`: the modules available in the blackbox exporter installation, i.e. the types of checks it can perform. The default value includes most of the modules defined in the default blackbox exporter configuration: `http_2xx`, `http_post_2xx`, `tcp_connect`, `pop3s_banner`, `ssh_banner`, and `irc_banner`. `icmp` is omitted so the exporter can be run with minimum privileges, but you can add it back if needed - see the example below. See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md for the configuration format, except you have to use JSON instead of YAML here. +* `_config.blackboxExporter.privileged`: whether the `blackbox-exporter` container should be running as non-root (`false`) or root with heavily-restricted capability set (`true`). Defaults to `true` if you have any ICMP modules defined (which need the extra permissions) and `false` otherwise. + +## Complete example + +```jsonnet +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + { + _config+:: { + namespace: 'monitoring', + blackboxExporter+:: { + modules+:: { + icmp: { + prober: 'icmp', + }, + }, + }, + }, + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor is separated so that it can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +``` + +After installing the generated manifests, you can create `Probe` resources, for example: + +```yaml +kind: Probe +apiVersion: monitoring.coreos.com/v1 +metadata: + name: example-com-website + namespace: monitoring +spec: + interval: 60s + module: http_2xx + prober: + url: blackbox-exporter.monitoring.svc.cluster.local:19115 + targets: + staticConfig: + static: + - http://example.com + - https://example.com +``` diff --git a/docs/community-support.md b/docs/community-support.md new file mode 100644 index 0000000000000000000000000000000000000000..218eaa74c534876444f7b999dc01503c0574b6f9 --- /dev/null +++ b/docs/community-support.md @@ -0,0 +1,84 @@ +# Community support + +For bugs, you can use the GitHub [issue tracker](https://github.com/prometheus-operator/kube-prometheus/issues/new/choose). + +For questions, you can use the GitHub [discussions forum](https://github.com/prometheus-operator/kube-prometheus/discussions). + +Many of the `kube-prometheus` project's contributors and users can also be found on the #prometheus-operator channel of the [Kubernetes Slack][Kubernetes Slack]. + +`kube-prometheus` is the aggregation of many projects that all have different +channels to reach out for help and support. This community strives at +supporting all users and you should never be afraid of asking us first. However +if your request relates specifically to one of the projects listed below, it is +often more efficient to reach out to the project directly. If you are unsure, +please feel free to open an issue in this repository and we will redirect you +if applicable. + +## prometheus-operator + +For documentation, check the project's [documentation directory](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation). + +For questions, use the #prometheus-operator channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/prometheus-operator/prometheus-operator/issues/new/choose). + +## Prometheus, Alertmanager, node_exporter + +For documentation, check the Prometheus [online docs](https://prometheus.io/docs/). There is a +[section](https://prometheus.io/docs/introduction/media/) with links to blog +posts, recorded talks and presentations. This [repository](https://github.com/roaldnefs/awesome-prometheus) +(not affiliated to the Prometheus project) has also a list of curated resources +related to the Prometheus ecosystem. + +For questions, see the Prometheus [community page](https://prometheus.io/community/) for the various channels. + +There is also a #prometheus channel on the [CNCF Slack][CNCF Slack]. + +## kube-state-metrics + +For documentation, see the project's [docs directory](https://github.com/kubernetes/kube-state-metrics/tree/master/docs). + +For questions, use the #kube-state-metrics channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kube-state-metrics/issues/new/choose). + +## Kubernetes + +For documentation, check the [Kubernetes docs](https://kubernetes.io/docs/home/). + +For questions, use the [community forums](https://discuss.kubernetes.io/) and the [Kubernetes Slack][Kubernetes Slack]. Check also the [community page](https://kubernetes.io/community/#discuss). + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes/kubernetes/issues/new/choose). + +## Prometheus adapter + +For documentation, check the project's [README](https://github.com/DirectXMan12/k8s-prometheus-adapter/blob/master/README.md). + +For questions, use the #sig-instrumentation channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/DirectXMan12/k8s-prometheus-adapter/issues/new). + +## Grafana + +For documentation, check the [Grafana docs](https://grafana.com/docs/grafana/latest/). + +For questions, use the [community forums](https://community.grafana.com/). + +For bugs, use the GitHub [issue tracker](https://github.com/grafana/grafana/issues/new/choose). + +## kubernetes-mixin + +For documentation, check the project's [README](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/README.md). + +For questions, use #monitoring-mixins channel on the [Kubernetes Slack][Kubernetes Slack]. + +For bugs, use the GitHub [issue tracker](https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/new). + +## Jsonnet + +For documentation, check the [Jsonnet](https://jsonnet.org/) website. + +For questions, use the [mailing list](https://groups.google.com/forum/#!forum/jsonnet). + +[Kubernetes Slack]: https://slack.k8s.io/ +[CNCF Slack]: https://slack.cncf.io/ diff --git a/docs/deploy-kind.md b/docs/deploy-kind.md new file mode 100644 index 0000000000000000000000000000000000000000..ea66c59a16e3e9f34e4fe4afebe5a6163236af18 --- /dev/null +++ b/docs/deploy-kind.md @@ -0,0 +1,19 @@ +--- +title: "Deploy to kind" +description: "Deploy kube-prometheus to Kubernets kind." +lead: "Deploy kube-prometheus to Kubernets kind." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- + +--- + +Time to explain how! + +Your chance of [**contributing**](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/deploy-kind.md)! diff --git a/docs/developing-prometheus-rules-and-grafana-dashboards.md b/docs/developing-prometheus-rules-and-grafana-dashboards.md index a0a41c5ead83b08660e7a20a97fcfb135eafdd55..6aa853cd69398992f9c91b9a67e6f7c1dd3aa152 100644 --- a/docs/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/developing-prometheus-rules-and-grafana-dashboards.md @@ -1,4 +1,16 @@ -# Developing Prometheus Rules and Grafana Dashboards +--- +title: "Prometheus Rules and Grafana Dashboards" +description: "Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus" +lead: "Create Prometheus Rules and Grafana Dashboards on top of kube-prometheus" +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 650 +toc: true +--- `kube-prometheus` ships with a set of default [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) and [Grafana](http://grafana.com/) dashboards. At some point one might like to extend them, the purpose of this document is to explain how to do this. @@ -11,33 +23,39 @@ As a basis, all examples in this guide are based on the base example of the kube [embedmd]:# (../example.jsonnet) ```jsonnet local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ``` ## Prometheus rules @@ -52,28 +70,40 @@ The format is exactly the Prometheus format, so there should be no changes neces [embedmd]:# (../examples/prometheus-additional-alert-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - alert: 'Watchdog', - expr: 'vector(1)', - labels: { - severity: 'none', - }, - annotations: { - description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', - }, + name: 'example-group', + rules: [ + { + alert: 'ExampleAlert', + expr: 'vector(1)', + labels: { + severity: 'warning', + }, + annotations: { + description: 'This is an example alert.', + }, + }, + ], }, ], }, - ], + }, }, }; @@ -84,7 +114,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Recording rules @@ -95,22 +126,34 @@ In order to add a recording rule, simply do the same with the `prometheusRules` [embedmd]:# (../examples/prometheus-additional-recording-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - record: 'some_recording_rule_name', - expr: 'vector(1)', + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], }, ], }, - ], + }, }, }; @@ -121,7 +164,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Pre-rendered rules @@ -142,12 +186,24 @@ Then import it in jsonnet: [embedmd]:# (../examples/prometheus-additional-rendered-rule-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: (import 'existingrule.json').groups, + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: (import 'existingrule.json').groups, + }, + }, }, }; @@ -158,76 +214,118 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ### Changing default rules -Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/kube-prometheus.libsonnet`. The recording rules can be found in [kube-prometheus/rules](../jsonnet/kube-prometheus/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/alerts](../jsonnet/kube-prometheus/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). +Along with adding additional rules, we give the user the option to filter or adjust the existing rules imported by `kube-prometheus/main.libsonnet`. The recording rules can be found in [kube-prometheus/components/mixin/rules](../jsonnet/kube-prometheus/components/mixin/rules) and [kubernetes-mixin/rules](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/rules) while the alerting rules can be found in [kube-prometheus/components/mixin/alerts](../jsonnet/kube-prometheus/components/mixin/alerts) and [kubernetes-mixin/alerts](https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/alerts). Knowing which rules to change, the user can now use functions from the [Jsonnet standard library](https://jsonnet.org/ref/stdlib.html) to make these changes. Below are examples of both a filter and an adjustment being made to the default rules. These changes can be assigned to a local variable and then added to the `local kp` object as seen in the examples above. #### Filter -Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +Here the alert `KubeStatefulSetReplicasMismatch` is being filtered out of the group `kubernetes-apps`. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). You first need to find out in which component the rule is defined (here it is kuberentesControlPlane). ```jsonnet local filter = { - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'kubernetes-apps' then - group { - rules: std.filter(function(rule) - rule.alert != "KubeStatefulSetReplicasMismatch", - group.rules - ) - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRule+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter( + function(rule) + rule.alert != 'KubeStatefulSetReplicasMismatch', + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; ``` + #### Adjustment -Here the expression for the alert used above is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). +Here the expression for another alert in the same component is updated from its previous value. The default rule can be seen [here](https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/alerts/apps_alerts.libsonnet). ```jsonnet local update = { - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'kubernetes-apps' then - group { - rules: std.map( - function(rule) - if rule.alert == "KubeStatefulSetReplicasMismatch" then - rule { - expr: "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",statefulset!=\"vault\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",statefulset!=\"vault\"}" - } - else - rule, - group.rules - ) - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRule+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == 'KubePodCrashLooping' then + rule { + expr: 'rate(kube_pod_container_status_restarts_total{namespace=kube-system,job="kube-state-metrics"}[10m]) * 60 * 5 > 0', + } + else + rule, + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; ``` + Using the example from above about adding in pre-rendered rules, the new local variables can be added in as follows: ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + filter + update + { - prometheusAlerts+:: (import 'existingrule.json'), +local add = { + exampleApplication:: { + prometheusRule+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'example-application-rules', + namespace: $.values.common.namespace, + }, + spec: (import 'existingrule.json'), + }, + }, }; - -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +local kp = (import 'kube-prometheus/main.libsonnet') + filter + update + add; +local kp = (import 'kube-prometheus/main.libsonnet') + + filter + + update + + add + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, + }; +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } + +{ ['exampleApplication-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } ``` ## Dashboards @@ -248,35 +346,37 @@ local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - dashboards+:: { - 'my-dashboard.json': - dashboard.new('My Dashboard') - .addTemplate( - { - current: { - text: 'Prometheus', - value: 'Prometheus', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', }, - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addRow( - row.new() - .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') - .addTarget(prometheus.target('vector(1)'))) - ), + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, }, }, }; @@ -296,16 +396,15 @@ As jsonnet is a superset of json, the jsonnet `import` function can be used to i [embedmd]:# (../examples/grafana-additional-rendered-dashboard-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafanaDashboards+:: { // monitoring-mixin compatibility - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), - }, - grafana+:: { - dashboards+:: { // use this method to import your dashboards to Grafana - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { // use this method to import your dashboards to Grafana + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, }, }, }; @@ -322,13 +421,15 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { In case you have lots of json dashboard exported out from grafana UI the above approach is going to take lots of time to improve performance we can use `rawDashboards` field and provide it's value as json string by using `importstr` [embedmd]:# (../examples/grafana-additional-rendered-dashboard-example-2.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - rawDashboards+:: { - 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + rawDashboards+:: { + 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), + }, }, }, }; @@ -341,3 +442,117 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } ``` + +### Mixins + +Kube-prometheus comes with a couple of default mixins as the Kubernetes-mixin and the Node-exporter mixin, however there [are many more mixins](https://monitoring.mixins.dev/). To use other mixins Kube-prometheus has a jsonnet library for creating a Kubernetes PrometheusRule CRD and Grafana dashboards from a mixin. Below is an example of creating a mixin object that has Prometheus rules and Grafana dashboards: + +```jsonnet +// Import the library function for adding mixins +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); + +// Create your mixin +local myMixin = addMixin({ + name: 'myMixin', + mixin: import 'my-mixin/mixin.libsonnet', +}); +``` + +The myMixin object will have two objects - `prometheusRules` and `grafanaDashboards`. The `grafanaDashboards` object will be needed to be added to the `dashboards` field as in the example below: + +```jsonnet +values+:: { + grafana+:: { + dashboards+:: myMixin.grafanaDashboards +``` + +The `prometheusRules` object is a PrometheusRule Kubernetes CRD and it should be defined as its own jsonnet object. If you define multiple mixins in a single jsonnet object there is a possibility that they will overwrite each others' configuration and there will be unintended effects. Therefore, use the `prometheusRules` object as its own jsonnet object: + +```jsonnet +... +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ 'external-mixins/my-mixin-prometheus-rules': myMixin.prometheusRules } // one object for each mixin +``` + +As mentioned above each mixin is configurable and you would configure the mixin as in the example below: + +```jsonnet +local myMixin = addMixin({ + name: 'myMixin', + mixin: (import 'my-mixin/mixin.libsonnet') + { + _config+:: { + myMixinSelector: 'my-selector', + interval: '30d', // example + }, + }, +}); +``` + +The library has also two optional parameters - the namespace for the `PrometheusRule` CRD and the dashboard folder for the Grafana dashboards. The below example shows how to use both: + +```jsonnet +local myMixin = addMixin({ + name: 'myMixin', + namespace: 'prometheus', // default is monitoring + dashboardFolder: 'Observability', + mixin: (import 'my-mixin/mixin.libsonnet') + { + _config+:: { + myMixinSelector: 'my-selector', + interval: '30d', // example + }, + }, +}); +``` + +The created `prometheusRules` object will have the metadata field `namespace` added and the usage will remain the same. However, the `grafanaDasboards` will be added to the `folderDashboards` field instead of the `dashboards` field as shown in the example below: + +```jsonnet +values+:: { + grafana+:: { + folderDashboards+:: { + Kubernetes: { + ... + }, + Misc: { + 'grafana-home.json': import 'dashboards/misc/grafana-home.json', + }, + } + myMixin.grafanaDashboards +``` + +Full example of including etcd mixin using method described above: + +[embedmd]:# (../examples/mixin-inclusion.jsonnet) +```jsonnet +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); +local etcdMixin = addMixin({ + name: 'etcd', + mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + _config+: {}, // mixin configuration object + }, +}); + +local kp = (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + grafana+: { + // Adding new dashboard to grafana. This will modify grafana configMap with dashboards + dashboards+: etcdMixin.grafanaDashboards, + }, + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +// Rendering prometheusRules object. This is an object compatible with prometheus-operator CRD definition for prometheusRule +{ 'external-mixins/etcd-mixin-prometheus-rules': etcdMixin.prometheusRules } +``` diff --git a/docs/exposing-prometheus-alertmanager-grafana-ingress.md b/docs/exposing-prometheus-alertmanager-grafana-ingress.md index f05ab4ce52c9b0e8fe00ddb0242662d1ccf1c851..be1ba130cf09ad3666132912b3518b2db3a34a9a 100644 --- a/docs/exposing-prometheus-alertmanager-grafana-ingress.md +++ b/docs/exposing-prometheus-alertmanager-grafana-ingress.md @@ -1,12 +1,24 @@ -# Exposing Prometheus, Alertmanager and Grafana UIs via Ingress - -In order to access the web interfaces via the Internet [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains, how Kubernetes Ingress can be setup, in order to expose the Prometheus, Alertmanager and Grafana UIs, that are included in the [kube-prometheus](https://github.com/coreos/kube-prometheus) project. - -Note: before continuing, it is recommended to first get familiar with the [kube-prometheus](https://github.com/coreos/kube-prometheus) stack by itself. +--- +title: "Expose via Ingress" +description: "How to setup a Kubernetes Ingress to expose the Prometheus, Alertmanager and Grafana." +lead: "How to setup a Kubernetes Ingress to expose the Prometheus, Alertmanager and Grafana." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- + +In order to access the web interfaces via the Internet [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) is a popular option. This guide explains, how Kubernetes Ingress can be setup, in order to expose the Prometheus, Alertmanager and Grafana UIs, that are included in the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project. + +Note: before continuing, it is recommended to first get familiar with the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) stack by itself. ## Prerequisites -Apart from a running Kubernetes cluster with a running [kube-prometheus](https://github.com/coreos/kube-prometheus) stack, a Kubernetes Ingress controller must be installed and functional. This guide was tested with the [nginx-ingress-controller](https://github.com/kubernetes/ingress-nginx). If you wish to reproduce the exact result in as depicted in this guide we recommend using the nginx-ingress-controller. +Apart from a running Kubernetes cluster with a running [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) stack, a Kubernetes Ingress controller must be installed and functional. This guide was tested with the [nginx-ingress-controller](https://github.com/kubernetes/ingress-nginx). If you wish to reproduce the exact result in as depicted in this guide we recommend using the nginx-ingress-controller. ## Setting up Ingress diff --git a/docs/kube-prometheus-on-kubeadm.md b/docs/kube-prometheus-on-kubeadm.md index db15c4319c20974f5fbd461f1b837ba66b57a815..37610593986f3a83d5f82dd0cecdbb7f81ba9f93 100644 --- a/docs/kube-prometheus-on-kubeadm.md +++ b/docs/kube-prometheus-on-kubeadm.md @@ -1,15 +1,22 @@ -<br> -<div class="alert alert-info" role="alert"> - <i class="fa fa-exclamation-triangle"></i><b> Note:</b> Starting with v0.12.0, Prometheus Operator requires use of Kubernetes v1.7.x and up. -</div> - -# Kube Prometheus on Kubeadm - -The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the offical way to deploy and manage self-hosted clusters. Kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with Kubeadm. +--- +title: "Deploy to kubeadm" +description: "Deploy kube-prometheus to Kubernets kubeadm." +lead: "Deploy kube-prometheus to Kubernets kubeadm." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 500 +toc: true +--- + +The [kubeadm](https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/) tool is linked by Kubernetes as the offical way to deploy and manage self-hosted clusters. kubeadm does a lot of heavy lifting by automatically configuring your Kubernetes cluster with some common options. This guide is intended to show you how to deploy Prometheus, Prometheus Operator and Kube Prometheus to get you started monitoring your cluster that was deployed with kubeadm. This guide assumes you have a basic understanding of how to use the functionality the Prometheus Operator implements. If you haven't yet, we recommend reading through the [getting started guide](https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/getting-started.md) as well as the [alerting guide](https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/alerting.md). -## Kubeadm Pre-requisites +## kubeadm Pre-requisites This guide assumes you have some familiarity with `kubeadm` or at least have deployed a cluster using `kubeadm`. By default, `kubeadm` does not expose two of the services that we will be monitoring. Therefore, in order to get the most out of the `kube-prometheus` package, we need to make some quick tweaks to the Kubernetes cluster. Since we will be monitoring the `kube-controller-manager` and `kube-scheduler`, we must expose them to the cluster. diff --git a/docs/migration-example/my.release-0.3.jsonnet b/docs/migration-example/my.release-0.3.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..a6a87818dd2c7eef08e4b82f194ed40b5cbe68d7 --- /dev/null +++ b/docs/migration-example/my.release-0.3.jsonnet @@ -0,0 +1,296 @@ +// Has the following customisations +// Custom alert manager config +// Ingresses for the alert manager, prometheus and grafana +// Grafana admin user password +// Custom prometheus rules +// Custom grafana dashboards +// Custom prometheus config - Data retention, memory, etc. +// Node exporter role and role binding so we can use a PSP for the node exporter + + +// External variables +// See https://jsonnet.org/learning/tutorial.html +local cluster_identifier = std.extVar('cluster_identifier'); +local etcd_ip = std.extVar('etcd_ip'); +local etcd_tls_ca = std.extVar('etcd_tls_ca'); +local etcd_tls_cert = std.extVar('etcd_tls_cert'); +local etcd_tls_key = std.extVar('etcd_tls_key'); +local grafana_admin_password = std.extVar('grafana_admin_password'); +local prometheus_data_retention_period = std.extVar('prometheus_data_retention_period'); +local prometheus_request_memory = std.extVar('prometheus_request_memory'); + + +// Derived variables +local alert_manager_host = 'alertmanager.' + cluster_identifier + '.myorg.local'; +local grafana_host = 'grafana.' + cluster_identifier + '.myorg.local'; +local prometheus_host = 'prometheus.' + cluster_identifier + '.myorg.local'; + + +// Imports +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; +local ingress = k.extensions.v1beta1.ingress; +local ingressRule = ingress.mixin.spec.rulesType; +local ingressRuleHttpPath = ingressRule.mixin.http.pathsType; +local ingressTls = ingress.mixin.spec.tlsType; +local role = k.rbac.v1.role; +local roleBinding = k.rbac.v1.roleBinding; +local roleRulesType = k.rbac.v1.role.rulesType; + + +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + + + { + _config+:: { + // Override namespace + namespace: 'monitoring', + + + // Override alert manager config + // See https://github.com/coreos/kube-prometheus/tree/master/examples/alertmanager-config-external.jsonnet + alertmanager+: { + config: importstr 'alertmanager.yaml', + }, + + // Override etcd config + // See https://github.com/coreos/kube-prometheus/blob/master/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet + // See https://github.com/coreos/kube-prometheus/blob/master/examples/etcd-skip-verify.jsonnet + etcd+:: { + clientCA: etcd_tls_ca, + clientCert: etcd_tls_cert, + clientKey: etcd_tls_key, + ips: [etcd_ip], + }, + + // Override grafana config + // anonymous access + // See http://docs.grafana.org/installation/configuration/ + // See http://docs.grafana.org/auth/overview/#anonymous-authentication + // admin_password + // See http://docs.grafana.org/installation/configuration/#admin-password + grafana+:: { + config: { + sections: { + 'auth.anonymous': { + enabled: true, + }, + security: { + admin_password: grafana_admin_password, + }, + }, + }, + + + }, + }, + + // Additional grafana dashboards + grafanaDashboards+:: { + 'my-specific.json': (import 'my-grafana-dashboard-definitions.json'), + }, + + // Alert manager needs an externalUrl + alertmanager+:: { + alertmanager+: { + spec+: { + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/exposing-prometheus-and-alertmanager.md + externalUrl: 'https://' + alert_manager_host, + }, + }, + }, + + + // Add additional ingresses + // See https://github.com/coreos/kube-prometheus/tree/master/examples/ingress.jsonnet + ingress+:: { + alertmanager: + ingress.new() + + + + ingress.mixin.metadata.withName('alertmanager') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(alert_manager_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('alertmanager-operated') + + + ingressRuleHttpPath.mixin.backend.withServicePort(9093) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(alert_manager_host) + ), + + + grafana: + ingress.new() + + + + ingress.mixin.metadata.withName('grafana') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(grafana_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('grafana') + + + ingressRuleHttpPath.mixin.backend.withServicePort(3000) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(grafana_host) + ), + + + prometheus: + ingress.new() + + + + ingress.mixin.metadata.withName('prometheus') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + + ingressRule.withHost(prometheus_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + ingressRuleHttpPath.mixin.backend.withServiceName('prometheus-operated') + + + ingressRuleHttpPath.mixin.backend.withServicePort(9090) + ), + ) + + + + // Note we do not need a TLS secretName here as we are going to use the nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(prometheus_host) + ), + }, + + + // Node exporter PSP role and role binding + // Add a new top level field for this, the "node-exporter" PSP already exists, so not defining here just referencing + // See https://github.com/coreos/prometheus-operator/issues/787 + nodeExporterPSP: { + role: + role.new() + + + + role.mixin.metadata.withName('node-exporter-psp') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRules([ + roleRulesType.new() + + roleRulesType.withApiGroups(['policy']) + + roleRulesType.withResources(['podsecuritypolicies']) + + roleRulesType.withVerbs(['use']) + + roleRulesType.withResourceNames(['node-exporter']), + ]), + + roleBinding: + roleBinding.new() + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + + + roleBinding.mixin.metadata.withName('node-exporter-psp') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + + + roleBinding.mixin.roleRef.withName('node-exporter-psp') + + roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + + + + roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter' }]), + + + }, + + + // Prometheus needs some extra custom config + prometheus+:: { + prometheus+: { + spec+: { + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + externalLabels: { + cluster: cluster_identifier, + }, + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md + // See https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/exposing-prometheus-and-alertmanager.md + externalUrl: 'https://' + prometheus_host, + // Override reuest memory + resources: { + requests: { + memory: prometheus_request_memory, + }, + }, + // Override data retention period + retention: prometheus_data_retention_period, + }, + }, + }, + + + // Additional prometheus rules + // See https://github.com/coreos/kube-prometheus/docs/developing-prometheus-rules-and-grafana-dashboards.md + // cat my-prometheus-rules.yaml | gojsontoyaml -yamltojson | jq . > my-prometheus-rules.json + prometheusRules+:: { + + + groups+: import 'my-prometheus-rules.json', + + + }, + }; + + +// Render +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + + + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + + +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['node-exporter-psp-' + name]: kp.nodeExporterPSP[name] for name in std.objectFields(kp.nodeExporterPSP) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/docs/migration-example/my.release-0.8.jsonnet b/docs/migration-example/my.release-0.8.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..368938b2c4bb0fa8c9c735646cb8e6575ffd9c08 --- /dev/null +++ b/docs/migration-example/my.release-0.8.jsonnet @@ -0,0 +1,316 @@ +// Has the following customisations +// Custom alert manager config +// Ingresses for the alert manager, prometheus and grafana +// Grafana admin user password +// Custom prometheus rules +// Custom grafana dashboards +// Custom prometheus config - Data retention, memory, etc. +// Node exporter role and role binding so we can use a PSP for the node exporter + +// for help with expected content, see https://github.com/thaum-xyz/ankhmorpork + +// External variables +// See https://jsonnet.org/learning/tutorial.html +local cluster_identifier = std.extVar('cluster_identifier'); +local etcd_ip = std.extVar('etcd_ip'); +local etcd_tls_ca = std.extVar('etcd_tls_ca'); +local etcd_tls_cert = std.extVar('etcd_tls_cert'); +local etcd_tls_key = std.extVar('etcd_tls_key'); +local grafana_admin_password = std.extVar('grafana_admin_password'); +local prometheus_data_retention_period = std.extVar('prometheus_data_retention_period'); +local prometheus_request_memory = std.extVar('prometheus_request_memory'); + + +// Derived variables +local alert_manager_host = 'alertmanager.' + cluster_identifier + '.myorg.local'; +local grafana_host = 'grafana.' + cluster_identifier + '.myorg.local'; +local prometheus_host = 'prometheus.' + cluster_identifier + '.myorg.local'; + + +// ksonnet no longer required + + +local kp = + (import 'kube-prometheus/main.libsonnet') + + // kubeadm now achieved by setting platform value - see 9 lines below + (import 'kube-prometheus/addons/static-etcd.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + // Add kubeadm platform-specific items, + // including kube-contoller-manager and kube-scheduler discovery + kubePrometheus+: { + platform: 'kubeadm', + }, + + // Override alert manager config + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/alertmanager-config-external.jsonnet + alertmanager+: { + config: importstr 'alertmanager.yaml', + }, + + // Override etcd config + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/static-etcd.libsonnet + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/etcd-skip-verify.jsonnet + etcd+:: { + clientCA: etcd_tls_ca, + clientCert: etcd_tls_cert, + clientKey: etcd_tls_key, + ips: [etcd_ip], + }, + + // Override grafana config + // anonymous access + // See http://docs.grafana.org/installation/configuration/ + // See http://docs.grafana.org/auth/overview/#anonymous-authentication + // admin_password + // See http://docs.grafana.org/installation/configuration/#admin-password + grafana+:: { + config: { + sections: { + 'auth.anonymous': { + enabled: true, + }, + security: { + admin_password: grafana_admin_password, + }, + }, + }, + // Additional grafana dashboards + dashboards+:: { + 'my-specific.json': (import 'my-grafana-dashboard-definitions.json'), + }, + }, + }, + + + // Alert manager needs an externalUrl + alertmanager+:: { + alertmanager+: { + spec+: { + + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/exposing-prometheus-alertmanager-grafana-ingress.md + externalUrl: 'https://' + alert_manager_host, + }, + }, + }, + + + // Add additional ingresses + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/examples/ingress.jsonnet + ingress+:: { + alertmanager: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'alertmanager', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: alert_manager_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'alertmanager-operated', + port: { + number: 9093, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [alert_manager_host], + }], + }, + }, + grafana: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: grafana_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'grafana', + port: { + number: 3000, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [grafana_host], + }], + }, + }, + prometheus: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'prometheus', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: prometheus_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'prometheus-operated', + port: { + number: 9090, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [prometheus_host], + }], + }, + }, + }, + + + // Node exporter PSP role and role binding + nodeExporter+: { + 'psp-role'+: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'node-exporter-psp', + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['node-exporter'], + }], + }, + 'psp-rolebinding'+: { + + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'node-exporter-psp', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + name: 'node-exporter-psp', + kind: 'Role', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'node-exporter', + }], + }, + }, + + // Prometheus needs some extra custom config + prometheus+:: { + prometheus+: { + spec+: { + + externalLabels: { + cluster: cluster_identifier, + }, + + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/exposing-prometheus-alertmanager-grafana-ingress.md + externalUrl: 'https://' + prometheus_host, + // Override reuest memory + resources: { + requests: { + memory: prometheus_request_memory, + }, + }, + // Override data retention period + retention: prometheus_data_retention_period, + }, + }, + }, + + + // Additional prometheus rules + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/developing-prometheus-rules-and-grafana-dashboards.md#pre-rendered-rules + // cat my-prometheus-rules.yaml | gojsontoyaml -yamltojson | jq . > my-prometheus-rules.json + prometheusMe: { + rules: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + prometheus: 'k8s', + role: 'alert-rules', + }, + }, + spec: { + groups: import 'my-prometheus-rules.json', + }, + }, + }, + }; + + +// Render +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } ++ { ['prometheus-my-' + name]: kp.prometheusMe[name] for name in std.objectFields(kp.prometheusMe) } diff --git a/docs/migration-example/readme.md b/docs/migration-example/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..5e9def04d79c4924a6456de8893aba8b09af161b --- /dev/null +++ b/docs/migration-example/readme.md @@ -0,0 +1,250 @@ +## Example of conversion of a legacy my.jsonnet file + +An example conversion of a legacy custom jsonnet file to release-0.8 +format can be seen by viewing and comparing this +[release-0.3 jsonnet file](./my.release-0.3.jsonnet) (when the github +repo was under `https://github.com/coreos/kube-prometheus...`) +and the corresponding [release-0.8 jsonnet file](./my.release-0.8.jsonnet). + +These two files have had necessary blank lines added so that they +can be compared side-by-side and line-by-line on screen. + +The conversion covers both the change of stopping using ksonnet after +release-0.3 and also the major migration after release-0.7 as described in +[migration-guide.md](../migration-guide.md) + +The sample files are intended as an example of format conversion and +not necessarily best practice for the files in release-0.3 or release-0.8. + +Below are three sample extracts of the conversion as an indication of the +changes required. +<table> +<tr> +<th> release-0.3 </th> +<th> release-0.8 </th> +</tr> +<tr> +<td> + +```jsonnet +local kp = + (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + + + { + _config+:: { + // Override namespace + namespace: 'monitoring', + + + + + + + +``` + +</td> +<td> + +```jsonnet +local kp = + (import 'kube-prometheus/main.libsonnet') + + // kubeadm now achieved by setting platform value - see 9 lines below + (import 'kube-prometheus/addons/static-etcd.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + // Add kubeadm platform-specific items, + // including kube-contoller-manager and kube-scheduler discovery + kubePrometheus+: { + platform: 'kubeadm', + }, +``` + +</td> +</tr> +</table> +<table> +<tr> +<th> release-0.3 </th> +<th> release-0.8 </th> +</tr> +<tr> +<td> + +```jsonnet + // Add additional ingresses + // See https://github.com/coreos/kube-prometheus/... + // tree/master/examples/ingress.jsonnet + ingress+:: { + alertmanager: + ingress.new() + + + + ingress.mixin.metadata.withName('alertmanager') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.metadata.withAnnotations({ + 'kubernetes.io/ingress.class': 'nginx-api', + }) + + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost(alert_manager_host) + + ingressRule.mixin.http.withPaths( + ingressRuleHttpPath.new() + + + + + + ingressRuleHttpPath.mixin.backend + .withServiceName('alertmanager-operated') + + ingressRuleHttpPath.mixin.backend.withServicePort(9093) + ), + ) + + // Note we do not need a TLS secretName here as we are going to use the + // nginx-ingress default secret which is a wildcard + // secretName would need to be in the same namespace at this time, + // see https://github.com/kubernetes/ingress-nginx/issues/2371 + ingress.mixin.spec.withTls( + ingressTls.new() + + ingressTls.withHosts(alert_manager_host) + ), + + +``` + +</td> +<td> + +```jsonnet + // Add additional ingresses + // See https://github.com/prometheus-operator/kube-prometheus/... + // blob/main/examples/ingress.jsonnet + ingress+:: { + alertmanager: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'alertmanager', + namespace: $.values.common.namespace, + annotations: { + 'kubernetes.io/ingress.class': 'nginx-api', + }, + }, + spec: { + rules: [{ + host: alert_manager_host, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'alertmanager-operated', + port: { + number: 9093, + }, + }, + }, + }], + }, + }], + tls: [{ + + hosts: [alert_manager_host], + }], + }, + }, +``` + +</td> +</tr> +</table> +<table> +<tr> +<th> release-0.3 </th> +<th> release-0.8 </th> +</tr> +<tr> +<td> + +```jsonnet + // Additional prometheus rules + // See https://github.com/coreos/kube-prometheus/docs/... + // developing-prometheus-rules-and-grafana-dashboards.md + // + // cat my-prometheus-rules.yaml | \ + // gojsontoyaml -yamltojson | \ + // jq . > my-prometheus-rules.json + prometheusRules+:: { + + + + + + + + + + + + + + + groups+: import 'my-prometheus-rules.json', + + + }, + }; + + + + +``` + +</td> +<td> + +```jsonnet + // Additional prometheus rules + // See https://github.com/prometheus-operator/kube-prometheus/blob/main/... + // docs/developing-prometheus-rules-and-grafana-dashboards.md... + // #pre-rendered-rules + // cat my-prometheus-rules.yaml | \ + // gojsontoyaml -yamltojson | \ + // jq . > my-prometheus-rules.json + prometheusMe: { + rules: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + prometheus: 'k8s', + role: 'alert-rules', + }, + }, + spec: { + groups: import 'my-prometheus-rules.json', + }, + }, + }, + }; + +... + ++ { ['prometheus-my-' + name]: kp.prometheusMe[name] for name in std.objectFields(kp.prometheusMe) } +``` + +</td> +</tr> +</table> diff --git a/docs/migration-guide.md b/docs/migration-guide.md new file mode 100644 index 0000000000000000000000000000000000000000..a33a8b6186fe0785b73b2f0476feac21209730dd --- /dev/null +++ b/docs/migration-guide.md @@ -0,0 +1,87 @@ +# Migration guide from release-0.7 and earlier + +## Why? + +Thanks to our community we identified a lot of short-commings of previous design, varying from issues with global state to UX problems. Hoping to fix at least part of those issues we decided to do a complete refactor of the codebase. + +## Overview + +### Breaking Changes + +- global `_config` object is removed and the new `values` object is a partial replacement +- `imageRepos` field was removed and the project no longer tries to compose image strings. Use `$.values.common.images` to override default images. +- prometheus alerting and recording rules are split into multiple `PrometheusRule` objects +- kubernetes control plane ServiceMonitors and Services are now part of the new `kubernetesControlPlane` top-level object instead of `prometheus` object +- `jsonnet/kube-prometheus/kube-prometheus.libsonnet` file was renamed to `jsonnet/kube-prometheus/main.libsonnet` and slimmed down to bare minimum +- `jsonnet/kube-prometheus/kube-prometheus*-.libsonnet` files were move either to `jsonnet/kube-prometheus/addons/` or `jsonnet/kube-prometheus/platforms/` depending on the feature they provided +- all component libraries are now function- and not object-based +- monitoring-mixins are included inside each component and not globally. `prometheusRules`, `prometheusAlerts`, and `grafanaDashboards` are accessible only per component via `mixin` object (ex. `$.alertmanager.mixin.prometheusAlerts`) +- default repository branch changed from `master` to `main` +- labels on resources have changes, `kubectl apply` will not work correctly due to those field being immutable. Deleting the resource first before applying is a workaround if you are using the kubectl CLI. (This only applies to `Deployments` and `DaemonSets`.) + +### New Features + +- concept of `addons`, `components`, and `platforms` was introduced +- all main `components` are now represented internally by a function with default values and required parameters (see #Component-configuration for more information) +- `$.values` holds main configuration parameters and should be used to set basic stack configuration. +- common parameters across all `components` are stored now in `$.values.common` +- removed dependency on deprecated ksonnet library + +## Details + +### Components, Addons, Platforms + +Those concepts were already present in the repository but it wasn't clear which file is holding what. After refactoring we categorized jsonnet code into 3 buckets and put them into separate directories: +- `components` - main building blocks for kube-prometheus, written as functions responsible for creating multiple objects representing kubernetes manifests. For example all objects for node_exporter deployment are bundled in `components/node_exporter.libsonnet` library +- `addons` - everything that can enhance kube-prometheus deployment. Those are small snippets of code adding a small feature, for example adding anti-affinity to pods via [`addons/anti-affinity.libsonnet`][antiaffinity]. Addons are meant to be used in object-oriented way like `local kp = (import 'kube-prometheus/main.libsonnet') + (import 'kube-prometheus/addons/all-namespaces.libsonnet')` +- `platforms` - currently those are `addons` specialized to allow deploying kube-prometheus project on a specific platform. + +### Component configuration + +Refactoring main components to use functions allowed us to define APIs for said components. Each function has a default set of parameters that can be overridden or that are required to be set by a user. Those default parameters are represented in each component by `defaults` map at the top of each library file, for example in [`node_exporter.libsonnet`][node_exporter_defaults_example]. + +This API is meant to ease the use of kube-prometheus as parameters can be passed from a JSON file and don't need to be in jsonnet format. However, if you need to modify particular parts of the stack, jsonnet allows you to do this and we are also not restricting such access in any way. An example of such modifications can be seen in any of our `addons`, like the [`addons/anti-affinity.libsonnet`][antiaffinity] one. + +### Mixin integration + +Previously kube-prometheus project joined all mixins on a global level. However with a wider adoption of monitoring mixins this turned out to be a problem, especially apparent when two mixins started to use the same configuration field for different purposes. To fix this we moved all mixins into their own respective components: +- alertmanager mixin -> `alertmanager.libsonnet` +- kubernetes mixin -> `k8s-control-plane.libsonnet` +- kube-state-metrics mixin -> `kube-state-metrics.libsonnet` +- node_exporter mixin -> `node_exporter.libsonnet` +- prometheus and thanos sidecar mixins -> `prometheus.libsonnet` +- prometheus-operator mixin -> `prometheus-operator.libsonnet` +- kube-prometheus alerts and rules -> `components/mixin/custom.libsonnet` + +> etcd mixin is a special case as we add it inside an `addon` in `addons/static-etcd.libsonnet` + +This results in creating multiple `PrometheusRule` objects instead of having one giant object as before. It also means each mixin is configured separately and accessing mixin objects is done via `$.<component>.mixin`. + +## Examples + +All examples from `examples/` directory were adapted to the new codebase. [Please take a look at them for guideance](https://github.com/prometheus-operator/kube-prometheus/tree/main/examples) + +## Legacy migration + +An example of conversion of a legacy release-0.3 my.jsonnet file to release-0.8 can be found in [migration-example](./migration-example) + +## Advanced usage examples + +For more advanced usage examples you can take a look at those two, open to public, implementations: +- [thaum-xyz/ankhmorpork][thaum] - extending kube-prometheus to adapt to a required environment +- [openshift/cluster-monitoring-operator][openshift] - using kube-prometheus components as standalone libraries to build a custom solution + +## Final note + +Refactoring was a huge undertaking and possibly this document didn't describe in enough detail how to help you with migration to the new stack. If that is the case, please reach out to us by using [GitHub discussions][discussions] feature or directly on [#prometheus-operator kubernetes slack channel][slack]. + + +[antiaffinity]: https://github.com/prometheus-operator/kube-prometheus/blob/main/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet + +[node_exporter_defaults_example]: https://github.com/prometheus-operator/kube-prometheus/blob/1d2a0e275af97948667777739a18b24464480dc8/jsonnet/kube-prometheus/components/node-exporter.libsonnet#L3-L34 + +[openshift]: https://github.com/openshift/cluster-monitoring-operator/pull/1044 +[thaum]: https://github.com/thaum-xyz/ankhmorpork/blob/master/apps/monitoring/jsonnet + +[discussions]: https://github.com/prometheus-operator/kube-prometheus/discussions +[slack]: http://slack.k8s.io/ diff --git a/docs/monitoring-external-etcd.md b/docs/monitoring-external-etcd.md index f70718511c4b6ca9702b2efcd413134d72e5893b..6ddecb1876a744ae5db74b841a153d79f2075955 100644 --- a/docs/monitoring-external-etcd.md +++ b/docs/monitoring-external-etcd.md @@ -1,5 +1,18 @@ -# Monitoring external etcd -This guide will help you monitor an external etcd cluster. When the etcd cluster is not hosted inside Kubernetes. +--- +title: "Monitoring external etcd" +description: "This guide will help you monitor an external etcd cluster." +lead: "This guide will help you monitor an external etcd cluster." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 640 +toc: true +--- + +When the etcd cluster is not hosted inside Kubernetes. This is often the case with Kubernetes setups. This approach has been tested with kube-aws but the same principals apply to other tools. Note that [etcd.jsonnet](../examples/etcd.jsonnet) & [kube-prometheus-static-etcd.libsonnet](../jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet) (which are described by a section of the [Readme](../README.md#static-etcd-configuration)) do the following: diff --git a/docs/monitoring-other-namespaces.md b/docs/monitoring-other-namespaces.md index 51f21201b58f3d2353a6ef6fca78a9a3973a2a18..dc111b69d42059ff16e7917002e234ca265e322f 100644 --- a/docs/monitoring-other-namespaces.md +++ b/docs/monitoring-other-namespaces.md @@ -1,4 +1,17 @@ -# Monitoring other Kubernetes Namespaces +--- +title: "Monitoring other Namespaces" +description: "This guide will help you monitor applications in other Namespaces." +lead: "This guide will help you monitor applications in other Namespaces." +date: 2021-03-08T23:04:32+01:00 +draft: false +images: [] +menu: + docs: + parent: "kube" +weight: 640 +toc: true +--- + This guide will help you monitor applications in other Namespaces. By default the RBAC rules are only enabled for the `Default` and `kube-system` Namespace during Install. # Setup @@ -7,7 +20,7 @@ This is done in the variable `prometheus.roleSpecificNamespaces`. You usually se Example to create the needed `Role` and `RoleBinding` for the Namespace `foo` : ``` -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { +local kp = (import 'kube-prometheus/main.libsonnet') + { _config+:: { namespace: 'monitoring', diff --git a/docs/weave-net-support.md b/docs/weave-net-support.md index 9924434a6c98d6ccb283ccaf6a2384aa44f34109..e8c2dfce4109a155fefaa254251a9ccc9149a684 100644 --- a/docs/weave-net-support.md +++ b/docs/weave-net-support.md @@ -17,36 +17,42 @@ Using kube-prometheus and kubectl you will be able install the following for mon [embedmd]:# (../examples/weave-net-example.jsonnet) ```jsonnet -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'weave-net' then - group { - rules: std.map( - function(rule) - if rule.alert == 'WeaveNetFastDPFlowsLow' then - rule { - expr: 'sum(weave_flows) < 20000', - } - else if rule.alert == 'WeaveNetIPAMUnreachable' then - rule { - expr: 'weave_ipam_unreachable_percentage > 25', - } - else - rule - , - group.rules - ), - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRuleWeaveNet+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'weave-net' then + group { + rules: std.map( + function(rule) + if rule.alert == 'WeaveNetFastDPFlowsLow' then + rule { + expr: 'sum(weave_flows) < 20000', + } + else if rule.alert == 'WeaveNetIPAMUnreachable' then + rule { + expr: 'weave_ipam_unreachable_percentage > 25', + } + else + rule + , + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; diff --git a/docs/windows.md b/docs/windows.md new file mode 100644 index 0000000000000000000000000000000000000000..dcdc2be9af41db17ad84259f57b5f039ae2dbcaf --- /dev/null +++ b/docs/windows.md @@ -0,0 +1,22 @@ +# Windows + +The [Windows addon](../examples/windows.jsonnet) adds the dashboards and rules from [kubernetes-monitoring/kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin#dashboards-for-windows-nodes). + +Currently, Windows does not support running with [windows_exporter](https://github.com/prometheus-community/windows_exporter) in a pod so this add on uses [additional scrape configuration](https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/additional-scrape-config.md) to set up a static config to scrape the node ports where windows_exporter is configured. + + +The addon requires you to specify the node ips and ports where it can find the windows_exporter. See the [full example](../examples/windows.jsonnet) for setup. + +``` +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/windows.libsonnet') + + { + values+:: { + windowsScrapeConfig+:: { + static_configs: { + targets: ["10.240.0.65:5000", "10.240.0.63:5000"], + }, + }, + }, + }; +``` diff --git a/example.jsonnet b/example.jsonnet index 54de1e3554854b8a8142aeb035a95b704752a63a..b181d64773d6e52ef362b36c3e3ecf12f69e1965 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -1,28 +1,34 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; -{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + -// serviceMonitor is separated so that it can be created after the CRDs are ready +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/examples/additional-namespaces-servicemonitor.jsonnet b/examples/additional-namespaces-servicemonitor.jsonnet index 0f3add96d462392f903fbf64276d3c42f822f459..ec978da7806e058fab93070d2547f558893798dd 100644 --- a/examples/additional-namespaces-servicemonitor.jsonnet +++ b/examples/additional-namespaces-servicemonitor.jsonnet @@ -1,11 +1,13 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, prometheus+:: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, - prometheus+:: { + exampleApplication: { serviceMonitorMyNamespace: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -37,4 +39,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/additional-namespaces.jsonnet b/examples/additional-namespaces.jsonnet index 957fd9124741ef75acf4541fe5bfaeea52e4b55d..45c606a6022487ca5076a11a9342a07f8220eb85 100644 --- a/examples/additional-namespaces.jsonnet +++ b/examples/additional-namespaces.jsonnet @@ -1,8 +1,10 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, - prometheus+:: { + prometheus+: { namespaces+: ['my-namespace', 'my-second-namespace'], }, }, diff --git a/examples/alertmanager-config-external.jsonnet b/examples/alertmanager-config-external.jsonnet index c2b34ccaa36892ce5a0fa28ff1947083d5d93167..49f6b558dd87e9ecf843e636aec878efe8ead8ad 100644 --- a/examples/alertmanager-config-external.jsonnet +++ b/examples/alertmanager-config-external.jsonnet @@ -1,5 +1,5 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: importstr 'alertmanager-config.yaml', }, diff --git a/examples/alertmanager-config.jsonnet b/examples/alertmanager-config.jsonnet index f08dbe19e333073d6553fda87bf5b4671787c0ac..9702711aead46d011c90e097dcb818d826bf8c66 100644 --- a/examples/alertmanager-config.jsonnet +++ b/examples/alertmanager-config.jsonnet @@ -1,5 +1,5 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { +((import 'kube-prometheus/main.libsonnet') + { + values+:: { alertmanager+: { config: ||| global: diff --git a/examples/all-namespaces.jsonnet b/examples/all-namespaces.jsonnet index 7c5d149f1473ae4a856bbf386845562b594aa15f..52534766efa4d2b3f7dc4741a72936d7f7f8340a 100644 --- a/examples/all-namespaces.jsonnet +++ b/examples/all-namespaces.jsonnet @@ -1,9 +1,10 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-all-namespaces.libsonnet') + { - _config+:: { - namespace: 'monitoring', - - prometheus+:: { +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { namespaces: [], }, }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet b/examples/anti-affinity.jsonnet similarity index 66% rename from jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet rename to examples/anti-affinity.jsonnet index ad13373fda32dadc392e6f88df587ccc74de2e1c..23720837ab936d8f9ef2c05271262685816d98cb 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-ksonnet.libsonnet +++ b/examples/anti-affinity.jsonnet @@ -1,5 +1,13 @@ -local kp = (import './kube-prometheus/kube-prometheus.libsonnet'); +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/anti-affinity.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + diff --git a/examples/changing-default-rules.libsonnet b/examples/changing-default-rules.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..1a0c192b9041da8d15af7f6b6db1a75efda49acc --- /dev/null +++ b/examples/changing-default-rules.libsonnet @@ -0,0 +1,92 @@ +local filter = { + kubernetesControlPlane+: { + prometheusRule+:: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.filter( + function(rule) + rule.alert != 'KubeStatefulSetReplicasMismatch', + group.rules + ), + } + else + group, + super.groups + ), + }, + }, + }, +}; +local update = { + kubernetesControlPlane+: { + prometheusRule+:: { + spec+: { + groups: std.map( + function(group) + if group.name == 'kubernetes-apps' then + group { + rules: std.map( + function(rule) + if rule.alert == 'KubePodCrashLooping' then + rule { + expr: 'rate(kube_pod_container_status_restarts_total{namespace=kube-system,job="kube-state-metrics"}[10m]) * 60 * 5 > 0', + } + else + rule, + group.rules + ), + } + else + group, + super.groups + ), + }, + }, + }, +}; + +local add = { + exampleApplication:: { + prometheusRule+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'example-application-rules', + namespace: $.values.common.namespace, + }, + spec: (import 'existingrule.json'), + }, + }, +}; +local kp = (import 'kube-prometheus/main.libsonnet') + + filter + + update + + add + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, +}; + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } + +{ ['exampleApplication-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet b/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet index 8600d818df6065823be4252f871f9b4902ffd471..94089e0c948155a390b595f130d5f16eae691cae 100644 --- a/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet +++ b/examples/continuous-delivery/argocd/kube-prometheus/argocd-basic.jsonnet @@ -1,7 +1,8 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet index dcebf6ddcd56240e26d66a84eb378e4566cf003d..1b37af501225eb85a49b1eb40fe55e4aa6b8a62a 100644 --- a/examples/eks-cni-example.jsonnet +++ b/examples/eks-cni-example.jsonnet @@ -1,20 +1,28 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + kubePrometheus+: { + platform: 'eks', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + kubernetesControlPlane+: { + prometheusRuleEksCNI+: { + spec+: { + groups+: [ { - record: 'aws_eks_available_ip', - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + }, + ], }, ], }, - ], + }, }, }; diff --git a/examples/etcd-skip-verify.jsonnet b/examples/etcd-skip-verify.jsonnet index 603ba710e48a5cefd0c2c80df0796fb2e5ce5484..9982fa1699eb6ebf995f1bb927362f977511cec1 100644 --- a/examples/etcd-skip-verify.jsonnet +++ b/examples/etcd-skip-verify.jsonnet @@ -1,8 +1,9 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', - +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/static-etcd.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, etcd+:: { ips: ['127.0.0.1'], clientCA: importstr 'etcd-client-ca.crt', diff --git a/examples/etcd.jsonnet b/examples/etcd.jsonnet index 03d390cddb69fadd3c575bc97d14557def5fa531..7126ee314b330a418790056ca1b2ed5453c92193 100644 --- a/examples/etcd.jsonnet +++ b/examples/etcd.jsonnet @@ -1,10 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/static-etcd.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, // Reference info: https://github.com/coreos/kube-prometheus/blob/master/README.md#static-etcd-configuration - etcd+:: { + etcd+: { // Configure this to be the IP(s) to scrape - i.e. your etcd node(s) (use commas to separate multiple values). ips: ['127.0.0.1'], diff --git a/examples/existingrule.json b/examples/existingrule.json index 41d6620b87749b68f9aa222b9582a27ac0469068..8486eb7d721506133c2720706f48892e6190e2d4 100644 --- a/examples/existingrule.json +++ b/examples/existingrule.json @@ -1 +1 @@ -{"groups":[{"name":"example-group","rules":[{"alert":"Watchdog","annotations":{"description":"This is a Watchdog meant to ensure that the entire alerting pipeline is functional."},"expr":"vector(1)","labels":{"severity":"none"}}]}]} \ No newline at end of file +{"groups":[{"name":"example-group","rules":[{"alert":"ExampleAlert","annotations":{"description":"This is an example alert."},"expr":"vector(1)","labels":{"severity":"warning"}}]}]} \ No newline at end of file diff --git a/examples/existingrule.yaml b/examples/existingrule.yaml index 6a67032f96accb758e0ab108c1c72be037b7efe4..ab5de27041282bd0fe173bc8ad0e8401811c5e29 100644 --- a/examples/existingrule.yaml +++ b/examples/existingrule.yaml @@ -1,9 +1,9 @@ groups: - name: example-group rules: - - alert: Watchdog + - alert: ExampleAlert expr: vector(1) labels: - severity: "none" + severity: "warning" annotations: - description: This is a Watchdog meant to ensure that the entire alerting pipeline is functional. + description: This is an example alert. diff --git a/examples/grafana-additional-jsonnet-dashboard-example.jsonnet b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet index b9b26fca552e983ab74d3ea2505240cb0892700e..8c1973caa3a4896b28c6ba42981fdc05ac2322aa 100644 --- a/examples/grafana-additional-jsonnet-dashboard-example.jsonnet +++ b/examples/grafana-additional-jsonnet-dashboard-example.jsonnet @@ -5,35 +5,37 @@ local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - dashboards+:: { - 'my-dashboard.json': - dashboard.new('My Dashboard') - .addTemplate( - { - current: { - text: 'Prometheus', - value: 'Prometheus', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { + 'my-dashboard.json': + dashboard.new('My Dashboard') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', }, - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addRow( - row.new() - .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') - .addTarget(prometheus.target('vector(1)'))) - ), + ) + .addRow( + row.new() + .addPanel(graphPanel.new('My Panel', span=6, datasource='$datasource') + .addTarget(prometheus.target('vector(1)'))) + ), + }, }, }, }; diff --git a/examples/grafana-additional-rendered-dashboard-example-2.jsonnet b/examples/grafana-additional-rendered-dashboard-example-2.jsonnet index 7d0926e99c05885cc1a0dbf1950c50b59db1a4d3..7b8825d759f38329f9471129aba6d3480385f4e8 100644 --- a/examples/grafana-additional-rendered-dashboard-example-2.jsonnet +++ b/examples/grafana-additional-rendered-dashboard-example-2.jsonnet @@ -1,10 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafana+:: { - rawDashboards+:: { - 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + rawDashboards+:: { + 'my-dashboard.json': (importstr 'example-grafana-dashboard.json'), + }, }, }, }; diff --git a/examples/grafana-additional-rendered-dashboard-example.jsonnet b/examples/grafana-additional-rendered-dashboard-example.jsonnet index 883c609700a8e9c6ccdf19ad6d068d9a55c778a5..1a8c8b9a731a0218a2a4d8f63ad3c6af9e17b895 100644 --- a/examples/grafana-additional-rendered-dashboard-example.jsonnet +++ b/examples/grafana-additional-rendered-dashboard-example.jsonnet @@ -1,13 +1,12 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, - grafanaDashboards+:: { // monitoring-mixin compatibility - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), - }, - grafana+:: { - dashboards+:: { // use this method to import your dashboards to Grafana - 'my-dashboard.json': (import 'example-grafana-dashboard.json'), +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+:: { + namespace: 'monitoring', + }, + grafana+: { + dashboards+:: { // use this method to import your dashboards to Grafana + 'my-dashboard.json': (import 'example-grafana-dashboard.json'), + }, }, }, }; diff --git a/examples/grafana-only-dashboards.jsonnet b/examples/grafana-only-dashboards.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..2c5520c0998ba28df416856845628eb0a609d883 --- /dev/null +++ b/examples/grafana-only-dashboards.jsonnet @@ -0,0 +1,25 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + }, + + // Disable all grafana-related objects apart from dashboards and datasource + grafana: { + dashboardSources:: {}, + deployment:: {}, + serviceAccount:: {}, + serviceMonitor:: {}, + service:: {}, + }, + }; + +// Manifestation +{ + [component + '-' + resource + '.json']: kp[component][resource] + for component in std.objectFields(kp) + for resource in std.objectFields(kp[component]) +} diff --git a/examples/ingress.jsonnet b/examples/ingress.jsonnet index 023af577bafad7dddf6a76184108138861cb47d3..b1197c5d36aa2837f546bcfc87b1cb44c84df336 100644 --- a/examples/ingress.jsonnet +++ b/examples/ingress.jsonnet @@ -14,10 +14,12 @@ local ingress(name, namespace, rules) = { }; local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, grafana+:: { config+: { sections+: { @@ -47,15 +49,19 @@ local kp = ingress+:: { 'alertmanager-main': ingress( 'alertmanager-main', - $._config.namespace, + $.values.common.namespace, [{ host: 'alertmanager.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'alertmanager-main', - port: 'web', + port: { + name: 'web', + }, }, }, }], @@ -64,15 +70,19 @@ local kp = ), grafana: ingress( 'grafana', - $._config.namespace, + $.values.common.namespace, [{ host: 'grafana.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'grafana', - port: 'http', + port: { + name: 'http', + }, }, }, }], @@ -81,15 +91,19 @@ local kp = ), 'prometheus-k8s': ingress( 'prometheus-k8s', - $._config.namespace, + $.values.common.namespace, [{ host: 'prometheus.example.com', http: { paths: [{ + path: '/', + pathType: 'Prefix', backend: { service: { name: 'prometheus-k8s', - port: 'web', + port: { + name: 'web', + }, }, }, }], @@ -105,7 +119,7 @@ local kp = kind: 'Secret', metadata: { name: 'basic-auth', - namespace: $._config.namespace, + namespace: $.values.common.namespace, }, data: { auth: std.base64(importstr 'auth') }, type: 'Opaque', diff --git a/examples/internal-registry.jsonnet b/examples/internal-registry.jsonnet index f1d1e8ac9e49774b16cc4b97a5803f67b0f790c2..fc470de61da6e5e140aafa53fdd8f172b4630539 100644 --- a/examples/internal-registry.jsonnet +++ b/examples/internal-registry.jsonnet @@ -1,7 +1,9 @@ -local mixin = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local mixin = import 'kube-prometheus/addons/config-mixins.libsonnet'; +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, } + mixin.withImageRepository('internal-registry.com/organization'); diff --git a/examples/jsonnet-snippets/bootkube.jsonnet b/examples/jsonnet-snippets/bootkube.jsonnet deleted file mode 100644 index f7386a012ab6e8594d6d833aee3264e5e40caf7d..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/bootkube.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-bootkube.libsonnet') diff --git a/examples/jsonnet-snippets/kops-coredns.jsonnet b/examples/jsonnet-snippets/kops-coredns.jsonnet deleted file mode 100644 index 6ba445dff751e10c8c52e2c7c57dd08eb43cd6e9..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/kops-coredns.jsonnet +++ /dev/null @@ -1,3 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') diff --git a/examples/jsonnet-snippets/kops.jsonnet b/examples/jsonnet-snippets/kops.jsonnet deleted file mode 100644 index 4ff9ceaea1f8dca3b474d0cb5d1f4e6d895a71ea..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/kops.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kops.libsonnet') diff --git a/examples/jsonnet-snippets/kube-aws.jsonnet b/examples/jsonnet-snippets/kube-aws.jsonnet deleted file mode 100644 index b0842eb24e1dde19565afd6ae88ff52dab819b26..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/kube-aws.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kube-aws.libsonnet') diff --git a/examples/jsonnet-snippets/kubeadm.jsonnet b/examples/jsonnet-snippets/kubeadm.jsonnet deleted file mode 100644 index a7837163e4ffe9cae140382eae63dde0d2395321..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/kubeadm.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') diff --git a/examples/jsonnet-snippets/kubespray.jsonnet b/examples/jsonnet-snippets/kubespray.jsonnet deleted file mode 100644 index 1665cf7285921d27f4e445c68087ff6cd8a175f4..0000000000000000000000000000000000000000 --- a/examples/jsonnet-snippets/kubespray.jsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-kubespray.libsonnet') diff --git a/examples/jsonnet-snippets/node-ports.jsonnet b/examples/jsonnet-snippets/node-ports.jsonnet index c02f1ae729edd7f8fb9df8f39d8a8f2a68c600c7..abc70c94b4832887ee564ac697f82ea8ff382e9f 100644 --- a/examples/jsonnet-snippets/node-ports.jsonnet +++ b/examples/jsonnet-snippets/node-ports.jsonnet @@ -1,2 +1,2 @@ -(import 'kube-prometheus/kube-prometheus.libsonnet') + -(import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') +(import 'kube-prometheus/main.libsonnet') + +(import 'kube-prometheus/addons/node-ports.libsonnet') diff --git a/examples/jsonnet-snippets/platform.jsonnet b/examples/jsonnet-snippets/platform.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..e3a588049d91d6fe82ad7de69713f377e3b30553 --- /dev/null +++ b/examples/jsonnet-snippets/platform.jsonnet @@ -0,0 +1,8 @@ +(import 'kube-prometheus/main.libsonnet') + +{ + values+:: { + common+: { + platform: 'example-platform', + }, + }, +} diff --git a/examples/ksonnet-example.jsonnet b/examples/ksonnet-example.jsonnet index eadd2cb79c7309558abece9a1a05747cf7359366..36640ab4309282810785c58cb6a1c91e9f59bdd5 100644 --- a/examples/ksonnet-example.jsonnet +++ b/examples/ksonnet-example.jsonnet @@ -1,4 +1,4 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { nodeExporter+: { daemonset+: { metadata+: { diff --git a/examples/kubeProxy.jsonnet b/examples/kubeProxy.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..03a7b3c74b5c81f116c729f7e888a835ce6a08ab --- /dev/null +++ b/examples/kubeProxy.jsonnet @@ -0,0 +1,20 @@ +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, + + kubernetesControlPlane+: { + kubeProxy: true, + }, + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } diff --git a/examples/kustomize.jsonnet b/examples/kustomize.jsonnet index 38dd6c89d66559beff0c949545c2f1e5b40cccbf..455b38bd77eb1ef1a912801cc8cc3cdb1ff2cb6b 100644 --- a/examples/kustomize.jsonnet +++ b/examples/kustomize.jsonnet @@ -1,26 +1,32 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', + (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; local manifests = // Uncomment line below to enable vertical auto scaling of kube-state-metrics //{ ['ksm-autoscaler-' + name]: kp.ksmAutoscaler[name] for name in std.objectFields(kp.ksmAutoscaler) } + - { ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + + { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) } + - // serviceMonitor is separated so that it can be created after the CRDs are ready + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + - { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }; + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) }; local kustomizationResourceFile(name) = './manifests/' + name + '.yaml'; local kustomization = { diff --git a/examples/minikube.jsonnet b/examples/minikube.jsonnet index 3073612a4849c849d0e366c366795edbed243c7b..c5a1bc685f8ebb0a4165571cec87fa8ecbbd64f1 100644 --- a/examples/minikube.jsonnet +++ b/examples/minikube.jsonnet @@ -1,15 +1,16 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Note that NodePort type services is likely not a good idea for your production use case, it is only used for demonstration purposes here. - (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + + (import 'kube-prometheus/addons/node-ports.libsonnet') + { - _config+:: { - namespace: 'monitoring', - alertmanager+:: { + values+:: { + common+: { + namespace: 'monitoring', + }, + alertmanager+: { config: importstr 'alertmanager-config.yaml', }, - grafana+:: { + grafana+: { config: { // http://docs.grafana.org/installation/configuration/ sections: { // Do not require grafana users to login/authenticate @@ -17,12 +18,15 @@ local kp = }, }, }, + kubePrometheus+: { + platform: 'kubeadm', + }, }, // For simplicity, each of the following values for 'externalUrl': // * assume that `minikube ip` prints "192.168.99.100" // * hard-code the NodePort for each app - prometheus+:: { + prometheus+: { prometheus+: { // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec spec+: { @@ -38,7 +42,7 @@ local kp = }, }, }, - alertmanager+:: { + alertmanager+: { alertmanager+: { // Reference info: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#alertmanagerspec spec+: { diff --git a/examples/mixin-inclusion.jsonnet b/examples/mixin-inclusion.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..fc75c6283426aa1a077929152b2926a9e661f7af --- /dev/null +++ b/examples/mixin-inclusion.jsonnet @@ -0,0 +1,30 @@ +local addMixin = (import 'kube-prometheus/lib/mixin.libsonnet'); +local etcdMixin = addMixin({ + name: 'etcd', + mixin: (import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + _config+: {}, // mixin configuration object + }, +}); + +local kp = (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + grafana+: { + // Adding new dashboard to grafana. This will modify grafana configMap with dashboards + dashboards+: etcdMixin.grafanaDashboards, + }, + }, + }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +// Rendering prometheusRules object. This is an object compatible with prometheus-operator CRD definition for prometheusRule +{ 'external-mixins/etcd-mixin-prometheus-rules': etcdMixin.prometheusRules } diff --git a/examples/pod-security-policies.jsonnet b/examples/pod-security-policies.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..3274c93727287d014a0e5c4d5e966152157c5f26 --- /dev/null +++ b/examples/pod-security-policies.jsonnet @@ -0,0 +1,23 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/podsecuritypolicies.libsonnet'); + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +// Add the restricted psp to setup +{ 'setup/0podsecuritypolicy-restricted': kp.restrictedPodSecurityPolicy } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/examples/prometheus-additional-alert-rule-example.jsonnet b/examples/prometheus-additional-alert-rule-example.jsonnet index 6e63382e20e97b43d9342562cda2c0981706b766..43dc538c158e49561fadb3a581d05e35244d0f78 100644 --- a/examples/prometheus-additional-alert-rule-example.jsonnet +++ b/examples/prometheus-additional-alert-rule-example.jsonnet @@ -1,25 +1,37 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - alert: 'Watchdog', - expr: 'vector(1)', - labels: { - severity: 'none', - }, - annotations: { - description: 'This is a Watchdog meant to ensure that the entire alerting pipeline is functional.', - }, + name: 'example-group', + rules: [ + { + alert: 'ExampleAlert', + expr: 'vector(1)', + labels: { + severity: 'warning', + }, + annotations: { + description: 'This is an example alert.', + }, + }, + ], }, ], }, - ], + }, }, }; @@ -30,4 +42,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-additional-recording-rule-example.jsonnet b/examples/prometheus-additional-recording-rule-example.jsonnet index 132bd0db0f4a1b891385c455064c71068d35a94b..5e67b03f018e95b2afad9a2f0e40ef62af4e3d99 100644 --- a/examples/prometheus-additional-recording-rule-example.jsonnet +++ b/examples/prometheus-additional-recording-rule-example.jsonnet @@ -1,19 +1,31 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusRules+:: { - groups+: [ - { - name: 'example-group', - rules: [ + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ { - record: 'some_recording_rule_name', - expr: 'vector(1)', + name: 'example-group', + rules: [ + { + record: 'some_recording_rule_name', + expr: 'vector(1)', + }, + ], }, ], }, - ], + }, }, }; @@ -24,4 +36,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-additional-rendered-rule-example.jsonnet b/examples/prometheus-additional-rendered-rule-example.jsonnet index b6d39a4052e168bc7f46fc1278eb32d9c48fe482..66c7937c15bdbdc12cf4106fb594aee8431877c3 100644 --- a/examples/prometheus-additional-rendered-rule-example.jsonnet +++ b/examples/prometheus-additional-rendered-rule-example.jsonnet @@ -1,9 +1,21 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups+: (import 'existingrule.json').groups, + exampleApplication: { + prometheusRuleExample: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'my-prometheus-rule', + namespace: $.values.common.namespace, + }, + spec: { + groups: (import 'existingrule.json').groups, + }, + }, }, }; @@ -14,4 +26,5 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + { ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['example-application-' + name]: kp.exampleApplication[name] for name in std.objectFields(kp.exampleApplication) } diff --git a/examples/prometheus-name-override.jsonnet b/examples/prometheus-name-override.jsonnet index 862180129d9f3440ccc158ea07e33bc440b22561..b6c3906059b88c938d60e84dd1a7f603fbfadbc5 100644 --- a/examples/prometheus-name-override.jsonnet +++ b/examples/prometheus-name-override.jsonnet @@ -1,4 +1,4 @@ -((import 'kube-prometheus/kube-prometheus.libsonnet') + { +((import 'kube-prometheus/main.libsonnet') + { prometheus+: { prometheus+: { metadata+: { diff --git a/examples/prometheus-pvc.jsonnet b/examples/prometheus-pvc.jsonnet index e753499e17b273a588875332634af842410acc2e..5beffd84a51054a48927162c5d2f1a074fa12243 100644 --- a/examples/prometheus-pvc.jsonnet +++ b/examples/prometheus-pvc.jsonnet @@ -1,14 +1,15 @@ local kp = - (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/main.libsonnet') + // Uncomment the following imports to enable its patches - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-node-ports.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet') + + // (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + // (import 'kube-prometheus/addons/managed-cluster.libsonnet') + + // (import 'kube-prometheus/addons/node-ports.libsonnet') + + // (import 'kube-prometheus/addons/static-etcd.libsonnet') + { - _config+:: { - namespace: 'monitoring', + values+:: { + common+: { + namespace: 'monitoring', + }, }, prometheus+:: { diff --git a/examples/strip-limits.jsonnet b/examples/strip-limits.jsonnet index 69912b648d471f4a97aad3a012436f65ba80dd1f..fc43e286066a0409388e5edd70c0c51ab9481aa6 100644 --- a/examples/strip-limits.jsonnet +++ b/examples/strip-limits.jsonnet @@ -1,7 +1,9 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-strip-limits.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/strip-limits.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, }; diff --git a/examples/thanos-sidecar.jsonnet b/examples/thanos-sidecar.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..7ed55036633be6941f9770357a7330cb33f9a899 --- /dev/null +++ b/examples/thanos-sidecar.jsonnet @@ -0,0 +1,33 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + prometheus+: { + thanos: { + version: '0.19.0', + image: 'quay.io/thanos/thanos:v0.19.0', + objectStorageConfig: { + key: 'thanos.yaml', // How the file inside the secret is called + name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config + }, + }, + }, + }, + }; + +{ ['setup/0namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor is separated so that it can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/examples/tolerations.libsonnet b/examples/tolerations.libsonnet index 35776f397b93813d56f709666014a4b703bd45da..f244cebf3a5e1d6684b46e678876a38ca60eb360 100644 --- a/examples/tolerations.libsonnet +++ b/examples/tolerations.libsonnet @@ -1,23 +1,19 @@ { - _config+:: { - tolerations+:: [ - { - key: 'key1', - operator: 'Equal', - value: 'value1', - effect: 'NoSchedule', - }, - { - key: 'key2', - operator: 'Exists', - }, - ], - }, - prometheus+: { prometheus+: { spec+: { - tolerations: [t for t in $._config.tolerations], + tolerations: [ + { + key: 'key1', + operator: 'Equal', + value: 'value1', + effect: 'NoSchedule', + }, + { + key: 'key2', + operator: 'Exists', + }, + ], }, }, }, diff --git a/examples/weave-net-example.jsonnet b/examples/weave-net-example.jsonnet index c6cc733c27a465b5ddd169d0314451d7d328950a..eeeb622df0ed30eab3a985f5302f1aa588b7f317 100644 --- a/examples/weave-net-example.jsonnet +++ b/examples/weave-net-example.jsonnet @@ -1,33 +1,39 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - (import 'kube-prometheus/kube-prometheus-weave-net.libsonnet') + { - _config+:: { - namespace: 'monitoring', +local kp = (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/weave-net/weave-net.libsonnet') + { + values+:: { + common+: { + namespace: 'monitoring', + }, }, - prometheusAlerts+:: { - groups: std.map( - function(group) - if group.name == 'weave-net' then - group { - rules: std.map( - function(rule) - if rule.alert == 'WeaveNetFastDPFlowsLow' then - rule { - expr: 'sum(weave_flows) < 20000', - } - else if rule.alert == 'WeaveNetIPAMUnreachable' then - rule { - expr: 'weave_ipam_unreachable_percentage > 25', - } - else - rule - , - group.rules - ), - } - else - group, - super.groups - ), + kubernetesControlPlane+: { + prometheusRuleWeaveNet+: { + spec+: { + groups: std.map( + function(group) + if group.name == 'weave-net' then + group { + rules: std.map( + function(rule) + if rule.alert == 'WeaveNetFastDPFlowsLow' then + rule { + expr: 'sum(weave_flows) < 20000', + } + else if rule.alert == 'WeaveNetIPAMUnreachable' then + rule { + expr: 'weave_ipam_unreachable_percentage > 25', + } + else + rule + , + group.rules + ), + } + else + group, + super.groups + ), + }, + }, }, }; diff --git a/examples/windows.jsonnet b/examples/windows.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..d90fb8dab6d675b9f51f5bbd4d1fda7f56885b8f --- /dev/null +++ b/examples/windows.jsonnet @@ -0,0 +1,33 @@ +local kp = + (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/windows.libsonnet') + + { + values+:: { + common+: { + namespace: 'monitoring', + }, + windowsScrapeConfig+:: { + static_configs: [{ + targets: ['10.240.0.65:5000', '10.240.0.63:5000'], + }], + }, + }, + }; + +{ 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + +{ + ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] + for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) +} + +// serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready +{ 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + +{ 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/go.mod b/go.mod index 1c6c40f1d0d1914f54dc826af02a232396b0a9b3..b0bfa19c54b5c801624f7805d3932f9d887e9347 100644 --- a/go.mod +++ b/go.mod @@ -1,13 +1,9 @@ module github.com/prometheus-operator/kube-prometheus -go 1.13 +go 1.15 require ( github.com/Jeffail/gabs v1.4.0 - github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c - github.com/campoy/embedmd v1.0.0 - github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f - github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.8.0 k8s.io/apimachinery v0.19.3 diff --git a/go.sum b/go.sum index 657f1cd9eeaf217291efe38e029191f7de72a028..1fca7e54925d228858bfabd61695f404026c5fe4 100644 --- a/go.sum +++ b/go.sum @@ -54,10 +54,6 @@ github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24 github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= -github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c h1:hb6WqfcKQZlNx/vahy51SaIvKnoXD5609Nm0PC4msEM= -github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c/go.mod h1:+00lOjYXPgMfxHVPvg9GDtc3BX5Xh5aFpB4gMB8gfMo= -github.com/campoy/embedmd v1.0.0 h1:V4kI2qTJJLf4J29RzI/MAt2c3Bl4dQSYPuflzwFH2hY= -github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8= github.com/casbin/casbin/v2 v2.1.2/go.mod h1:YcPU1XXisHhLzuxH9coDNf2FbKpjGlbCg3n9yuLkIJQ= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -93,8 +89,6 @@ github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7 github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= -github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= @@ -155,8 +149,6 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f h1:mw4KoMG5/DXLPhpKXQRYTEIZFkFo0a1HU2R1HbeYpek= -github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= @@ -216,8 +208,6 @@ github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/u github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 h1:4BKZ6LDqPc2wJDmaKnmYD/vDjUptJtnUpai802MibFc= -github.com/jsonnet-bundler/jsonnet-bundler v0.4.0/go.mod h1:/by7P/OoohkI3q4CgSFqcoFsVY+IaNbzOVDknEsKDeU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= @@ -241,15 +231,8 @@ github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0Q github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mattn/go-colorable v0.0.9 h1:UVL0vNpWh04HeJXV0KLcaT7r06gOH2l4OW6ddYRUIY4= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= -github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA= -github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.6 h1:SrwhHcpV4nWrMGdNcC2kXpMfcBVYGDuTArqyhocJgvA= -github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= -github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM= -github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= @@ -344,8 +327,6 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= -github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= -github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= @@ -462,8 +443,6 @@ golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -474,8 +453,6 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/hack/example-service-monitoring/deploy b/hack/example-service-monitoring/deploy deleted file mode 100755 index 0c7cd7c1a4632240478354f51f3aaead0ff0f990..0000000000000000000000000000000000000000 --- a/hack/example-service-monitoring/deploy +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -kubectl apply -f examples/example-app diff --git a/hack/example-service-monitoring/teardown b/hack/example-service-monitoring/teardown deleted file mode 100755 index 1a49f4625cf5378b091762900c0d791e46a93095..0000000000000000000000000000000000000000 --- a/hack/example-service-monitoring/teardown +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# exit immediately when a command fails -set -e -# only exit with zero if all commands of the pipeline exit successfully -set -o pipefail -# error on unset variables -set -u - -kubectl delete -f examples/example-app diff --git a/hack/jsonnet-docker-image b/hack/jsonnet-docker-image deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet b/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..34f831733350635d11c3efbc0239254328eb548d --- /dev/null +++ b/jsonnet/kube-prometheus/addons/all-namespaces.libsonnet @@ -0,0 +1,22 @@ +{ + prometheus+:: { + clusterRole+: { + rules+: [ + { + apiGroups: [''], + resources: ['services', 'endpoints', 'pods'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['networking.k8s.io'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + ], + }, + // There is no need for specific namespaces RBAC as this addon grants + // all required permissions for every namespace + roleBindingSpecificNamespaces:: null, + roleSpecificNamespaces:: null, + }, +} diff --git a/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet b/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..e266b9131c2055863d8b9bb6b907ef8e6b0eaff8 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/anti-affinity.libsonnet @@ -0,0 +1,99 @@ +{ + values+:: { + alertmanager+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + prometheus+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + blackboxExporter+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + prometheusAdapter+: { + podAntiAffinity: 'soft', + podAntiAffinityTopologyKey: 'kubernetes.io/hostname', + }, + }, + + antiaffinity(labelSelector, namespace, type, topologyKey):: { + local podAffinityTerm = { + namespaces: [namespace], + topologyKey: topologyKey, + labelSelector: { + matchLabels: labelSelector, + }, + }, + + affinity: { + podAntiAffinity: if type == 'soft' then { + preferredDuringSchedulingIgnoredDuringExecution: [{ + weight: 100, + podAffinityTerm: podAffinityTerm, + }], + } else if type == 'hard' then { + requiredDuringSchedulingIgnoredDuringExecution: [ + podAffinityTerm, + ], + } else error 'podAntiAffinity must be either "soft" or "hard"', + }, + }, + + alertmanager+: { + alertmanager+: { + spec+: + $.antiaffinity( + $.alertmanager._config.selectorLabels, + $.values.common.namespace, + $.values.alertmanager.podAntiAffinity, + $.values.alertmanager.podAntiAffinityTopologyKey, + ), + }, + }, + + prometheus+: { + prometheus+: { + spec+: + $.antiaffinity( + $.prometheus._config.selectorLabels, + $.values.common.namespace, + $.values.prometheus.podAntiAffinity, + $.values.prometheus.podAntiAffinityTopologyKey, + ), + }, + }, + + blackboxExporter+: { + deployment+: { + spec+: { + template+: { + spec+: + $.antiaffinity( + $.blackboxExporter._config.selectorLabels, + $.values.common.namespace, + $.values.blackboxExporter.podAntiAffinity, + $.values.blackboxExporter.podAntiAffinityTopologyKey, + ), + }, + }, + }, + }, + + prometheusAdapter+: { + deployment+: { + spec+: { + template+: { + spec+: + $.antiaffinity( + $.prometheusAdapter._config.selectorLabels, + $.values.common.namespace, + $.values.prometheusAdapter.podAntiAffinity, + $.values.prometheusAdapter.podAntiAffinityTopologyKey, + ), + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet b/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..61e7aaa41f612d8c5dfa2a65261f82c72cab0a02 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/aws-vpc-cni.libsonnet @@ -0,0 +1,110 @@ +{ + values+:: { + awsVpcCni: { + // `minimumWarmIPs` should be inferior or equal to `WARM_IP_TARGET`. + // + // References: + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/docs/eni-and-ip-target.md + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/pkg/ipamd/ipamd.go#L61-L71 + minimumWarmIPs: 10, + minimumWarmIPsTime: '10m', + }, + }, + kubernetesControlPlane+: { + serviceAwsVpcCni: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'aws-node', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'aws-node' }, + }, + spec: { + ports: [ + { + name: 'cni-metrics-port', + port: 61678, + targetPort: 61678, + }, + ], + selector: { 'app.kubernetes.io/name': 'aws-node' }, + clusterIP: 'None', + }, + }, + + serviceMonitorAwsVpcCni: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'aws-node', + namespace: $.values.common.namespace, + labels: { + 'app.kubernetes.io/name': 'aws-node', + }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + }, + ], + }, + }, + + prometheusRuleAwsVpcCni: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: $.prometheus._config.commonLabels + $.prometheus._config.mixin.ruleLabels, + name: 'aws-vpc-cni-rules', + namespace: $.prometheus._config.namespace, + }, + spec: { + groups: [ + { + name: 'aws-vpc-cni.rules', + rules: [ + { + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $.values.awsVpcCni.minimumWarmIPs, + labels: { + severity: 'critical', + }, + annotations: { + summary: 'AWS VPC CNI has a low warm IP pool', + description: ||| + Instance {{ $labels.instance }} has only {{ $value }} warm IPs which is lower than set threshold of %s. + It could mean the current subnet is out of available IP addresses or the CNI is unable to request them from the EC2 API. + ||| % $.values.awsVpcCni.minimumWarmIPs, + }, + 'for': $.values.awsVpcCni.minimumWarmIPsTime, + alert: 'AwsVpcCniWarmIPsLow', + }, + ], + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/config-mixins.libsonnet b/jsonnet/kube-prometheus/addons/config-mixins.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..ca2f4468e1b0292363d8a09198c1d1119fdcfb6d --- /dev/null +++ b/jsonnet/kube-prometheus/addons/config-mixins.libsonnet @@ -0,0 +1,36 @@ +local imageName(image) = + local parts = std.split(image, '/'); + local len = std.length(parts); + if len == 3 then + // registry.com/org/image + parts[2] + else if len == 2 then + // org/image + parts[1] + else if len == 1 then + // image, ie. busybox + parts[0] + else + error 'unknown image format: ' + image; + + +// withImageRepository is a mixin that replaces all images prefixes by repository. eg. +// quay.io/coreos/addon-resizer -> $repository/addon-resizer +// grafana/grafana -> grafana $repository/grafana +local withImageRepository(repository) = { + local oldRepos = super.values.common.images, + local substituteRepository(image, repository) = + if repository == null then image else repository + '/' + imageName(image), + values+:: { + common+:: { + images:: { + [field]: substituteRepository(oldRepos[field], repository) + for field in std.objectFields(oldRepos) + }, + }, + }, +}; + +{ + withImageRepository:: withImageRepository, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet b/jsonnet/kube-prometheus/addons/custom-metrics.libsonnet similarity index 94% rename from jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet rename to jsonnet/kube-prometheus/addons/custom-metrics.libsonnet index b0240ec31c8dfcacb9d1930ff59a1a8b1531383d..06e9c5a0b22a53a4c56994fa40dd315281f899aa 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-custom-metrics.libsonnet +++ b/jsonnet/kube-prometheus/addons/custom-metrics.libsonnet @@ -2,9 +2,9 @@ // For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links { - _config+:: { - prometheusAdapter+:: { - namespace: $._config.namespace, + values+:: { + prometheusAdapter+: { + namespace: $.values.common.namespace, // Rules for custom-metrics config+:: { rules+: [ @@ -78,7 +78,7 @@ }, }, - prometheusAdapter+:: { + prometheusAdapter+: { customMetricsApiService: { apiVersion: 'apiregistration.k8s.io/v1', kind: 'APIService', @@ -88,7 +88,7 @@ spec: { service: { name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }, group: 'custom.metrics.k8s.io', version: 'v1beta1', @@ -106,7 +106,7 @@ spec: { service: { name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }, group: 'custom.metrics.k8s.io', version: 'v1beta2', @@ -133,7 +133,6 @@ metadata: { name: 'custom-metrics-server-resources', }, - roleRef: { apiGroup: 'rbac.authorization.k8s.io', kind: 'ClusterRole', @@ -142,7 +141,7 @@ subjects: [{ kind: 'ServiceAccount', name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, + namespace: $.values.prometheusAdapter.namespace, }], }, customMetricsClusterRoleBindingHPA: { @@ -151,7 +150,6 @@ metadata: { name: 'hpa-controller-custom-metrics', }, - roleRef: { apiGroup: 'rbac.authorization.k8s.io', kind: 'ClusterRole', diff --git a/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet b/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..374b8601c9ef6c7b4428fd1788d4a539ce80f18c --- /dev/null +++ b/jsonnet/kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet @@ -0,0 +1,139 @@ +[ + // Drop all kubelet metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)', + action: 'drop', + }, + // Drop all scheduler metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)', + action: 'drop', + }, + // Drop all apiserver metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)', + action: 'drop', + }, + // Drop all docker metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)', + action: 'drop', + }, + // Drop all reflector metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)', + action: 'drop', + }, + // Drop all etcd metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)', + action: 'drop', + }, + // Drop all transformation metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: 'transformation_(transformation_latencies_microseconds|failures_total)', + action: 'drop', + }, + // Drop all other metrics which are deprecated in kubernetes. + { + sourceLabels: ['__name__'], + regex: '(' + std.join('|', + [ + 'admission_quota_controller_adds', + 'admission_quota_controller_depth', + 'admission_quota_controller_longest_running_processor_microseconds', + 'admission_quota_controller_queue_latency', + 'admission_quota_controller_unfinished_work_seconds', + 'admission_quota_controller_work_duration', + 'APIServiceOpenAPIAggregationControllerQueue1_adds', + 'APIServiceOpenAPIAggregationControllerQueue1_depth', + 'APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds', + 'APIServiceOpenAPIAggregationControllerQueue1_queue_latency', + 'APIServiceOpenAPIAggregationControllerQueue1_retries', + 'APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds', + 'APIServiceOpenAPIAggregationControllerQueue1_work_duration', + 'APIServiceRegistrationController_adds', + 'APIServiceRegistrationController_depth', + 'APIServiceRegistrationController_longest_running_processor_microseconds', + 'APIServiceRegistrationController_queue_latency', + 'APIServiceRegistrationController_retries', + 'APIServiceRegistrationController_unfinished_work_seconds', + 'APIServiceRegistrationController_work_duration', + 'autoregister_adds', + 'autoregister_depth', + 'autoregister_longest_running_processor_microseconds', + 'autoregister_queue_latency', + 'autoregister_retries', + 'autoregister_unfinished_work_seconds', + 'autoregister_work_duration', + 'AvailableConditionController_adds', + 'AvailableConditionController_depth', + 'AvailableConditionController_longest_running_processor_microseconds', + 'AvailableConditionController_queue_latency', + 'AvailableConditionController_retries', + 'AvailableConditionController_unfinished_work_seconds', + 'AvailableConditionController_work_duration', + 'crd_autoregistration_controller_adds', + 'crd_autoregistration_controller_depth', + 'crd_autoregistration_controller_longest_running_processor_microseconds', + 'crd_autoregistration_controller_queue_latency', + 'crd_autoregistration_controller_retries', + 'crd_autoregistration_controller_unfinished_work_seconds', + 'crd_autoregistration_controller_work_duration', + 'crdEstablishing_adds', + 'crdEstablishing_depth', + 'crdEstablishing_longest_running_processor_microseconds', + 'crdEstablishing_queue_latency', + 'crdEstablishing_retries', + 'crdEstablishing_unfinished_work_seconds', + 'crdEstablishing_work_duration', + 'crd_finalizer_adds', + 'crd_finalizer_depth', + 'crd_finalizer_longest_running_processor_microseconds', + 'crd_finalizer_queue_latency', + 'crd_finalizer_retries', + 'crd_finalizer_unfinished_work_seconds', + 'crd_finalizer_work_duration', + 'crd_naming_condition_controller_adds', + 'crd_naming_condition_controller_depth', + 'crd_naming_condition_controller_longest_running_processor_microseconds', + 'crd_naming_condition_controller_queue_latency', + 'crd_naming_condition_controller_retries', + 'crd_naming_condition_controller_unfinished_work_seconds', + 'crd_naming_condition_controller_work_duration', + 'crd_openapi_controller_adds', + 'crd_openapi_controller_depth', + 'crd_openapi_controller_longest_running_processor_microseconds', + 'crd_openapi_controller_queue_latency', + 'crd_openapi_controller_retries', + 'crd_openapi_controller_unfinished_work_seconds', + 'crd_openapi_controller_work_duration', + 'DiscoveryController_adds', + 'DiscoveryController_depth', + 'DiscoveryController_longest_running_processor_microseconds', + 'DiscoveryController_queue_latency', + 'DiscoveryController_retries', + 'DiscoveryController_unfinished_work_seconds', + 'DiscoveryController_work_duration', + 'kubeproxy_sync_proxy_rules_latency_microseconds', + 'non_structural_schema_condition_controller_adds', + 'non_structural_schema_condition_controller_depth', + 'non_structural_schema_condition_controller_longest_running_processor_microseconds', + 'non_structural_schema_condition_controller_queue_latency', + 'non_structural_schema_condition_controller_retries', + 'non_structural_schema_condition_controller_unfinished_work_seconds', + 'non_structural_schema_condition_controller_work_duration', + 'rest_client_request_latency_seconds', + 'storage_operation_errors_total', + 'storage_operation_status_count', + ]) + ')', + action: 'drop', + }, +] diff --git a/jsonnet/kube-prometheus/addons/external-metrics.libsonnet b/jsonnet/kube-prometheus/addons/external-metrics.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..928d29e7248293dd06859cfb769b192ed6adc1ee --- /dev/null +++ b/jsonnet/kube-prometheus/addons/external-metrics.libsonnet @@ -0,0 +1,95 @@ +// External metrics API allows the HPA v2 to scale based on metrics coming from outside of Kubernetes cluster +// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links + +{ + values+:: { + prometheusAdapter+: { + namespace: $.values.common.namespace, + // Rules for external-metrics + config+:: { + externalRules+: [ + // { + // seriesQuery: '{__name__=~"^.*_queue$",namespace!=""}', + // seriesFilters: [], + // resources: { + // overrides: { + // namespace: { resource: 'namespace' } + // }, + // }, + // name: { matches: '^.*_queue$', as: '$0' }, + // metricsQuery: 'max(<<.Series>>{<<.LabelMatchers>>})', + // }, + ], + }, + }, + }, + + prometheusAdapter+: { + externalMetricsApiService: { + apiVersion: 'apiregistration.k8s.io/v1', + kind: 'APIService', + metadata: { + name: 'v1beta1.external.metrics.k8s.io', + }, + spec: { + service: { + name: $.prometheusAdapter.service.metadata.name, + namespace: $.values.prometheusAdapter.namespace, + }, + group: 'external.metrics.k8s.io', + version: 'v1beta1', + insecureSkipTLSVerify: true, + groupPriorityMinimum: 100, + versionPriority: 100, + }, + }, + externalMetricsClusterRoleServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'external-metrics-server-resources', + }, + rules: [{ + apiGroups: ['external.metrics.k8s.io'], + resources: ['*'], + verbs: ['*'], + }], + }, + externalMetricsClusterRoleBindingServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'external-metrics-server-resources', + }, + + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'external-metrics-server-resources', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.prometheusAdapter.serviceAccount.metadata.name, + namespace: $.values.prometheusAdapter.namespace, + }], + }, + externalMetricsClusterRoleBindingHPA: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'hpa-controller-external-metrics', + }, + + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'external-metrics-server-resources', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'horizontal-pod-autoscaler', + namespace: 'kube-system', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet similarity index 98% rename from jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet rename to jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet index 73d0b9d741e9413425d56ee643f754af6039279d..ab6f29434e1eb460b6d242519fc694f5e1f491fe 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet +++ b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet @@ -1,5 +1,5 @@ { - prometheus+:: { + prometheus+: { serviceMonitorKubelet+: { spec+: { diff --git a/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet b/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..fa2caf0e8e87cf78f34f79fef0d404602f59a741 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/ksm-autoscaler.libsonnet @@ -0,0 +1,136 @@ +{ + values+:: { + clusterVerticalAutoscaler: { + version: '0.8.1', + image: 'gcr.io/google_containers/cpvpa-amd64:v0.8.1', + baseCPU: '1m', + stepCPU: '1m', + baseMemory: '1Mi', + stepMemory: '2Mi', + }, + }, + ksmAutoscaler+: { + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { name: 'ksm-autoscaler' }, + rules: [{ + apiGroups: [''], + resources: ['nodes'], + verbs: ['list', 'watch'], + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { name: 'ksm-autoscaler' }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'ksm-autoscaler', + }, + subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $.values.common.namespace }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'ksm-autoscaler', + }, + subjects: [{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }], + }, + + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + rules: [ + { + apiGroups: ['extensions'], + resources: ['deployments'], + verbs: ['patch'], + resourceNames: ['kube-state-metrics'], + }, + { + apiGroups: ['apps'], + resources: ['deployments'], + verbs: ['patch'], + resourceNames: ['kube-state-metrics'], + }, + ], + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + }, + }, + + deployment: + local podLabels = { app: 'ksm-autoscaler' }; + local c = { + name: 'ksm-autoscaler', + image: $.values.clusterVerticalAutoscaler.image, + args: [ + '/cpvpa', + '--target=deployment/kube-state-metrics', + '--namespace=' + $.values.common.namespace, + '--logtostderr=true', + '--poll-period-seconds=10', + '--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU + + '","step":"' + $.values.clusterVerticalAutoscaler.stepCPU + + '","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory + + '","step":"' + $.values.clusterVerticalAutoscaler.stepMemory + + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $.values.clusterVerticalAutoscaler.baseCPU + + '","step":"' + $.values.clusterVerticalAutoscaler.stepCPU + + '","nodesPerStep":1},"memory":{"base":"' + $.values.clusterVerticalAutoscaler.baseMemory + + '","step":"' + $.values.clusterVerticalAutoscaler.stepMemory + '","nodesPerStep":1}}}}', + ], + resources: { + requests: { cpu: '20m', memory: '10Mi' }, + }, + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: 'ksm-autoscaler', + namespace: $.values.common.namespace, + labels: podLabels, + }, + spec: { + replicas: 1, + selector: { matchLabels: podLabels }, + template: { + metadata: { + labels: podLabels, + }, + spec: { + containers: [c], + serviceAccount: 'ksm-autoscaler', + nodeSelector: { 'kubernetes.io/os': 'linux' }, + securityContext: { + runAsNonRoot: true, + runAsUser: 65534, + }, + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet b/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..cf06ebdc66b6a8a027f5820c58075915d541ee55 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/ksm-lite.libsonnet @@ -0,0 +1,39 @@ +local addArgs(args, name, containers) = std.map( + function(c) if c.name == name then + c { + args+: args, + } + else c, + containers, +); + +{ + kubeStateMetrics+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: addArgs( + [||| + --metric-denylist= + kube_.+_created, + kube_.+_metadata_resource_version, + kube_replicaset_metadata_generation, + kube_replicaset_status_observed_generation, + kube_pod_restart_policy, + kube_pod_init_container_status_terminated, + kube_pod_init_container_status_running, + kube_pod_container_status_terminated, + kube_pod_container_status_running, + kube_pod_completion_time, + kube_pod_status_scheduled + |||], + 'kube-state-metrics', + super.containers + ), + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet b/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..79c464a67eb32335d49e969d32f238918256d6f0 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/managed-cluster.libsonnet @@ -0,0 +1,20 @@ +// On managed Kubernetes clusters some of the control plane components are not exposed to customers. +// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'main.libsonnet' defaults + +{ + kubernetesControlPlane+: { + serviceMonitorKubeControllerManager:: null, + serviceMonitorKubeScheduler:: null, + } + { + prometheusRule+: { + spec+: { + local g = super.groups, + groups: [ + h + for h in g + if !std.setMember(h.name, ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler']) + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet b/jsonnet/kube-prometheus/addons/node-ports.libsonnet similarity index 95% rename from jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet rename to jsonnet/kube-prometheus/addons/node-ports.libsonnet index b936901111317466acc54e19b73dee337a1771f8..405a70cee155950e424d3c3c72fe12c2c0dd4ab6 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-node-ports.libsonnet +++ b/jsonnet/kube-prometheus/addons/node-ports.libsonnet @@ -1,6 +1,6 @@ local patch(ports) = { spec+: { - ports+: ports, + ports: ports, type: 'NodePort', }, }; diff --git a/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet b/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..61439b598225ea69d593a1d427dbffc3110cb243 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/podsecuritypolicies.libsonnet @@ -0,0 +1,264 @@ +local restrictedPodSecurityPolicy = { + apiVersion: 'policy/v1beta1', + kind: 'PodSecurityPolicy', + metadata: { + name: 'kube-prometheus-restricted', + }, + spec: { + privileged: false, + // Required to prevent escalations to root. + allowPrivilegeEscalation: false, + // This is redundant with non-root + disallow privilege escalation, + // but we can provide it for defense in depth. + requiredDropCapabilities: ['ALL'], + // Allow core volume types. + volumes: [ + 'configMap', + 'emptyDir', + 'secret', + // Assume that persistentVolumes set up by the cluster admin are safe to use. + 'persistentVolumeClaim', + ], + hostNetwork: false, + hostIPC: false, + hostPID: false, + runAsUser: { + // Require the container to run without root privileges. + rule: 'MustRunAsNonRoot', + }, + seLinux: { + // This policy assumes the nodes are using AppArmor rather than SELinux. + rule: 'RunAsAny', + }, + supplementalGroups: { + rule: 'MustRunAs', + ranges: [{ + // Forbid adding the root group. + min: 1, + max: 65535, + }], + }, + fsGroup: { + rule: 'MustRunAs', + ranges: [{ + // Forbid adding the root group. + min: 1, + max: 65535, + }], + }, + readOnlyRootFilesystem: false, + }, +}; + +{ + restrictedPodSecurityPolicy: restrictedPodSecurityPolicy, + + alertmanager+: { + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'alertmanager-' + $.values.alertmanager.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'alertmanager-' + $.values.alertmanager.name, + namespace: $.values.alertmanager.namespace, + }], + }, + }, + + blackboxExporter+: { + clusterRole+: { + rules+: [ + { + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['blackbox-exporter-psp'], + }, + ], + }, + + podSecurityPolicy: + local blackboxExporterPspPrivileged = + if $.blackboxExporter._config.privileged then + { + metadata+: { + name: 'blackbox-exporter-psp', + }, + spec+: { + privileged: true, + allowedCapabilities: ['NET_RAW'], + runAsUser: { + rule: 'RunAsAny', + }, + }, + } + else + { + metadata+: { + name: 'blackbox-exporter-psp', + }, + }; + + restrictedPodSecurityPolicy + blackboxExporterPspPrivileged, + }, + + grafana+: { + role: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + }, + rules: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + + roleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'grafana', + namespace: $.values.common.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'grafana', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.grafana.serviceAccount.metadata.name, + namespace: $.grafana.serviceAccount.metadata.namespace, + }], + }, + }, + + kubeStateMetrics+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['kube-state-metrics-psp'], + }], + }, + + podSecurityPolicy: restrictedPodSecurityPolicy { + metadata+: { + name: 'kube-state-metrics-psp', + }, + spec+: { + runAsUser: { + rule: 'RunAsAny', + }, + }, + }, + }, + + nodeExporter+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: ['node-exporter-psp'], + }], + }, + + podSecurityPolicy: restrictedPodSecurityPolicy { + metadata+: { + name: 'node-exporter-psp', + }, + spec+: { + allowedHostPaths+: [ + { + pathPrefix: '/proc', + readOnly: true, + }, + { + pathPrefix: '/sys', + readOnly: true, + }, + { + pathPrefix: '/', + readOnly: true, + }, + ], + hostNetwork: true, + hostPID: true, + hostPorts: [ + { + max: $.nodeExporter._config.port, + min: $.nodeExporter._config.port, + }, + ], + readOnlyRootFilesystem: true, + volumes+: [ + 'hostPath', + ], + }, + }, + }, + + prometheusAdapter+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, + + prometheusOperator+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, + + prometheus+: { + clusterRole+: { + rules+: [{ + apiGroups: ['policy'], + resources: ['podsecuritypolicies'], + verbs: ['use'], + resourceNames: [restrictedPodSecurityPolicy.metadata.name], + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet b/jsonnet/kube-prometheus/addons/static-etcd.libsonnet similarity index 55% rename from jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet rename to jsonnet/kube-prometheus/addons/static-etcd.libsonnet index 9bc77385f4b6d860abfdd40905e0e05d793c4803..4f11a07629cdfafe908a1049c7bb82ab44209193 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-static-etcd.libsonnet +++ b/jsonnet/kube-prometheus/addons/static-etcd.libsonnet @@ -1,7 +1,5 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -(import 'github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet') + { - _config+:: { +(import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') + { + values+:: { etcd: { ips: [], clientCA: null, @@ -11,14 +9,14 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; insecureSkipVerify: null, }, }, - prometheus+:: { + prometheus+: { serviceEtcd: { apiVersion: 'v1', kind: 'Service', metadata: { name: 'etcd', namespace: 'kube-system', - labels: { 'k8s-app': 'etcd' }, + labels: { 'app.kubernetes.io/name': 'etcd' }, }, spec: { ports: [ @@ -28,23 +26,23 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; }, }, endpointsEtcd: { - apiVersion: 'v1', - kind: 'Endpoints', - metadata: { - name: 'etcd', - namespace: 'kube-system', - labels: { 'k8s-app': 'etcd' }, - }, - subsets: [{ - addresses: [ - { ip: etcdIP } - for etcdIP in $._config.etcd.ips - ], - ports: [ - { name: 'metrics', port: 2379, protocol: 'TCP' }, - ], - }], + apiVersion: 'v1', + kind: 'Endpoints', + metadata: { + name: 'etcd', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'etcd' }, }, + subsets: [{ + addresses: [ + { ip: etcdIP } + for etcdIP in $.values.etcd.ips + ], + ports: [ + { name: 'metrics', port: 2379, protocol: 'TCP' }, + ], + }], + }, serviceMonitorEtcd: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -52,11 +50,11 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; name: 'etcd', namespace: 'kube-system', labels: { - 'k8s-app': 'etcd', + 'app.kubernetes.io/name': 'etcd', }, }, spec: { - jobLabel: 'k8s-app', + jobLabel: 'app.kubernetes.io/name', endpoints: [ { port: 'metrics', @@ -67,14 +65,14 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt', keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key', certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt', - [if $._config.etcd.serverName != null then 'serverName']: $._config.etcd.serverName, - [if $._config.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $._config.etcd.insecureSkipVerify, + [if $.values.etcd.serverName != null then 'serverName']: $.values.etcd.serverName, + [if $.values.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $.values.etcd.insecureSkipVerify, }, }, ], selector: { matchLabels: { - 'k8s-app': 'etcd', + 'app.kubernetes.io/name': 'etcd', }, }, }, @@ -86,20 +84,19 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; type: 'Opaque', metadata: { name: 'kube-etcd-client-certs', - namespace: $._config.namespace, + namespace: $.values.common.namespace, }, data: { - 'etcd-client-ca.crt': std.base64($._config.etcd.clientCA), - 'etcd-client.key': std.base64($._config.etcd.clientKey), - 'etcd-client.crt': std.base64($._config.etcd.clientCert), + 'etcd-client-ca.crt': std.base64($.values.etcd.clientCA), + 'etcd-client.key': std.base64($.values.etcd.clientKey), + 'etcd-client.crt': std.base64($.values.etcd.clientCert), }, }, - prometheus+: - { - // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec - spec+: { - secrets+: [$.prometheus.secretEtcdCerts.metadata.name], - }, + prometheus+: { + // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec + spec+: { + secrets+: [$.prometheus.secretEtcdCerts.metadata.name], }, + }, }, } diff --git a/jsonnet/kube-prometheus/addons/strip-limits.libsonnet b/jsonnet/kube-prometheus/addons/strip-limits.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..6faec94418c1fb7a309078c4be4b80a3b432a8eb --- /dev/null +++ b/jsonnet/kube-prometheus/addons/strip-limits.libsonnet @@ -0,0 +1,48 @@ +// Strips spec.containers[].limits for certain containers +// https://github.com/prometheus-operator/kube-prometheus/issues/72 + +{ + local noLimit(c) = + //if std.objectHas(c, 'resources') && c.name != 'kube-state-metrics' + if c.name != 'kube-state-metrics' + then c { resources+: { limits: {} } } + else c, + + nodeExporter+: { + daemonset+: { + spec+: { + template+: { + spec+: { + containers: std.map(noLimit, super.containers), + }, + }, + }, + }, + }, + kubeStateMetrics+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: std.map(noLimit, super.containers), + }, + }, + }, + }, + }, + prometheusOperator+: { + deployment+: { + spec+: { + template+: { + spec+: { + local addArgs(c) = + if c.name == 'prometheus-operator' + then c { args+: ['--config-reloader-cpu-limit=0', '--config-reloader-memory-limit=0'] } + else c, + containers: std.map(addArgs, super.containers), + }, + }, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet b/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..c0ca940a9ca7260f99c0e2cacbbdec3ce1d5b71a --- /dev/null +++ b/jsonnet/kube-prometheus/addons/weave-net/alerts.libsonnet @@ -0,0 +1,134 @@ +[ + { + alert: 'WeaveNetIPAMSplitBrain', + expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.', + description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMUnreachable', + expr: 'weave_ipam_unreachable_percentage > 25', + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMPendingAllocates', + expr: 'sum(weave_ipam_pending_allocates) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of pending allocates is above the threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetIPAMPendingClaims', + expr: 'sum(weave_ipam_pending_claims) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of pending claims is above the threshold.', + description: 'actionable: Please find the problem and fix it.', + }, + }, + { + alert: 'WeaveNetFastDPFlowsLow', + expr: 'sum(weave_flows) < 15000', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Number of FastDP flows is below the threshold.', + description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.', + }, + }, + { + alert: 'WeaveNetFastDPFlowsOff', + expr: 'sum(weave_flows == bool 0) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'FastDP flows is zero.', + description: 'actionable: Please find the reason for FastDP flows to be off and fix it.', + }, + }, + { + alert: 'WeaveNetHighConnectionTerminationRate', + expr: 'rate(weave_connection_terminations_total[5m]) > 0.1', + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are getting terminated.', + description: 'actionable: Please find the reason for the high connection termination rate and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsConnecting', + expr: 'sum(weave_connections{state="connecting"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in connecting state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsRetying', + expr: 'sum(weave_connections{state="retrying"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in retrying state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsPending', + expr: 'sum(weave_connections{state="pending"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in pending state.', + description: 'actionable: Please find the reason for this and fix it.', + }, + }, + { + alert: 'WeaveNetConnectionsFailed', + expr: 'sum(weave_connections{state="failed"}) > 0', + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'A lot of connections are in failed state.', + description: 'actionable: Please find the reason and fix it.', + }, + }, +] diff --git a/jsonnet/kube-prometheus/grafana-weave-net-cluster.json b/jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net-cluster.json similarity index 100% rename from jsonnet/kube-prometheus/grafana-weave-net-cluster.json rename to jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net-cluster.json diff --git a/jsonnet/kube-prometheus/grafana-weave-net.json b/jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net.json similarity index 100% rename from jsonnet/kube-prometheus/grafana-weave-net.json rename to jsonnet/kube-prometheus/addons/weave-net/grafana-weave-net.json diff --git a/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet b/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..d5cc9ead70a19b42b896d9a7a6ea2182a63cab42 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/weave-net/weave-net.libsonnet @@ -0,0 +1,73 @@ +{ + prometheus+: { + local p = self, + serviceWeaveNet: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'weave-net', + namespace: 'kube-system', + labels: { 'app.kubernetes.io/name': 'weave-net' }, + }, + spec: { + ports: [ + { name: 'weave-net-metrics', targetPort: 6782, port: 6782 }, + ], + selector: { name: 'weave-net' }, + clusterIP: 'None', + }, + }, + serviceMonitorWeaveNet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'weave-net', + labels: { + 'app.kubernetes.io/name': 'weave-net', + }, + namespace: 'monitoring', + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [ + { + port: 'weave-net-metrics', + path: '/metrics', + interval: '15s', + }, + ], + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'weave-net', + }, + }, + }, + }, + prometheusRuleWeaveNet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.mixin.ruleLabels, + name: 'weave-net-rules', + namespace: p._config.namespace, + }, + spec: { + groups: [{ + name: 'weave-net', + rules: (import './alerts.libsonnet'), + }], + }, + }, + mixin+:: { + grafanaDashboards+:: { + 'weave-net.json': (import './grafana-weave-net.json'), + 'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'), + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/addons/windows.libsonnet b/jsonnet/kube-prometheus/addons/windows.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..d97e8ffeb2523800900e3bbbbe68d4f73932b647 --- /dev/null +++ b/jsonnet/kube-prometheus/addons/windows.libsonnet @@ -0,0 +1,70 @@ +local windowsdashboards = import 'github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet'; +local windowsrules = import 'github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet'; + +{ + values+:: { + // This needs to follow prometheus naming convention and not prometheus-operator one + windowsScrapeConfig+:: { + job_name: 'windows-exporter', + static_configs: [ + { + targets: [error 'must provide targets array'], + }, + ], + relabel_configs: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + source_labels: [ + '__meta_kubernetes_endpoint_address_target_name', + ], + target_label: 'instance', + }, + ], + }, + + grafana+:: { + dashboards+:: windowsdashboards { + _config: $.kubernetesControlPlane.mixin._config { + wmiExporterSelector: 'job="' + $.values.windowsScrapeConfig.job_name + '"', + }, + }.grafanaDashboards, + }, + }, + kubernetesControlPlane+: { + mixin+:: { + prometheusRules+:: { + groups+: windowsrules { + _config: $.kubernetesControlPlane.mixin._config { + wmiExporterSelector: 'job="' + $.values.windowsScrapeConfig.job_name + '"', + }, + }.prometheusRules.groups, + }, + }, + }, + prometheus+: { + local p = self, + local sc = [$.values.windowsScrapeConfig], + prometheus+: { + spec+: { + additionalScrapeConfigs: { + name: 'prometheus-' + p._config.name + '-additional-scrape-config', + key: 'prometheus-additional.yaml', + }, + }, + + }, + windowsConfig: { + apiVersion: 'v1', + kind: 'Secret', + metadata: { + name: 'prometheus-' + p._config.name + '-additional-scrape-config', + namespace: p._config.namespace, + }, + stringData: { + 'prometheus-additional.yaml': std.manifestYamlDoc(sc), + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet b/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet deleted file mode 100644 index 677c788974c1ef50f5dd91ec448746ddc1466a0d..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/alertmanager/alertmanager.libsonnet +++ /dev/null @@ -1,149 +0,0 @@ -{ - _config+:: { - namespace: 'default', - - versions+:: { - alertmanager: 'v0.21.0', - }, - - imageRepos+:: { - alertmanager: 'quay.io/prometheus/alertmanager', - }, - - alertmanager+:: { - name: 'main', - config: { - global: { - resolve_timeout: '5m', - }, - inhibit_rules: [{ - source_match: { - severity: 'critical', - }, - target_match_re: { - severity: 'warning|info', - }, - equal: ['namespace', 'alertname'], - }, { - source_match: { - severity: 'warning', - }, - target_match_re: { - severity: 'info', - }, - equal: ['namespace', 'alertname'], - }], - route: { - group_by: ['namespace'], - group_wait: '30s', - group_interval: '5m', - repeat_interval: '12h', - receiver: 'Default', - routes: [ - { receiver: 'Watchdog', match: { alertname: 'Watchdog' } }, - { receiver: 'Critical', match: { severity: 'critical' } }, - ], - }, - receivers: [ - { name: 'Default' }, - { name: 'Watchdog' }, - { name: 'Critical' }, - ], - }, - replicas: 3, - }, - }, - - alertmanager+:: { - secret: { - apiVersion: 'v1', - kind: 'Secret', - type: 'Opaque', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - }, - stringData: { - 'alertmanager.yaml': if std.type($._config.alertmanager.config) == 'object' - then - std.manifestYamlDoc($._config.alertmanager.config) - else - $._config.alertmanager.config, - }, - }, - - serviceAccount: { - apiVersion: 'v1', - kind: 'ServiceAccount', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - }, - }, - - service: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'alertmanager-' + $._config.alertmanager.name, - namespace: $._config.namespace, - labels: { alertmanager: $._config.alertmanager.name }, - }, - spec: { - ports: [ - { name: 'web', targetPort: 'web', port: 9093 }, - ], - selector: { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, - sessionAffinity: 'ClientIP', - }, - }, - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'alertmanager', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'alertmanager', - }, - }, - spec: { - selector: { - matchLabels: { - alertmanager: $._config.alertmanager.name, - }, - }, - endpoints: [ - { port: 'web', interval: '30s' }, - ], - }, - }, - - alertmanager: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'Alertmanager', - metadata: { - name: $._config.alertmanager.name, - namespace: $._config.namespace, - labels: { - alertmanager: $._config.alertmanager.name, - }, - }, - spec: { - replicas: $._config.alertmanager.replicas, - version: $._config.versions.alertmanager, - image: $._config.imageRepos.alertmanager + ':' + $._config.versions.alertmanager, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - serviceAccountName: 'alertmanager-' + $._config.alertmanager.name, - securityContext: { - runAsUser: 1000, - runAsNonRoot: true, - fsGroup: 2000, - }, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet deleted file mode 100644 index bcabf4d9fce152864c3f43be051647c63341b117..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ /dev/null @@ -1,57 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'alertmanager.rules', - rules: [ - { - alert: 'AlertmanagerConfigInconsistent', - annotations: { - message: ||| - The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. - {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} - Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" - {{ end }} - |||, - }, - expr: ||| - count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - }, - { - alert: 'AlertmanagerFailedReload', - annotations: { - message: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", - }, - expr: ||| - alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - }, - { - alert: 'AlertmanagerMembersInconsistent', - annotations: { - message: 'Alertmanager has not found all other members of the cluster.', - }, - expr: ||| - alertmanager_cluster_members{%(alertmanagerSelector)s} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s}) - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - }, - ], - }, - ], - }, -} diff --git a/jsonnet/kube-prometheus/alerts/tests.yaml b/jsonnet/kube-prometheus/alerts/tests.yaml deleted file mode 100644 index 532bb895561757e0024a66ef045dbce11f90afc8..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/alerts/tests.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# TODO(metalmatze): This file is temporarily saved here for later reference -# until we find out how to integrate the tests into our jsonnet stack. - -rule_files: - - rules.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - interval: 1m - input_series: - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}' - values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - - series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}' - values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2' - alert_rule_test: - - eval_time: 5m - alertname: AlertmanagerMembersInconsistent - - eval_time: 11m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 17m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - eval_time: 23m - alertname: AlertmanagerMembersInconsistent - exp_alerts: - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.0 - namespace: monitoring - pod: alertmanager-main-0 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.1 - namespace: monitoring - pod: alertmanager-main-1 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' - - exp_labels: - service: 'alertmanager-main' - severity: critical - job: 'alertmanager-main' - instance: 10.10.10.2 - namespace: monitoring - pod: alertmanager-main-2 - exp_annotations: - message: 'Alertmanager has not found all other members of the cluster.' diff --git a/jsonnet/kube-prometheus/components/alertmanager.libsonnet b/jsonnet/kube-prometheus/components/alertmanager.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..bda39ec4c7747ed516b7ea3abb9380b6f4c8f683 --- /dev/null +++ b/jsonnet/kube-prometheus/components/alertmanager.libsonnet @@ -0,0 +1,213 @@ +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + image: error 'must provide image', + version: error 'must provide version', + resources: { + limits: { cpu: '100m', memory: '100Mi' }, + requests: { cpu: '4m', memory: '100Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': 'alertmanager', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'alert-router', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + name: error 'must provide name', + config: { + global: { + resolve_timeout: '5m', + }, + inhibit_rules: [{ + source_match: { + severity: 'critical', + }, + target_match_re: { + severity: 'warning|info', + }, + equal: ['namespace', 'alertname'], + }, { + source_match: { + severity: 'warning', + }, + target_match_re: { + severity: 'info', + }, + equal: ['namespace', 'alertname'], + }], + route: { + group_by: ['namespace'], + group_wait: '30s', + group_interval: '5m', + repeat_interval: '12h', + receiver: 'Default', + routes: [ + { receiver: 'Watchdog', match: { alertname: 'Watchdog' } }, + { receiver: 'Critical', match: { severity: 'critical' } }, + ], + }, + receivers: [ + { name: 'Default' }, + { name: 'Watchdog' }, + { name: 'Critical' }, + ], + }, + replicas: 3, + mixin: { + ruleLabels: {}, + _config: { + alertmanagerName: '{{ $labels.namespace }}/{{ $labels.pod}}', + alertmanagerClusterLabels: 'namespace,service', + alertmanagerSelector: 'job="alertmanager-' + defaults.name + '",namespace="' + defaults.namespace + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/alertmanager/%s', + }, + }, +}; + + +function(params) { + local am = self, + _config:: defaults + params, + // Safety check + assert std.isObject(am._config.resources), + assert std.isObject(am._config.mixin._config), + + mixin:: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: am._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: am._config.commonLabels + am._config.mixin.ruleLabels, + name: 'alertmanager-' + am._config.name + '-rules', + namespace: am._config.namespace, + }, + spec: { + local r = if std.objectHasAll(am.mixin, 'prometheusRules') then am.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(am.mixin, 'prometheusAlerts') then am.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + secret: { + apiVersion: 'v1', + kind: 'Secret', + type: 'Opaque', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + stringData: { + 'alertmanager.yaml': if std.type(am._config.config) == 'object' + then + std.manifestYamlDoc(am._config.config) + else + am._config.config, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: { alertmanager: am._config.name } + am._config.commonLabels, + }, + spec: { + ports: [ + { name: 'web', targetPort: 'web', port: 9093 }, + ], + selector: { + app: 'alertmanager', + alertmanager: am._config.name, + } + am._config.selectorLabels, + sessionAffinity: 'ClientIP', + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'alertmanager', + namespace: am._config.namespace, + labels: am._config.commonLabels, + }, + spec: { + selector: { + matchLabels: { + alertmanager: am._config.name, + } + am._config.selectorLabels, + }, + endpoints: [ + { port: 'web', interval: '30s' }, + ], + }, + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: 'alertmanager-' + am._config.name, + namespace: am._config.namespace, + labels: am._config.commonLabels, + }, + spec: { + maxUnavailable: 1, + selector: { + matchLabels: { + alertmanager: am._config.name, + } + am._config.selectorLabels, + }, + }, + }, + + alertmanager: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Alertmanager', + metadata: { + name: am._config.name, + namespace: am._config.namespace, + labels: { + alertmanager: am._config.name, + } + am._config.commonLabels, + }, + spec: { + replicas: am._config.replicas, + version: am._config.version, + image: am._config.image, + podMetadata: { + labels: am._config.commonLabels, + }, + resources: am._config.resources, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + serviceAccountName: 'alertmanager-' + am._config.name, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet b/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..cb4dcd9a7c974d88c8c6284ca63b7b1c202cfd93 --- /dev/null +++ b/jsonnet/kube-prometheus/components/blackbox-exporter.libsonnet @@ -0,0 +1,290 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + resources: { + requests: { cpu: '10m', memory: '20Mi' }, + limits: { cpu: '20m', memory: '40Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': 'blackbox-exporter', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + configmapReloaderImage: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + + port: 9115, + internalPort: 19115, + replicas: 1, + modules: { + http_2xx: { + prober: 'http', + http: { + preferred_ip_protocol: 'ip4', + }, + }, + http_post_2xx: { + prober: 'http', + http: { + method: 'POST', + preferred_ip_protocol: 'ip4', + }, + }, + tcp_connect: { + prober: 'tcp', + tcp: { + preferred_ip_protocol: 'ip4', + }, + }, + pop3s_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { expect: '^+OK' }, + ], + tls: true, + tls_config: { + insecure_skip_verify: false, + }, + preferred_ip_protocol: 'ip4', + }, + }, + ssh_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { expect: '^SSH-2.0-' }, + ], + preferred_ip_protocol: 'ip4', + }, + }, + irc_banner: { + prober: 'tcp', + tcp: { + query_response: [ + { send: 'NICK prober' }, + { send: 'USER prober prober prober :prober' }, + { expect: 'PING :([^ ]+)', send: 'PONG ${1}' }, + { expect: '^:[^ ]+ 001' }, + ], + preferred_ip_protocol: 'ip4', + }, + }, + }, + privileged: + local icmpModules = [self.modules[m] for m in std.objectFields(self.modules) if self.modules[m].prober == 'icmp']; + std.length(icmpModules) > 0, +}; + + +function(params) { + local bb = self, + _config:: defaults + params, + // Safety check + assert std.isObject(bb._config.resources), + + configuration: { + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: { + name: 'blackbox-exporter-configuration', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + data: { + 'config.yml': std.manifestYamlDoc({ modules: bb._config.modules }), + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + }, + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'blackbox-exporter', + }, + rules: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'blackbox-exporter', + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'blackbox-exporter', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'blackbox-exporter', + namespace: bb._config.namespace, + }], + }, + + deployment: + local blackboxExporter = { + name: 'blackbox-exporter', + image: bb._config.image, + args: [ + '--config.file=/etc/blackbox_exporter/config.yml', + '--web.listen-address=:%d' % bb._config.internalPort, + ], + ports: [{ + name: 'http', + containerPort: bb._config.internalPort, + }], + resources: bb._config.resources, + securityContext: if bb._config.privileged then { + runAsNonRoot: false, + capabilities: { drop: ['ALL'], add: ['NET_RAW'] }, + } else { + runAsNonRoot: true, + runAsUser: 65534, + }, + volumeMounts: [{ + mountPath: '/etc/blackbox_exporter/', + name: 'config', + readOnly: true, + }], + }; + + local reloader = { + name: 'module-configmap-reloader', + image: bb._config.configmapReloaderImage, + args: [ + '--webhook-url=http://localhost:%d/-/reload' % bb._config.internalPort, + '--volume-dir=/etc/blackbox_exporter/', + ], + resources: bb._config.resources, + securityContext: { runAsNonRoot: true, runAsUser: 65534 }, + terminationMessagePath: '/dev/termination-log', + terminationMessagePolicy: 'FallbackToLogsOnError', + volumeMounts: [{ + mountPath: '/etc/blackbox_exporter/', + name: 'config', + readOnly: true, + }], + }; + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + upstream: 'http://127.0.0.1:' + bb._config.internalPort + '/', + resources: bb._config.resources, + secureListenAddress: ':' + bb._config.port, + ports: [ + { name: 'https', containerPort: bb._config.port }, + ], + image: bb._config.kubeRbacProxyImage, + }); + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + replicas: bb._config.replicas, + selector: { matchLabels: bb._config.selectorLabels }, + template: { + metadata: { + labels: bb._config.commonLabels, + annotations: { + 'kubectl.kubernetes.io/default-container': blackboxExporter.name, + }, + }, + spec: { + containers: [blackboxExporter, reloader, kubeRbacProxy], + nodeSelector: { 'kubernetes.io/os': 'linux' }, + serviceAccountName: 'blackbox-exporter', + volumes: [{ + name: 'config', + configMap: { name: 'blackbox-exporter-configuration' }, + }], + }, + }, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + ports: [{ + name: 'https', + port: bb._config.port, + targetPort: 'https', + }, { + name: 'probe', + port: bb._config.internalPort, + targetPort: 'http', + }], + selector: bb._config.selectorLabels, + }, + }, + + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'blackbox-exporter', + namespace: bb._config.namespace, + labels: bb._config.commonLabels, + }, + spec: { + endpoints: [{ + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + interval: '30s', + path: '/metrics', + port: 'https', + scheme: 'https', + tlsConfig: { + insecureSkipVerify: true, + }, + }], + selector: { + matchLabels: bb._config.selectorLabels, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/grafana.libsonnet b/jsonnet/kube-prometheus/components/grafana.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..cf2c7ea2f82b131e558cfc08d16c577996130960 --- /dev/null +++ b/jsonnet/kube-prometheus/components/grafana.libsonnet @@ -0,0 +1,105 @@ +local defaults = { + local defaults = self, + name: 'grafana', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { cpu: '100m', memory: '100Mi' }, + limits: { cpu: '200m', memory: '200Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + prometheusName: error 'must provide prometheus name', + dashboards: {}, + // TODO(paulfantom): expose those to have a stable API. After kubernetes-grafana refactor those could probably be removed. + rawDashboards: {}, + folderDashboards: {}, + containers: [], + datasources: [], + config: {}, + plugins: [], + env: [], +}; + +function(params) { + local g = self, + _config:: defaults + params, + // Safety check + assert std.isObject(g._config.resources), + + local glib = (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + { + _config+:: { + namespace: g._config.namespace, + versions+:: { + grafana: g._config.version, + }, + imageRepos+:: { + grafana: std.split(g._config.image, ':')[0], + }, + prometheus+:: { + name: g._config.prometheusName, + }, + grafana+:: { + labels: g._config.commonLabels, + dashboards: g._config.dashboards, + resources: g._config.resources, + rawDashboards: g._config.rawDashboards, + folderDashboards: g._config.folderDashboards, + containers: g._config.containers, + config+: g._config.config, + plugins+: g._config.plugins, + env: g._config.env, + } + ( + // Conditionally overwrite default setting. + if std.length(g._config.datasources) > 0 then + { datasources: g._config.datasources } + else {} + ), + }, + }, + + config: glib.grafana.config, + service: glib.grafana.service, + serviceAccount: glib.grafana.serviceAccount, + deployment: glib.grafana.deployment, + dashboardDatasources: glib.grafana.dashboardDatasources, + dashboardSources: glib.grafana.dashboardSources, + + dashboardDefinitions: if std.length(g._config.dashboards) > 0 || + std.length(g._config.rawDashboards) > 0 || + std.length(g._config.folderDashboards) > 0 then { + apiVersion: 'v1', + kind: 'ConfigMapList', + items: glib.grafana.dashboardDefinitions, + }, + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'grafana', + namespace: g._config.namespace, + labels: g._config.commonLabels, + }, + spec: { + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'grafana', + }, + }, + endpoints: [{ + port: 'http', + interval: '15s', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..08cdfb2084f271bce38d1f7bfa1e6fa16523996d --- /dev/null +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -0,0 +1,319 @@ +local relabelings = import '../addons/dropping-deprecated-metrics-relabelings.libsonnet'; + +local defaults = { + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', + kubeletSelector: 'job="kubelet", metrics_path="/metrics"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s', + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + }, + }, + kubeProxy: false, +}; + +function(params) { + local k8s = self, + _config:: defaults + params, + + mixin:: (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') { + _config+:: k8s._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: k8s._config.commonLabels + k8s._config.mixin.ruleLabels, + name: 'kubernetes-monitoring-rules', + namespace: k8s._config.namespace, + }, + spec: { + local r = if std.objectHasAll(k8s.mixin, 'prometheusRules') then k8s.mixin.prometheusRules.groups else {}, + local a = if std.objectHasAll(k8s.mixin, 'prometheusAlerts') then k8s.mixin.prometheusAlerts.groups else {}, + groups: a + r, + }, + }, + + serviceMonitorKubeScheduler: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-scheduler', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kube-scheduler' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [{ + port: 'https-metrics', + interval: '30s', + scheme: 'https', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { insecureSkipVerify: true }, + }], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kube-scheduler' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorKubelet: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kubelet', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kubelet' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [ + { + port: 'https-metrics', + scheme: 'https', + interval: '30s', + honorLabels: true, + tlsConfig: { insecureSkipVerify: true }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: relabelings, + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/cadvisor', + interval: '30s', + honorLabels: true, + honorTimestamps: false, + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + metricRelabelings: [ + // Drop a bunch of metrics which are disabled but still sent, see + // https://github.com/google/cadvisor/issues/1925. + { + sourceLabels: ['__name__'], + regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', + action: 'drop', + }, + // Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation) + { + sourceLabels: ['__name__', 'pod', 'namespace'], + action: 'drop', + regex: '(' + std.join('|', + [ + 'container_fs_.*', // add filesystem read/write data (nodes*disks*services*4) + 'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5) + 'container_blkio_device_usage_total', // useful for containers, but not for system services (nodes*disks*services*operations*2) + 'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services) + 'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services) + 'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services) + 'container_threads', // used threads in cgroup. Usually not important for system services (nodes*services) + 'container_start_time_seconds', // container start. Possibly not needed for system services (nodes*services) + 'container_last_seen', // not needed as system services are always running (nodes*services) + ]) + ');;', + }, + ], + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/probes', + interval: '30s', + honorLabels: true, + tlsConfig: { insecureSkipVerify: true }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [{ + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', + }], + }, + ], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kubelet' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorKubeControllerManager: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-controller-manager', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'kube-controller-manager' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + endpoints: [{ + port: 'https-metrics', + interval: '30s', + scheme: 'https', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + metricRelabelings: relabelings + [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|request|server).*', + action: 'drop', + }, + ], + }], + selector: { + matchLabels: { 'app.kubernetes.io/name': 'kube-controller-manager' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + }, + }, + + serviceMonitorApiserver: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-apiserver', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'apiserver' }, + }, + spec: { + jobLabel: 'component', + selector: { + matchLabels: { + component: 'apiserver', + provider: 'kubernetes', + }, + }, + namespaceSelector: { + matchNames: ['default'], + }, + endpoints: [{ + port: 'https', + interval: '30s', + scheme: 'https', + tlsConfig: { + caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', + serverName: 'kubernetes', + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: relabelings + [ + { + sourceLabels: ['__name__'], + regex: 'etcd_(debugging|disk|server).*', + action: 'drop', + }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_controller_admission_latencies_seconds_.*', + action: 'drop', + }, + { + sourceLabels: ['__name__'], + regex: 'apiserver_admission_step_admission_latencies_seconds_.*', + action: 'drop', + }, + { + sourceLabels: ['__name__', 'le'], + regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)', + action: 'drop', + }, + ], + }], + }, + }, + + [if (defaults + params).kubeProxy then 'podMonitorKubeProxy']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PodMonitor', + metadata: { + labels: { + 'k8s-app': 'kube-proxy', + }, + name: 'kube-proxy', + namespace: k8s._config.namespace, + }, + spec: { + jobLabel: 'k8s-app', + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + selector: { + matchLabels: { + 'k8s-app': 'kube-proxy', + }, + }, + podMetricsEndpoints: [{ + honorLabels: true, + targetPort: 10249, + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + }], + }, + }, + + + serviceMonitorCoreDNS: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'coredns', + namespace: k8s._config.namespace, + labels: { 'app.kubernetes.io/name': 'coredns' }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: { 'k8s-app': 'kube-dns' }, + }, + namespaceSelector: { + matchNames: ['kube-system'], + }, + endpoints: [{ + port: 'metrics', + interval: '15s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }], + }, + }, + + +} diff --git a/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet b/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..534a2eed31bec0be9c6b26b23aea77053fa664b0 --- /dev/null +++ b/jsonnet/kube-prometheus/components/kube-rbac-proxy.libsonnet @@ -0,0 +1,63 @@ +local defaults = { + namespace: error 'must provide namespace', + image: error 'must provide image', + ports: error 'must provide ports', + secureListenAddress: error 'must provide secureListenAddress', + upstream: error 'must provide upstream', + resources: { + requests: { cpu: '10m', memory: '20Mi' }, + limits: { cpu: '20m', memory: '40Mi' }, + }, + tlsCipherSuites: [ + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 + + // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + // 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + // 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2 + // 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 + // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 + // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 + // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 + + // disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go + + 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + ], +}; + + +function(params) { + local krp = self, + _config:: defaults + params, + // Safety check + assert std.isObject(krp._config.resources), + + name: krp._config.name, + image: krp._config.image, + args: [ + '--logtostderr', + '--secure-listen-address=' + krp._config.secureListenAddress, + '--tls-cipher-suites=' + std.join(',', krp._config.tlsCipherSuites), + '--upstream=' + krp._config.upstream, + ], + resources: krp._config.resources, + ports: krp._config.ports, + securityContext: { + runAsUser: 65532, + runAsGroup: 65532, + runAsNonRoot: true, + }, +} diff --git a/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..b39a605cbac6ffd74575c3b8161e095ea4206fc2 --- /dev/null +++ b/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet @@ -0,0 +1,176 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + name: 'kube-state-metrics', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + resources: { + requests: { cpu: '10m', memory: '190Mi' }, + limits: { cpu: '100m', memory: '250Mi' }, + }, + + kubeRbacProxyMain: { + resources+: { + limits+: { cpu: '40m' }, + requests+: { cpu: '20m' }, + }, + }, + scrapeInterval: '30s', + scrapeTimeout: '30s', + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: {}, + _config: { + kubeStateMetricsSelector: 'job="' + defaults.name + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/%s', + }, + }, +}; + +function(params) (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') { + local ksm = self, + _config:: defaults + params, + // Safety check + assert std.isObject(ksm._config.resources), + assert std.isObject(ksm._config.mixin._config), + + name:: ksm._config.name, + namespace:: ksm._config.namespace, + version:: ksm._config.version, + image:: ksm._config.image, + commonLabels:: ksm._config.commonLabels, + podLabels:: ksm._config.selectorLabels, + + mixin:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: ksm._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ksm._config.commonLabels + ksm._config.mixin.ruleLabels, + name: ksm._config.name + '-rules', + namespace: ksm._config.namespace, + }, + spec: { + local r = if std.objectHasAll(ksm.mixin, 'prometheusRules') then ksm.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ksm.mixin, 'prometheusAlerts') then ksm.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + service+: { + spec+: { + ports: [ + { + name: 'https-main', + port: 8443, + targetPort: 'https-main', + }, + { + name: 'https-self', + port: 9443, + targetPort: 'https-self', + }, + ], + }, + }, + + local kubeRbacProxyMain = krp(ksm._config.kubeRbacProxyMain { + name: 'kube-rbac-proxy-main', + upstream: 'http://127.0.0.1:8081/', + secureListenAddress: ':8443', + ports: [ + { name: 'https-main', containerPort: 8443 }, + ], + image: ksm._config.kubeRbacProxyImage, + }), + + local kubeRbacProxySelf = krp({ + name: 'kube-rbac-proxy-self', + upstream: 'http://127.0.0.1:8082/', + secureListenAddress: ':9443', + ports: [ + { name: 'https-self', containerPort: 9443 }, + ], + image: ksm._config.kubeRbacProxyImage, + }), + + deployment+: { + spec+: { + template+: { + metadata+: { + annotations+: { + 'kubectl.kubernetes.io/default-container': 'kube-state-metrics', + }, + }, + spec+: { + containers: std.map(function(c) c { + ports:: null, + livenessProbe:: null, + readinessProbe:: null, + args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], + resources: ksm._config.resources, + }, super.containers) + [kubeRbacProxyMain, kubeRbacProxySelf], + }, + }, + }, + }, + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: ksm.name, + namespace: ksm._config.namespace, + labels: ksm._config.commonLabels, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { matchLabels: ksm._config.selectorLabels }, + endpoints: [ + { + port: 'https-main', + scheme: 'https', + interval: ksm._config.scrapeInterval, + scrapeTimeout: ksm._config.scrapeTimeout, + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [ + { + regex: '(pod|service|endpoint|namespace)', + action: 'labeldrop', + }, + ], + tlsConfig: { + insecureSkipVerify: true, + }, + }, + { + port: 'https-self', + scheme: 'https', + interval: ksm._config.scrapeInterval, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, +} diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet similarity index 61% rename from jsonnet/kube-prometheus/alerts/alerts.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet index adc461303474c64a86543b64abd6d83c4e9cd04b..8733ae447f6197e3e34db14b27b91e40afbed2c2 100644 --- a/jsonnet/kube-prometheus/alerts/alerts.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/alerts.libsonnet @@ -1,3 +1,2 @@ -(import 'alertmanager.libsonnet') + (import 'general.libsonnet') + (import 'node.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet similarity index 75% rename from jsonnet/kube-prometheus/alerts/general.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet index 16f3e39c6053a011ccdfd32bcf0867b827372cdf..cd5c71657d4ec09914cb360488e2facf239580b1 100644 --- a/jsonnet/kube-prometheus/alerts/general.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet @@ -7,7 +7,8 @@ { alert: 'TargetDown', annotations: { - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.', + summary: 'One or more targets are unreachable.', + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.', }, expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10', 'for': '10m', @@ -18,7 +19,8 @@ { alert: 'Watchdog', annotations: { - message: ||| + summary: 'An alert that should always be firing to certify that Alertmanager is working properly.', + description: ||| This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet similarity index 68% rename from jsonnet/kube-prometheus/alerts/node.libsonnet rename to jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet index d1b9caf6def34001693b277676228f17818f82a1..5bad9bf8da7e4b5b2dd9828b863c2dd870270b91 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/alerts/node.libsonnet @@ -7,7 +7,8 @@ { alert: 'NodeNetworkInterfaceFlapping', annotations: { - message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"', + summary: 'Network interface is often changing its status', + description: 'Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}', }, expr: ||| changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2 diff --git a/jsonnet/kube-prometheus/components/mixin/custom.libsonnet b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..c8f43b0340baf35faa5f64a6a7d7920c7cb1315d --- /dev/null +++ b/jsonnet/kube-prometheus/components/mixin/custom.libsonnet @@ -0,0 +1,44 @@ +local defaults = { + name: 'kube-prometheus', + namespace: error 'must provide namespace', + commonLabels:: { + 'app.kubernetes.io/name': 'kube-prometheus', + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="node-exporter"', + hostNetworkInterfaceSelector: 'device!~"veth.+"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/general/%s', + }, + }, +}; + +function(params) { + local m = self, + _config:: defaults + params, + + local alertsandrules = (import './alerts/alerts.libsonnet') + (import './rules/rules.libsonnet'), + + mixin:: alertsandrules + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: m._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m._config.commonLabels + m._config.mixin.ruleLabels, + name: m._config.name + '-rules', + namespace: m._config.namespace, + }, + spec: { + local r = if std.objectHasAll(m.mixin, 'prometheusRules') then m.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(m.mixin, 'prometheusAlerts') then m.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, +} diff --git a/jsonnet/kube-prometheus/rules/general.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/general.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/general.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/general.libsonnet diff --git a/jsonnet/kube-prometheus/rules/node-rules.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet similarity index 81% rename from jsonnet/kube-prometheus/rules/node-rules.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet index acd05424ae2450bc52443d16421855352e7f7cbc..8cdee4798116642e10beabf8a72226c9ec88e5eb 100644 --- a/jsonnet/kube-prometheus/rules/node-rules.libsonnet +++ b/jsonnet/kube-prometheus/components/mixin/rules/node-rules.libsonnet @@ -5,7 +5,7 @@ name: 'kube-prometheus-node-recording.rules', rules: [ { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)', record: 'instance:node_cpu:rate:sum', }, { @@ -17,11 +17,11 @@ record: 'instance:node_network_transmit_bytes:rate:sum', }, { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)', record: 'instance:node_cpu:ratio', }, { - expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))', + expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))', record: 'cluster:node_cpu:sum_rate5m', }, { diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/components/mixin/rules/rules.libsonnet similarity index 100% rename from jsonnet/kube-prometheus/rules/rules.libsonnet rename to jsonnet/kube-prometheus/components/mixin/rules/rules.libsonnet diff --git a/jsonnet/kube-prometheus/components/node-exporter.libsonnet b/jsonnet/kube-prometheus/components/node-exporter.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..1452174aa6f351034c3fb35365189aa221699036 --- /dev/null +++ b/jsonnet/kube-prometheus/components/node-exporter.libsonnet @@ -0,0 +1,259 @@ +local krp = import './kube-rbac-proxy.libsonnet'; + +local defaults = { + local defaults = self, + name: 'node-exporter', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide version', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + resources: { + requests: { cpu: '102m', memory: '180Mi' }, + limits: { cpu: '250m', memory: '180Mi' }, + }, + listenAddress: '127.0.0.1', + port: 9100, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: {}, + _config: { + nodeExporterSelector: 'job="' + defaults.name + '"', + // Adjust NodeFilesystemSpaceFillingUp warning and critical thresholds according to the following default kubelet + // GC values, + // imageGCLowThresholdPercent: 80 + // imageGCHighThresholdPercent: 85 + // See https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/ for more details. + fsSpaceFillingUpWarningThreshold: 20, + fsSpaceFillingUpCriticalThreshold: 15, + diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/node/%s', + }, + }, +}; + + +function(params) { + local ne = self, + _config:: defaults + params, + // Safety check + assert std.isObject(ne._config.resources), + assert std.isObject(ne._config.mixin._config), + + mixin:: (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: ne._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: ne._config.commonLabels + ne._config.mixin.ruleLabels, + name: ne._config.name + '-rules', + namespace: ne._config.namespace, + }, + spec: { + local r = if std.objectHasAll(ne.mixin, 'prometheusRules') then ne.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(ne.mixin, 'prometheusAlerts') then ne.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: ne._config.name, + labels: ne._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: ne._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: ne._config.name, + namespace: ne._config.namespace, + }], + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: ne._config.name, + labels: ne._config.commonLabels, + }, + rules: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + ports: [ + { name: 'https', targetPort: 'https', port: ne._config.port }, + ], + selector: ne._config.selectorLabels, + clusterIP: 'None', + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: ne._config.selectorLabels, + }, + endpoints: [{ + port: 'https', + scheme: 'https', + interval: '15s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [ + { + action: 'replace', + regex: '(.*)', + replacement: '$1', + sourceLabels: ['__meta_kubernetes_pod_node_name'], + targetLabel: 'instance', + }, + ], + tlsConfig: { + insecureSkipVerify: true, + }, + }], + }, + }, + + daemonset: + local nodeExporter = { + name: ne._config.name, + image: ne._config.image, + args: [ + '--web.listen-address=' + std.join(':', [ne._config.listenAddress, std.toString(ne._config.port)]), + '--path.sysfs=/host/sys', + '--path.rootfs=/host/root', + '--no-collector.wifi', + '--no-collector.hwmon', + '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)', + // NOTE: ignore veth network interface associated with containers. + // OVN renames veth.* to <rand-hex>@if<X> where X is /sys/class/net/<if>/ifindex + // thus [a-z0-9] regex below + '--collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$', + '--collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$', + ], + volumeMounts: [ + { name: 'sys', mountPath: '/host/sys', mountPropagation: 'HostToContainer', readOnly: true }, + { name: 'root', mountPath: '/host/root', mountPropagation: 'HostToContainer', readOnly: true }, + ], + resources: ne._config.resources, + }; + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + //image: krpImage, + upstream: 'http://127.0.0.1:' + ne._config.port + '/', + secureListenAddress: '[$(IP)]:' + ne._config.port, + // Keep `hostPort` here, rather than in the node-exporter container + // because Kubernetes mandates that if you define a `hostPort` then + // `containerPort` must match. In our case, we are splitting the + // host port and container port between the two containers. + // We'll keep the port specification here so that the named port + // used by the service is tied to the proxy container. We *could* + // forgo declaring the host port, however it is important to declare + // it so that the scheduler can decide if the pod is schedulable. + ports: [ + { name: 'https', containerPort: ne._config.port, hostPort: ne._config.port }, + ], + image: ne._config.kubeRbacProxyImage, + }) + { + env: [ + { name: 'IP', valueFrom: { fieldRef: { fieldPath: 'status.podIP' } } }, + ], + }; + + { + apiVersion: 'apps/v1', + kind: 'DaemonSet', + metadata: { + name: ne._config.name, + namespace: ne._config.namespace, + labels: ne._config.commonLabels, + }, + spec: { + selector: { matchLabels: ne._config.selectorLabels }, + updateStrategy: { + type: 'RollingUpdate', + rollingUpdate: { maxUnavailable: '10%' }, + }, + template: { + metadata: { labels: ne._config.commonLabels }, + spec: { + nodeSelector: { 'kubernetes.io/os': 'linux' }, + tolerations: [{ + operator: 'Exists', + }], + containers: [nodeExporter, kubeRbacProxy], + volumes: [ + { name: 'sys', hostPath: { path: '/sys' } }, + { name: 'root', hostPath: { path: '/' } }, + ], + serviceAccountName: ne._config.name, + securityContext: { + runAsUser: 65534, + runAsNonRoot: true, + }, + hostPID: true, + hostNetwork: true, + }, + }, + }, + }, + + +} diff --git a/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..86d0475aeb80c51d19493af2a9c2577ccb0df487 --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet @@ -0,0 +1,380 @@ +local defaults = { + local defaults = self, + name: 'prometheus-adapter', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { cpu: '102m', memory: '180Mi' }, + limits: { cpu: '250m', memory: '180Mi' }, + }, + replicas: 2, + listenAddress: '127.0.0.1', + port: 9100, + commonLabels:: { + 'app.kubernetes.io/name': 'prometheus-adapter', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'metrics-adapter', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + // Default range intervals are equal to 4 times the default scrape interval. + // This is done in order to follow Prometheus rule of thumb with irate(). + rangeIntervals: { + kubelet: '4m', + nodeExporter: '4m', + windowsExporter: '4m', + }, + + prometheusURL: error 'must provide prometheusURL', + config: { + resourceRules: { + cpu: { + containerQuery: ||| + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[%(kubelet)s] + ) + ) + ||| % $.rangeIntervals, + nodeQuery: ||| + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[%(nodeExporter)s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[%(windowsExporter)s] + ) + ) + ||| % $.rangeIntervals, + resources: { + overrides: { + node: { resource: 'node' }, + namespace: { resource: 'namespace' }, + pod: { resource: 'pod' }, + }, + }, + containerLabel: 'container', + }, + memory: { + containerQuery: ||| + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + |||, + nodeQuery: ||| + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) + |||, + resources: { + overrides: { + instance: { resource: 'node' }, + namespace: { resource: 'namespace' }, + pod: { resource: 'pod' }, + }, + }, + containerLabel: 'container', + }, + window: '5m', + }, + }, + tlsCipherSuites: [ + 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', + 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', + 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', + 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', + 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', + 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', + 'TLS_RSA_WITH_AES_128_GCM_SHA256', + 'TLS_RSA_WITH_AES_256_GCM_SHA384', + 'TLS_RSA_WITH_AES_128_CBC_SHA', + 'TLS_RSA_WITH_AES_256_CBC_SHA', + ], +}; + +function(params) { + local pa = self, + _config:: defaults + params, + // Safety check + assert std.isObject(pa._config.resources), + + apiService: { + apiVersion: 'apiregistration.k8s.io/v1', + kind: 'APIService', + metadata: { + name: 'v1beta1.metrics.k8s.io', + labels: pa._config.commonLabels, + }, + spec: { + service: { + name: $.service.metadata.name, + namespace: pa._config.namespace, + }, + group: 'metrics.k8s.io', + version: 'v1beta1', + insecureSkipTLSVerify: true, + groupPriorityMinimum: 100, + versionPriority: 100, + }, + }, + + configMap: { + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: { + name: 'adapter-config', + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + data: { 'config.yaml': std.manifestYamlDoc(pa._config.config) }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + selector: { + matchLabels: pa._config.selectorLabels, + }, + endpoints: [ + { + port: 'https', + interval: '30s', + scheme: 'https', + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + ], + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + ports: [ + { name: 'https', targetPort: 6443, port: 443 }, + ], + selector: pa._config.selectorLabels, + }, + }, + + deployment: + local c = { + name: pa._config.name, + image: pa._config.image, + args: [ + '--cert-dir=/var/run/serving-cert', + '--config=/etc/adapter/config.yaml', + '--logtostderr=true', + '--metrics-relist-interval=1m', + '--prometheus-url=' + pa._config.prometheusURL, + '--secure-port=6443', + '--tls-cipher-suites=' + std.join(',', pa._config.tlsCipherSuites), + ], + resources: pa._config.resources, + ports: [{ containerPort: 6443 }], + volumeMounts: [ + { name: 'tmpfs', mountPath: '/tmp', readOnly: false }, + { name: 'volume-serving-cert', mountPath: '/var/run/serving-cert', readOnly: false }, + { name: 'config', mountPath: '/etc/adapter', readOnly: false }, + ], + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + replicas: pa._config.replicas, + selector: { matchLabels: pa._config.selectorLabels }, + strategy: { + rollingUpdate: { + maxSurge: 1, + maxUnavailable: 1, + }, + }, + template: { + metadata: { labels: pa._config.commonLabels }, + spec: { + containers: [c], + serviceAccountName: $.serviceAccount.metadata.name, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + volumes: [ + { name: 'tmpfs', emptyDir: {} }, + { name: 'volume-serving-cert', emptyDir: {} }, + { name: 'config', configMap: { name: 'adapter-config' } }, + ], + }, + }, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: pa._config.name, + labels: pa._config.commonLabels, + }, + rules: [{ + apiGroups: [''], + resources: ['nodes', 'namespaces', 'pods', 'services'], + verbs: ['get', 'list', 'watch'], + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: pa._config.name, + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: $.clusterRole.metadata.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + clusterRoleBindingDelegator: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'resource-metrics:system:auth-delegator', + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'system:auth-delegator', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + clusterRoleServerResources: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'resource-metrics-server-resources', + labels: pa._config.commonLabels, + }, + rules: [{ + apiGroups: ['metrics.k8s.io'], + resources: ['*'], + verbs: ['*'], + }], + }, + + clusterRoleAggregatedMetricsReader: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'system:aggregated-metrics-reader', + labels: { + 'rbac.authorization.k8s.io/aggregate-to-admin': 'true', + 'rbac.authorization.k8s.io/aggregate-to-edit': 'true', + 'rbac.authorization.k8s.io/aggregate-to-view': 'true', + } + pa._config.commonLabels, + }, + rules: [{ + apiGroups: ['metrics.k8s.io'], + resources: ['pods', 'nodes'], + verbs: ['get', 'list', 'watch'], + }], + }, + + roleBindingAuthReader: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'resource-metrics-auth-reader', + namespace: 'kube-system', + labels: pa._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'extension-apiserver-authentication-reader', + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: pa._config.namespace, + }], + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: pa._config.name, + namespace: pa._config.namespace, + labels: pa._config.commonLabels, + }, + spec: { + minAvailable: 1, + selector: { + matchLabels: pa._config.selectorLabels, + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..b0a78e0625586e049e870b90530f095c8faf7368 --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet @@ -0,0 +1,130 @@ +local krp = import './kube-rbac-proxy.libsonnet'; +local prometheusOperator = import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet'; + +local defaults = { + local defaults = self, + name: 'prometheus-operator', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + kubeRbacProxyImage: error 'must provide kubeRbacProxyImage', + configReloaderImage: error 'must provide config reloader image', + resources: { + limits: { cpu: '200m', memory: '200Mi' }, + requests: { cpu: '100m', memory: '100Mi' }, + }, + commonLabels:: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'controller', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + mixin: { + ruleLabels: { + role: 'alert-rules', + prometheus: defaults.name, + }, + _config: { + prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s', + }, + }, +}; + +function(params) + local config = defaults + params; + // Safety check + assert std.isObject(config.resources); + + prometheusOperator(config) { + local po = self, + // declare variable as a field to allow overriding options and to have unified API across all components + _config:: config, + mixin:: (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') { + _config+:: po._config.mixin._config, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: po._config.commonLabels + po._config.mixin.ruleLabels, + name: po._config.name + '-rules', + namespace: po._config.namespace, + }, + spec: { + local r = if std.objectHasAll(po.mixin, 'prometheusRules') then po.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(po.mixin, 'prometheusAlerts') then po.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + service+: { + spec+: { + ports: [ + { + name: 'https', + port: 8443, + targetPort: 'https', + }, + ], + }, + }, + + serviceMonitor+: { + spec+: { + endpoints: [ + { + port: 'https', + scheme: 'https', + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + + clusterRole+: { + rules+: [ + { + apiGroups: ['authentication.k8s.io'], + resources: ['tokenreviews'], + verbs: ['create'], + }, + { + apiGroups: ['authorization.k8s.io'], + resources: ['subjectaccessreviews'], + verbs: ['create'], + }, + ], + }, + + local kubeRbacProxy = krp({ + name: 'kube-rbac-proxy', + upstream: 'http://127.0.0.1:8080/', + secureListenAddress: ':8443', + ports: [ + { name: 'https', containerPort: 8443 }, + ], + image: po._config.kubeRbacProxyImage, + }), + + deployment+: { + spec+: { + template+: { + spec+: { + containers+: [kubeRbacProxy], + }, + }, + }, + }, + } diff --git a/jsonnet/kube-prometheus/components/prometheus.libsonnet b/jsonnet/kube-prometheus/components/prometheus.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..2df12b96100c861707e01829ba8d50cbcca81d89 --- /dev/null +++ b/jsonnet/kube-prometheus/components/prometheus.libsonnet @@ -0,0 +1,394 @@ +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + resources: { + requests: { memory: '400Mi' }, + }, + + name: error 'must provide name', + alertmanagerName: error 'must provide alertmanagerName', + namespaces: ['default', 'kube-system', defaults.namespace], + replicas: 2, + externalLabels: {}, + enableFeatures: [], + commonLabels:: { + 'app.kubernetes.io/name': 'prometheus', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'prometheus', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + selectorLabels:: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + } + { prometheus: defaults.name }, + ruleSelector: {}, + mixin: { + ruleLabels: {}, + _config: { + prometheusSelector: 'job="prometheus-' + defaults.name + '",namespace="' + defaults.namespace + '"', + prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', + thanosSelector: 'job="thanos-sidecar"', + runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus/%s', + }, + }, + thanos: null, +}; + + +function(params) { + local p = self, + _config:: defaults + params, + // Safety check + assert std.isObject(p._config.resources), + assert std.isObject(p._config.mixin._config), + + mixin:: + (import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') + { + _config+:: p._config.mixin._config, + }, + + mixinThanos:: + (import 'github.com/thanos-io/thanos/mixin/alerts/sidecar.libsonnet') + + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/add-runbook-links.libsonnet') + { + _config+:: p._config.mixin._config, + targetGroups: {}, + sidecar: { + selector: p._config.mixin._config.thanosSelector, + dimensions: std.join(', ', ['job', 'instance']), + }, + }, + + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.commonLabels + p._config.mixin.ruleLabels, + name: 'prometheus-' + p._config.name + '-prometheus-rules', + namespace: p._config.namespace, + }, + spec: { + local r = if std.objectHasAll(p.mixin, 'prometheusRules') then p.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(p.mixin, 'prometheusAlerts') then p.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: { prometheus: p._config.name } + p._config.commonLabels, + }, + spec: { + ports: [ + { name: 'web', targetPort: 'web', port: 9090 }, + ] + + ( + if p._config.thanos != null then + [{ name: 'grpc', port: 10901, targetPort: 10901 }] + else [] + ), + selector: { app: 'prometheus' } + p._config.selectorLabels, + sessionAffinity: 'ClientIP', + }, + }, + + roleBindingSpecificNamespaces: + local newSpecificRoleBinding(namespace) = { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: namespace, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'prometheus-' + p._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }; + { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBindingList', + items: [newSpecificRoleBinding(x) for x in p._config.namespaces], + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + name: 'prometheus-' + p._config.name, + labels: p._config.commonLabels, + }, + rules: [ + { + apiGroups: [''], + resources: ['nodes/metrics'], + verbs: ['get'], + }, + { + nonResourceURLs: ['/metrics'], + verbs: ['get'], + }, + ], + }, + + roleConfig: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'prometheus-' + p._config.name + '-config', + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + rules: [{ + apiGroups: [''], + resources: ['configmaps'], + verbs: ['get'], + }], + }, + + roleBindingConfig: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleBinding', + metadata: { + name: 'prometheus-' + p._config.name + '-config', + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'Role', + name: 'prometheus-' + p._config.name + '-config', + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + name: 'prometheus-' + p._config.name, + labels: p._config.commonLabels, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: 'prometheus-' + p._config.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + }], + }, + + roleSpecificNamespaces: + local newSpecificRole(namespace) = { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'Role', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: namespace, + labels: p._config.commonLabels, + }, + rules: [ + { + apiGroups: [''], + resources: ['services', 'endpoints', 'pods'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['extensions'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + { + apiGroups: ['networking.k8s.io'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }, + ], + }; + { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'RoleList', + items: [newSpecificRole(x) for x in p._config.namespaces], + }, + + [if (defaults + params).replicas > 1 then 'podDisruptionBudget']: { + apiVersion: 'policy/v1beta1', + kind: 'PodDisruptionBudget', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + spec: { + minAvailable: 1, + selector: { + matchLabels: { + prometheus: p._config.name, + } + p._config.selectorLabels, + }, + }, + }, + + prometheus: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Prometheus', + metadata: { + name: p._config.name, + namespace: p._config.namespace, + labels: { prometheus: p._config.name } + p._config.commonLabels, + }, + spec: { + replicas: p._config.replicas, + version: p._config.version, + image: p._config.image, + podMetadata: { + labels: p._config.commonLabels, + }, + externalLabels: p._config.externalLabels, + enableFeatures: p._config.enableFeatures, + serviceAccountName: 'prometheus-' + p._config.name, + podMonitorSelector: {}, + podMonitorNamespaceSelector: {}, + probeSelector: {}, + probeNamespaceSelector: {}, + ruleNamespaceSelector: {}, + ruleSelector: p._config.ruleSelector, + serviceMonitorSelector: {}, + serviceMonitorNamespaceSelector: {}, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + resources: p._config.resources, + alerting: { + alertmanagers: [{ + namespace: p._config.namespace, + name: 'alertmanager-' + p._config.alertmanagerName, + port: 'web', + apiVersion: 'v2', + }], + }, + securityContext: { + runAsUser: 1000, + runAsNonRoot: true, + fsGroup: 2000, + }, + [if std.objectHas(params, 'thanos') then 'thanos']: p._config.thanos, + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'prometheus-' + p._config.name, + namespace: p._config.namespace, + labels: p._config.commonLabels, + }, + spec: { + selector: { + matchLabels: p._config.selectorLabels, + }, + endpoints: [{ + port: 'web', + interval: '30s', + }], + }, + }, + + // Include thanos sidecar PrometheusRule only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'prometheusRuleThanosSidecar']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: p._config.commonLabels + p._config.mixin.ruleLabels, + name: 'prometheus-' + p._config.name + '-thanos-sidecar-rules', + namespace: p._config.namespace, + }, + spec: { + local r = if std.objectHasAll(p.mixinThanos, 'prometheusRules') then p.mixinThanos.prometheusRules.groups else [], + local a = if std.objectHasAll(p.mixinThanos, 'prometheusAlerts') then p.mixinThanos.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + // Include thanos sidecar Service only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'serviceThanosSidecar']: { + apiVersion: 'v1', + kind: 'Service', + metadata+: { + name: 'prometheus-' + p._config.name + '-thanos-sidecar', + namespace: p._config.namespace, + labels+: p._config.commonLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + spec+: { + ports: [ + { name: 'grpc', port: 10901, targetPort: 10901 }, + { name: 'http', port: 10902, targetPort: 10902 }, + ], + selector: p._config.selectorLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'prometheus', + }, + clusterIP: 'None', + }, + }, + + // Include thanos sidecar ServiceMonitor only if thanos config was passed by user + [if std.objectHas(params, 'thanos') && params.thanos != null then 'serviceMonitorThanosSidecar']: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata+: { + name: 'thanos-sidecar', + namespace: p._config.namespace, + labels: p._config.commonLabels { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + spec+: { + jobLabel: 'app.kubernetes.io/component', + selector: { + matchLabels: { + prometheus: p._config.name, + 'app.kubernetes.io/component': 'thanos-sidecar', + }, + }, + endpoints: [{ + port: 'http', + interval: '30s', + }], + }, + }, +} diff --git a/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet b/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet deleted file mode 100644 index d83368cf1021281304309baf15189ea609b84604..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet +++ /dev/null @@ -1,50 +0,0 @@ -[ - // Drop all kubelet metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)', - action: 'drop', - }, - // Drop all scheduler metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)', - action: 'drop', - }, - // Drop all apiserver metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)', - action: 'drop', - }, - // Drop all docker metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)', - action: 'drop', - }, - // Drop all reflector metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)', - action: 'drop', - }, - // Drop all etcd metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)', - action: 'drop', - }, - // Drop all transformation metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: 'transformation_(transformation_latencies_microseconds|failures_total)', - action: 'drop', - }, - // Drop all other metrics which are deprecated in kubernetes. - { - sourceLabels: ['__name__'], - regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)', - action: 'drop', - }, -] diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 7c74b15ef9f71da16eaacea1ad44c93bacd5bbbc..232ef3f17f81d744216817849096b9b810e837ef 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -14,10 +14,10 @@ "source": { "git": { "remote": "https://github.com/etcd-io/etcd", - "subdir": "Documentation/etcd-mixin" + "subdir": "contrib/mixin" } }, - "version": "master" + "version": "main" }, { "source": { @@ -26,7 +26,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "release-0.43" + "version": "master" }, { "source": { @@ -35,17 +35,8 @@ "subdir": "jsonnet/mixin" } }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib", - "subdir": "" - } - }, "version": "master", - "name": "ksonnet" + "name": "prometheus-operator-mixin" }, { "source": { @@ -63,7 +54,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "release-1.9" + "version": "master" }, { "source": { @@ -90,8 +81,28 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "release-2.22", + "version": "main", "name": "prometheus" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "main", + "name": "alertmanager" + }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos", + "subdir": "mixin" + } + }, + "version": "main", + "name": "thanos-mixin" } ], "legacyImports": true diff --git a/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet b/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet deleted file mode 100644 index 1fed631ded7d73d74342a8711b7e4bcef6aa18a4..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/ksm-autoscaler/ksm-autoscaler.libsonnet +++ /dev/null @@ -1,118 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - versions+:: { - clusterVerticalAutoscaler: "v0.8.1" - }, - - imageRepos+:: { - clusterVerticalAutoscaler: 'gcr.io/google_containers/cpvpa-amd64' - }, - - kubeStateMetrics+:: { - stepCPU: '1m', - stepMemory: '2Mi', - }, - }, - ksmAutoscaler+:: { - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local rulesType = clusterRole.rulesType; - - local rules = [ - rulesType.new() + - rulesType.withApiGroups(['']) + - rulesType.withResources([ - 'nodes', - ]) + - rulesType.withVerbs(['list', 'watch']), - ]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('ksm-autoscaler') + - clusterRole.withRules(rules), - - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('ksm-autoscaler') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('ksm-autoscaler') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $._config.namespace }]), - - roleBinding: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('ksm-autoscaler') + - roleBinding.mixin.metadata.withNamespace($._config.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('ksm-autoscaler') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }]), - - role: - local role = k.rbac.v1.role; - local rulesType = role.rulesType; - - local extensionsRule = rulesType.new() + - rulesType.withApiGroups(['extensions']) + - rulesType.withResources([ - 'deployments', - ]) + - rulesType.withVerbs(['patch']) + - rulesType.withResourceNames(['kube-state-metrics']); - - local appsRule = rulesType.new() + - rulesType.withApiGroups(['apps']) + - rulesType.withResources([ - 'deployments', - ]) + - rulesType.withVerbs(['patch']) + - rulesType.withResourceNames(['kube-state-metrics']); - - local rules = [extensionsRule, appsRule]; - - role.new() + - role.mixin.metadata.withName('ksm-autoscaler') + - role.mixin.metadata.withNamespace($._config.namespace) + - role.withRules(rules), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('ksm-autoscaler') + - serviceAccount.mixin.metadata.withNamespace($._config.namespace), - deployment: - local deployment = k.apps.v1.deployment; - local container = deployment.mixin.spec.template.spec.containersType; - local podSelector = deployment.mixin.spec.template.spec.selectorType; - local podLabels = { app: 'ksm-autoscaler' }; - - local kubeStateMetricsAutoscaler = - container.new('ksm-autoscaler', $._config.imageRepos.clusterVerticalAutoscaler + ':' + $._config.versions.clusterVerticalAutoscaler) + - container.withArgs([ - '/cpvpa', - '--target=deployment/kube-state-metrics', - '--namespace=' + $._config.namespace, - '--logtostderr=true', - '--poll-period-seconds=10', - '--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}}}}' - ]) + - container.mixin.resources.withRequests({cpu: '20m', memory: '10Mi'}); - - local c = [kubeStateMetricsAutoscaler]; - - deployment.new('ksm-autoscaler', 1, c, podLabels) + - deployment.mixin.metadata.withNamespace($._config.namespace) + - deployment.mixin.metadata.withLabels(podLabels) + - deployment.mixin.spec.selector.withMatchLabels(podLabels) + - deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - deployment.mixin.spec.template.spec.withServiceAccountName('ksm-autoscaler'), - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet deleted file mode 100644 index e6ab55486fbc2af8d35b90cd494dfdaaaba7467d..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-all-namespaces.libsonnet +++ /dev/null @@ -1,20 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - prometheus+:: { - clusterRole+: { - rules+: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - local rule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - [rule] - }, - } -} \ No newline at end of file diff --git a/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet deleted file mode 100644 index 59014d551ad7957da1ce60c1889d2165d0e67d07..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-anti-affinity.libsonnet +++ /dev/null @@ -1,41 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; -local statefulSet = k.apps.v1.statefulSet; -local affinity = statefulSet.mixin.spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecutionType; -local matchExpression = affinity.mixin.podAffinityTerm.labelSelector.matchExpressionsType; - -{ - local antiaffinity(key, values, namespace) = { - affinity: { - podAntiAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - affinity.new() + - affinity.withWeight(100) + - affinity.mixin.podAffinityTerm.withNamespaces(namespace) + - affinity.mixin.podAffinityTerm.withTopologyKey('kubernetes.io/hostname') + - affinity.mixin.podAffinityTerm.labelSelector.withMatchExpressions([ - matchExpression.new() + - matchExpression.withKey(key) + - matchExpression.withOperator('In') + - matchExpression.withValues(values), - ]), - ], - }, - }, - }, - - alertmanager+:: { - alertmanager+: { - spec+: - antiaffinity('alertmanager', [$._config.alertmanager.name], $._config.namespace), - }, - }, - - prometheus+: { - local p = self, - - prometheus+: { - spec+: - antiaffinity('prometheus', [p.name], p.namespace), - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet deleted file mode 100644 index ad2784076d0624b86a191cc8b8a82bb7cace31f9..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-config-mixins.libsonnet +++ /dev/null @@ -1,20 +0,0 @@ -local l = import 'lib/lib.libsonnet'; - -// withImageRepository is a mixin that replaces all images prefixes by repository. eg. -// quay.io/coreos/addon-resizer -> $repository/addon-resizer -// grafana/grafana -> grafana $repository/grafana -local withImageRepository(repository) = { - local oldRepos = super._config.imageRepos, - local substituteRepository(image, repository) = - if repository == null then image else repository + '/' + l.imageName(image), - _config+:: { - imageRepos:: { - [field]: substituteRepository(oldRepos[field], repository), - for field in std.objectFields(oldRepos) - } - }, -}; - -{ - withImageRepository:: withImageRepository, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet deleted file mode 100644 index c4230e197e9e23af57c03d6e7c8b4db1637c4876..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet +++ /dev/null @@ -1,89 +0,0 @@ -{ - _config+:: { - eks: { - minimumAvailableIPs: 10, - minimumAvailableIPsTime: '10m', - }, - }, - prometheus+: { - serviceMonitorCoreDNS+: { - spec+: { - endpoints: [ - { - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - interval: '15s', - targetPort: 9153, - }, - ], - }, - }, - AwsEksCniMetricService: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'aws-node', - namespace: 'kube-system', - labels: { 'k8s-app': 'aws-node' }, - }, - spec: { - ports: [ - { name: 'cni-metrics-port', port: 61678, targetPort: 61678 }, - ], - selector: { 'k8s-app': 'aws-node' }, - clusterIP: 'None', - }, - }, - - serviceMonitorAwsEksCNI: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'awsekscni', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'eks-cni', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'aws-node', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'cni-metrics-port', - interval: '30s', - path: '/metrics', - }, - ], - }, - }, - }, - prometheusRules+: { - groups+: [ - { - name: 'kube-prometheus-eks.rules', - rules: [ - { - expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs, - labels: { - severity: 'critical', - }, - annotations: { - message: 'Instance {{ $labels.instance }} has less than 10 IPs available.', - }, - 'for': $._config.eks.minimumAvailableIPsTime, - alert: 'EksAvailableIPs', - }, - ], - }, - ], - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet deleted file mode 100644 index c1e7682d57842bfa039cec4bf20c6cca11e93849..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-kubespray.libsonnet +++ /dev/null @@ -1,56 +0,0 @@ -local service(name, namespace, labels, selector, ports) = { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: name, - namespace: namespace, - labels: labels, - }, - spec: { - ports+: ports, - selector: selector, - clusterIP: 'None', - }, -}; - -{ - - prometheus+: { - kubeControllerManagerPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', - 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, - [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] - ), - - kubeSchedulerPrometheusDiscoveryService: service( - 'kube-scheduler-prometheus-discovery', - 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, - [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], - ), - - serviceMonitorKubeScheduler+: { - spec+: { - selector+: { - matchLabels: { - 'k8s-app': 'kube-scheduler', - }, - }, - }, - }, - - serviceMonitorKubeControllerManager+: { - spec+: { - selector+: { - matchLabels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - }, - }, - - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet deleted file mode 100644 index 9b4e1a8cdd0e764d1c5fb7029fbdead178efe374..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-managed-cluster.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -// On managed Kubernetes clusters some of the control plane components are not exposed to customers. -// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'kube-prometheus.libsonnet' defaults - -{ - _config+:: { - // This snippet walks the original object (super.jobs, set as temp var j) and creates a replacement jobs object - // excluding any members of the set specified (eg: controller and scheduler). - local j = super.jobs, - jobs: { - [k]: j[k] - for k in std.objectFields(j) - if !std.setMember(k, ['KubeControllerManager', 'KubeScheduler']) - }, - - // Skip alerting rules too - prometheus+:: { - rules+:: { - local g = super.groups, - groups: [ - h - for h in g - if !std.setMember(h.name, ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler']) - ], - }, - }, - }, - - // Same as above but for ServiceMonitor's - local p = super.prometheus, - prometheus: { - [q]: p[q] - for q in std.objectFields(p) - if !std.setMember(q, ['serviceMonitorKubeControllerManager', 'serviceMonitorKubeScheduler']) - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet deleted file mode 100644 index fbd40200a47c5592783cc75d4eb2a94dbe63b68b..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-strip-limits.libsonnet +++ /dev/null @@ -1,35 +0,0 @@ -// Strips spec.containers[].limits for certain containers -// https://github.com/prometheus-operator/kube-prometheus/issues/72 -{ - _config+:: { - resources+:: { - 'addon-resizer'+: { - limits: {}, - }, - 'kube-rbac-proxy'+: { - limits: {}, - }, - 'kube-state-metrics'+: { - limits: {}, - }, - 'node-exporter'+: { - limits: {}, - }, - }, - }, - prometheusOperator+: { - deployment+: { - spec+: { - template+: { - spec+: { - local addArgs(c) = - if c.name == 'prometheus-operator' - then c { args+: ['--config-reloader-cpu=0'] } - else c, - containers: std.map(addArgs, super.containers), - }, - }, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet deleted file mode 100644 index 7d98c309953cd7debbaef378afab75ac06d5999f..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-thanos-sidecar.libsonnet +++ /dev/null @@ -1,77 +0,0 @@ -{ - _config+:: { - versions+:: { thanos: 'v0.14.0' }, - imageRepos+:: { thanos: 'quay.io/thanos/thanos' }, - thanos+:: { - objectStorageConfig: { - key: 'thanos.yaml', // How the file inside the secret is called - name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config - }, - }, - }, - prometheus+:: { - // Add the grpc port to the Prometheus service to be able to query it with the Thanos Querier - service+: { - spec+: { - ports+: [ - { name: 'grpc', port: 10901, targetPort: 10901 }, - ], - }, - }, - // Create a new service that exposes both sidecar's HTTP metrics port and gRPC StoreAPI - serviceThanosSidecar: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'prometheus-' + $._config.prometheus.name + '-thanos-sidecar', - namespace: $._config.namespace, - labels: { prometheus: $._config.prometheus.name, app: 'thanos-sidecar' }, - }, - spec: { - ports: [ - { name: 'grpc', port: 10901, targetPort: 10901 }, - { name: 'http', port: 10902, targetPort: 10902 }, - ], - selector: { app: 'prometheus', prometheus: $._config.prometheus.name }, - clusterIP: 'None', - }, - }, - prometheus+: { - spec+: { - thanos+: { - version: $._config.versions.thanos, - image: $._config.imageRepos.thanos + ':' + $._config.versions.thanos, - objectStorageConfig: $._config.thanos.objectStorageConfig, - }, - }, - }, - serviceMonitorThanosSidecar: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'thanos-sidecar', - namespace: $._config.namespace, - labels: { - 'k8s-app': 'prometheus', - }, - }, - spec: { - // Use the service's app label (thanos-sidecar) as the value for the job label. - jobLabel: 'app', - selector: { - matchLabels: { - prometheus: $._config.prometheus.name, - app: 'thanos-sidecar', - }, - }, - endpoints: [ - { - port: 'http', - interval: '30s', - }, - ], - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet deleted file mode 100644 index 19e7b934294977faa7f326b8854c27a255d94f82..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus-weave-net.libsonnet +++ /dev/null @@ -1,196 +0,0 @@ -{ - prometheus+: { - serviceWeaveNet: { - apiVersion: 'v1', - kind: 'Service', - metadata: { - name: 'weave-net', - namespace: 'kube-system', - labels: { 'k8s-app': 'weave-net' }, - }, - spec: { - ports: [ - { name: 'weave-net-metrics', targetPort: 6782, port: 6782 }, - ], - selector: { name: 'weave-net' }, - clusterIP: 'None', - }, - }, - serviceMonitorWeaveNet: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'weave-net', - labels: { - 'k8s-app': 'weave-net', - }, - namespace: 'monitoring', - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'weave-net-metrics', - path: '/metrics', - interval: '15s', - }, - ], - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - selector: { - matchLabels: { - 'k8s-app': 'weave-net', - }, - }, - }, - }, - }, - prometheusRules+: { - groups+: [ - { - name: 'weave-net', - rules: [ - { - alert: 'WeaveNetIPAMSplitBrain', - expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Percentage of all IP addresses owned by unreachable peers is not same for every node.', - description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMUnreachable', - expr: 'weave_ipam_unreachable_percentage > 25', - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMPendingAllocates', - expr: 'sum(weave_ipam_pending_allocates) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of pending allocates is above the threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetIPAMPendingClaims', - expr: 'sum(weave_ipam_pending_claims) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of pending claims is above the threshold.', - description: 'actionable: Please find the problem and fix it.', - }, - }, - { - alert: 'WeaveNetFastDPFlowsLow', - expr: 'sum(weave_flows) < 15000', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Number of FastDP flows is below the threshold.', - description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.', - }, - }, - { - alert: 'WeaveNetFastDPFlowsOff', - expr: 'sum(weave_flows == bool 0) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'FastDP flows is zero.', - description: 'actionable: Please find the reason for FastDP flows to be off and fix it.', - }, - }, - { - alert: 'WeaveNetHighConnectionTerminationRate', - expr: 'rate(weave_connection_terminations_total[5m]) > 0.1', - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are getting terminated.', - description: 'actionable: Please find the reason for the high connection termination rate and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsConnecting', - expr: 'sum(weave_connections{state="connecting"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in connecting state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsRetying', - expr: 'sum(weave_connections{state="retrying"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in retrying state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsPending', - expr: 'sum(weave_connections{state="pending"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in pending state.', - description: 'actionable: Please find the reason for this and fix it.', - }, - }, - { - alert: 'WeaveNetConnectionsFailed', - expr: 'sum(weave_connections{state="failed"}) > 0', - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'A lot of connections are in failed state.', - description: 'actionable: Please find the reason and fix it.', - }, - }, - ], - }, - ], - }, - grafanaDashboards+:: { - 'weave-net.json': (import './grafana-weave-net.json'), - 'weave-net-cluster.json': (import './grafana-weave-net-cluster.json'), - }, -} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet deleted file mode 100644 index 46deacc488056fd2da65b217d8af9cb2f27f4cf3..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ /dev/null @@ -1,203 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; -local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; -local configMapList = k3.core.v1.configMapList; -local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet'; - -(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + -(import './kube-state-metrics/kube-state-metrics.libsonnet') + -(import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics-mixin/mixin.libsonnet') + -(import './node-exporter/node-exporter.libsonnet') + -(import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + -(import './alertmanager/alertmanager.libsonnet') + -(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') + -(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') + -(import './prometheus/prometheus.libsonnet') + -(import './prometheus-adapter/prometheus-adapter.libsonnet') + -(import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + -(import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet') + -(import './alerts/alerts.libsonnet') + -(import './rules/rules.libsonnet') + { - kubePrometheus+:: { - namespace: k.core.v1.namespace.new($._config.namespace), - }, - prometheusOperator+:: { - service+: { - spec+: { - ports: [ - { - name: 'https', - port: 8443, - targetPort: 'https', - }, - ], - }, - }, - serviceMonitor+: { - spec+: { - endpoints: [ - { - port: 'https', - scheme: 'https', - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ] - }, - }, - clusterRole+: { - rules+: [ - { - apiGroups: ['authentication.k8s.io'], - resources: ['tokenreviews'], - verbs: ['create'], - }, - { - apiGroups: ['authorization.k8s.io'], - resources: ['subjectaccessreviews'], - verbs: ['create'], - }, - ], - }, - } + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy', - securePortName: 'https', - securePort: 8443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8080/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin, - - grafana+:: { - dashboardDefinitions: configMapList.new(super.dashboardDefinitions), - serviceMonitor: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'grafana', - namespace: $._config.namespace, - }, - spec: { - selector: { - matchLabels: { - app: 'grafana', - }, - }, - endpoints: [ - { - port: 'http', - interval: '15s', - }, - ], - }, - }, - }, -} + { - _config+:: { - namespace: 'default', - - versions+:: { - grafana: '7.1.0', - kubeRbacProxy: 'v0.8.0', - }, - - imageRepos+:: { - kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy', - }, - - tlsCipherSuites: [ - 'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 - 'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721 - - // 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - // 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - // 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2 - // 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566 - // 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661 - // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2 - // 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - // 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169 - - // disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go - - 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', - 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', - 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', - ], - - cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"', - kubeletSelector: 'job="kubelet", metrics_path="/metrics"', - kubeStateMetricsSelector: 'job="kube-state-metrics"', - nodeExporterSelector: 'job="node-exporter"', - fsSpaceFillingUpCriticalThreshold: 15, - notKubeDnsSelector: 'job!="kube-dns"', - kubeSchedulerSelector: 'job="kube-scheduler"', - kubeControllerManagerSelector: 'job="kube-controller-manager"', - kubeApiserverSelector: 'job="apiserver"', - coreDNSSelector: 'job="kube-dns"', - podLabel: 'pod', - - alertmanagerSelector: 'job="alertmanager-' + $._config.alertmanager.name + '",namespace="' + $._config.namespace + '"', - prometheusSelector: 'job="prometheus-' + $._config.prometheus.name + '",namespace="' + $._config.namespace + '"', - prometheusName: '{{$labels.namespace}}/{{$labels.pod}}', - prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + $._config.namespace + '"', - - jobs: { - Kubelet: $._config.kubeletSelector, - KubeScheduler: $._config.kubeSchedulerSelector, - KubeControllerManager: $._config.kubeControllerManagerSelector, - KubeAPI: $._config.kubeApiserverSelector, - KubeStateMetrics: $._config.kubeStateMetricsSelector, - NodeExporter: $._config.nodeExporterSelector, - Alertmanager: $._config.alertmanagerSelector, - Prometheus: $._config.prometheusSelector, - PrometheusOperator: $._config.prometheusOperatorSelector, - CoreDNS: $._config.coreDNSSelector, - }, - - resources+:: { - 'addon-resizer': { - requests: { cpu: '10m', memory: '30Mi' }, - limits: { cpu: '50m', memory: '30Mi' }, - }, - 'kube-rbac-proxy': { - requests: { cpu: '10m', memory: '20Mi' }, - limits: { cpu: '20m', memory: '40Mi' }, - }, - 'kube-state-metrics': { - requests: { cpu: '100m', memory: '150Mi' }, - limits: { cpu: '100m', memory: '150Mi' }, - }, - 'node-exporter': { - requests: { cpu: '102m', memory: '180Mi' }, - limits: { cpu: '250m', memory: '180Mi' }, - }, - }, - prometheus+:: { - rules: $.prometheusRules + $.prometheusAlerts, - }, - - grafana+:: { - dashboards: $.grafanaDashboards, - }, - - }, -} diff --git a/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet b/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet deleted file mode 100644 index fa85f0cf6130141c037c0bb919f6418e69ed0dd6..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-rbac-proxy/container.libsonnet +++ /dev/null @@ -1,91 +0,0 @@ -{ - local krp = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - - specMixin:: { - local sm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - spec+: { - template+: { - spec+: { - containers+: [{ - name: krp.config.kubeRbacProxy.name, - image: krp.config.kubeRbacProxy.image, - args: [ - '--logtostderr', - '--secure-listen-address=' + krp.config.kubeRbacProxy.secureListenAddress, - '--tls-cipher-suites=' + std.join(',', krp.config.kubeRbacProxy.tlsCipherSuites), - '--upstream=' + krp.config.kubeRbacProxy.upstream, - ], - ports: [ - { name: krp.config.kubeRbacProxy.securePortName, containerPort: krp.config.kubeRbacProxy.securePort }, - ], - securityContext: { - runAsUser: 65534, - }, - }], - }, - }, - }, - }, - - deploymentMixin:: { - local dm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - deployment+: krp.specMixin { - config+:: { - kubeRbacProxy+: dm.config.kubeRbacProxy, - }, - }, - }, - - statefulSetMixin:: { - local sm = self, - config+:: { - kubeRbacProxy: { - image: error 'must provide image', - name: error 'must provide name', - securePortName: error 'must provide securePortName', - securePort: error 'must provide securePort', - secureListenAddress: error 'must provide secureListenAddress', - upstream: error 'must provide upstream', - tlsCipherSuites: error 'must provide tlsCipherSuites', - }, - }, - statefulSet+: krp.specMixin { - config+:: { - kubeRbacProxy+: sm.config.kubeRbacProxy, - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet deleted file mode 100644 index 7fae2be24a0d22562553679ce8562867a939447f..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ /dev/null @@ -1,132 +0,0 @@ -local kubeRbacProxyContainer = import '../kube-rbac-proxy/container.libsonnet'; -local ksm = import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet'; - -{ - _config+:: { - versions+:: { - kubeStateMetrics: '1.9.7', - }, - imageRepos+:: { - kubeStateMetrics: 'quay.io/coreos/kube-state-metrics', - }, - kubeStateMetrics+:: { - scrapeInterval: '30s', - scrapeTimeout: '30s', - }, - }, - kubeStateMetrics+:: - ksm + { - local version = self.version, - name:: 'kube-state-metrics', - namespace:: $._config.namespace, - version:: $._config.versions.kubeStateMetrics, - image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics, - service+: { - spec+: { - ports: [ - { - name: 'https-main', - port: 8443, - targetPort: 'https-main', - }, - { - name: 'https-self', - port: 9443, - targetPort: 'https-self', - }, - ], - }, - }, - deployment+: { - spec+: { - template+: { - spec+: { - containers: std.map(function(c) c { - ports:: null, - livenessProbe:: null, - readinessProbe:: null, - args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], - }, super.containers), - }, - }, - }, - }, - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-state-metrics', - namespace: $._config.namespace, - labels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - 'app.kubernetes.io/version': version, - }, - }, - spec: { - jobLabel: 'app.kubernetes.io/name', - selector: { - matchLabels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - }, - }, - endpoints: [ - { - port: 'https-main', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - regex: '(pod|service|endpoint|namespace)', - action: 'labeldrop', - }, - ], - tlsConfig: { - insecureSkipVerify: true, - }, - }, - { - port: 'https-self', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - }, - }, - } + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-main', - securePortName: 'https-main', - securePort: 8443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8081/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin + - (kubeRbacProxyContainer { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-self', - securePortName: 'https-self', - securePort: 9443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8082/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin, -} diff --git a/jsonnet/kube-prometheus/lib/image.libsonnet b/jsonnet/kube-prometheus/lib/image.libsonnet deleted file mode 100644 index 0561e33c820c207d8a7dd1da2b31bdd9350afd14..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/lib/image.libsonnet +++ /dev/null @@ -1,21 +0,0 @@ -// imageName extracts the image name from a fully qualified image string. eg. -// quay.io/coreos/addon-resizer -> addon-resizer -// grafana/grafana -> grafana -local imageName(image) = - local parts = std.split(image, '/'); - local len = std.length(parts); - if len == 3 then - # registry.com/org/image - parts[2] - else if len == 2 then - # org/image - parts[1] - else if len == 1 then - # image, ie. busybox - parts[0] - else - error 'unknown image format: ' + image; - -{ - imageName:: imageName, -} diff --git a/jsonnet/kube-prometheus/lib/lib.libsonnet b/jsonnet/kube-prometheus/lib/lib.libsonnet deleted file mode 100644 index c30f976f15a4367a4361c72306aa579e5580f6fe..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/lib/lib.libsonnet +++ /dev/null @@ -1 +0,0 @@ -(import 'image.libsonnet') diff --git a/jsonnet/kube-prometheus/lib/mixin.libsonnet b/jsonnet/kube-prometheus/lib/mixin.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..4c0665eac492b2037dc6342dba0cb70c8abd555e --- /dev/null +++ b/jsonnet/kube-prometheus/lib/mixin.libsonnet @@ -0,0 +1,38 @@ +local defaults = { + name: error 'provide name', + namespace: 'monitoring', + labels: { + prometheus: 'k8s', + }, + mixin: error 'provide a mixin', +}; + +function(params) { + _config:: defaults + params, + + local m = self, + + local prometheusRules = if std.objectHasAll(m._config.mixin, 'prometheusRules') || std.objectHasAll(m._config.mixin, 'prometheusAlerts') then { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: m._config.labels, + name: m._config.name, + namespace: m._config.namespace, + }, + spec: { + local r = if std.objectHasAll(m._config.mixin, 'prometheusRules') then m._config.mixin.prometheusRules.groups else [], + local a = if std.objectHasAll(m._config.mixin, 'prometheusAlerts') then m._config.mixin.prometheusAlerts.groups else [], + groups: a + r, + }, + }, + + local grafanaDashboards = if std.objectHasAll(m._config.mixin, 'grafanaDashboards') then ( + if std.objectHas(m._config, 'dashboardFolder') then { + [m._config.dashboardFolder]+: m._config.mixin.grafanaDashboards, + } else (m._config.mixin.grafanaDashboards) + ), + + prometheusRules: prometheusRules, + grafanaDashboards: grafanaDashboards, +} diff --git a/jsonnet/kube-prometheus/lib/utils.libsonnet b/jsonnet/kube-prometheus/lib/utils.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..b5d29825bb756058c42707b487b9ca6b927cf6b4 --- /dev/null +++ b/jsonnet/kube-prometheus/lib/utils.libsonnet @@ -0,0 +1,7 @@ +{ + // rangeInterval takes a scrape interval and convert its to a range interval + // following Prometheus rule of thumb for rate() and irate(). + rangeInterval(i='1m'): + local interval = std.parseInt(std.substr(i, 0, std.length(i) - 1)); + interval * 4 + i[std.length(i) - 1], +} diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..877619b25c6895b847c77a8a1e4c56df4b76f88e --- /dev/null +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -0,0 +1,144 @@ +local alertmanager = import './components/alertmanager.libsonnet'; +local blackboxExporter = import './components/blackbox-exporter.libsonnet'; +local grafana = import './components/grafana.libsonnet'; +local kubernetesControlPlane = import './components/k8s-control-plane.libsonnet'; +local kubeStateMetrics = import './components/kube-state-metrics.libsonnet'; +local customMixin = import './components/mixin/custom.libsonnet'; +local nodeExporter = import './components/node-exporter.libsonnet'; +local prometheusAdapter = import './components/prometheus-adapter.libsonnet'; +local prometheusOperator = import './components/prometheus-operator.libsonnet'; +local prometheus = import './components/prometheus.libsonnet'; + +local platformPatch = import './platforms/platforms.libsonnet'; + +local utils = import './lib/utils.libsonnet'; + +{ + // using `values` as this is similar to helm + values:: { + common: { + namespace: 'default', + platform: null, + ruleLabels: { + role: 'alert-rules', + prometheus: $.values.prometheus.name, + }, + // to allow automatic upgrades of components, we store versions in autogenerated `versions.json` file and import it here + versions: { + alertmanager: error 'must provide version', + blackboxExporter: error 'must provide version', + grafana: error 'must provide version', + kubeStateMetrics: error 'must provide version', + nodeExporter: error 'must provide version', + prometheus: error 'must provide version', + prometheusAdapter: error 'must provide version', + prometheusOperator: error 'must provide version', + kubeRbacProxy: error 'must provide version', + configmapReload: error 'must provide version', + } + (import 'versions.json'), + images: { + alertmanager: 'quay.io/prometheus/alertmanager:v' + $.values.common.versions.alertmanager, + blackboxExporter: 'quay.io/prometheus/blackbox-exporter:v' + $.values.common.versions.blackboxExporter, + grafana: 'grafana/grafana:v' + $.values.common.versions.grafana, + kubeStateMetrics: 'k8s.gcr.io/kube-state-metrics/kube-state-metrics:v' + $.values.common.versions.kubeStateMetrics, + nodeExporter: 'quay.io/prometheus/node-exporter:v' + $.values.common.versions.nodeExporter, + prometheus: 'quay.io/prometheus/prometheus:v' + $.values.common.versions.prometheus, + prometheusAdapter: 'k8s.gcr.io/prometheus-adapter/prometheus-adapter:v' + $.values.common.versions.prometheusAdapter, + prometheusOperator: 'quay.io/prometheus-operator/prometheus-operator:v' + $.values.common.versions.prometheusOperator, + prometheusOperatorReloader: 'quay.io/prometheus-operator/prometheus-config-reloader:v' + $.values.common.versions.prometheusOperator, + kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v' + $.values.common.versions.kubeRbacProxy, + configmapReload: 'jimmidyson/configmap-reload:v' + $.values.common.versions.configmapReload, + }, + }, + alertmanager: { + name: 'main', + namespace: $.values.common.namespace, + version: $.values.common.versions.alertmanager, + image: $.values.common.images.alertmanager, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + blackboxExporter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.blackboxExporter, + image: $.values.common.images.blackboxExporter, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + configmapReloaderImage: $.values.common.images.configmapReload, + }, + grafana: { + namespace: $.values.common.namespace, + version: $.values.common.versions.grafana, + image: $.values.common.images.grafana, + prometheusName: $.values.prometheus.name, + // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards + dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesControlPlane.mixin.grafanaDashboards + $.alertmanager.mixin.grafanaDashboards, + }, + kubeStateMetrics: { + namespace: $.values.common.namespace, + version: $.values.common.versions.kubeStateMetrics, + image: $.values.common.images.kubeStateMetrics, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + nodeExporter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.nodeExporter, + image: $.values.common.images.nodeExporter, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + prometheus: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheus, + image: $.values.common.images.prometheus, + name: 'k8s', + alertmanagerName: $.values.alertmanager.name, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + prometheusAdapter: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheusAdapter, + image: $.values.common.images.prometheusAdapter, + prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.common.namespace + '.svc.cluster.local:9090/', + rangeIntervals+: { + kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval), + nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval), + }, + }, + prometheusOperator: { + namespace: $.values.common.namespace, + version: $.values.common.versions.prometheusOperator, + image: $.values.common.images.prometheusOperator, + configReloaderImage: $.values.common.images.prometheusOperatorReloader, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, + }, + kubernetesControlPlane: { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + }, + }, + + alertmanager: alertmanager($.values.alertmanager), + blackboxExporter: blackboxExporter($.values.blackboxExporter), + grafana: grafana($.values.grafana), + kubeStateMetrics: kubeStateMetrics($.values.kubeStateMetrics), + nodeExporter: nodeExporter($.values.nodeExporter), + prometheus: prometheus($.values.prometheus), + prometheusAdapter: prometheusAdapter($.values.prometheusAdapter), + prometheusOperator: prometheusOperator($.values.prometheusOperator), + kubernetesControlPlane: kubernetesControlPlane($.values.kubernetesControlPlane), + kubePrometheus: customMixin( + { + namespace: $.values.common.namespace, + mixin+: { ruleLabels: $.values.common.ruleLabels }, + } + ) + { + namespace: { + apiVersion: 'v1', + kind: 'Namespace', + metadata: { + name: $.values.common.namespace, + }, + }, + }, +} + platformPatch diff --git a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet deleted file mode 100644 index e0326b882a8ac6d4b1be72aa249c54167c7bd8d2..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ /dev/null @@ -1,208 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - nodeExporter: 'v1.0.1', - }, - - imageRepos+:: { - nodeExporter: 'quay.io/prometheus/node-exporter', - }, - - nodeExporter+:: { - listenAddress: '127.0.0.1', - port: 9100, - labels: { - 'app.kubernetes.io/name': 'node-exporter', - 'app.kubernetes.io/version': $._config.versions.nodeExporter, - }, - selectorLabels: { - [labelName]: $._config.nodeExporter.labels[labelName] - for labelName in std.objectFields($._config.nodeExporter.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - }, - }, - - nodeExporter+:: { - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('node-exporter') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('node-exporter') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter', namespace: $._config.namespace }]), - - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local authenticationRole = policyRule.new() + - policyRule.withApiGroups(['authentication.k8s.io']) + - policyRule.withResources([ - 'tokenreviews', - ]) + - policyRule.withVerbs(['create']); - - local authorizationRole = policyRule.new() + - policyRule.withApiGroups(['authorization.k8s.io']) + - policyRule.withResources([ - 'subjectaccessreviews', - ]) + - policyRule.withVerbs(['create']); - - local rules = [authenticationRole, authorizationRole]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('node-exporter') + - clusterRole.withRules(rules), - - daemonset: - local daemonset = k.apps.v1.daemonSet; - local container = daemonset.mixin.spec.template.spec.containersType; - local volume = daemonset.mixin.spec.template.spec.volumesType; - local containerPort = container.portsType; - local containerVolumeMount = container.volumeMountsType; - local podSelector = daemonset.mixin.spec.template.spec.selectorType; - local toleration = daemonset.mixin.spec.template.spec.tolerationsType; - local containerEnv = container.envType; - - local podLabels = $._config.nodeExporter.labels; - local selectorLabels = $._config.nodeExporter.selectorLabels; - - local existsToleration = toleration.new() + - toleration.withOperator('Exists'); - local procVolumeName = 'proc'; - local procVolume = volume.fromHostPath(procVolumeName, '/proc'); - local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local sysVolumeName = 'sys'; - local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); - local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local rootVolumeName = 'root'; - local rootVolume = volume.fromHostPath(rootVolumeName, '/'); - local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root'). - withMountPropagation('HostToContainer'). - withReadOnly(true); - - local nodeExporter = - container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) + - container.withArgs([ - '--web.listen-address=' + std.join(':', [$._config.nodeExporter.listenAddress, std.toString($._config.nodeExporter.port)]), - '--path.procfs=/host/proc', - '--path.sysfs=/host/sys', - '--path.rootfs=/host/root', - '--no-collector.wifi', - '--no-collector.hwmon', - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)', - ]) + - container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) + - container.mixin.resources.withRequests($._config.resources['node-exporter'].requests) + - container.mixin.resources.withLimits($._config.resources['node-exporter'].limits); - - local ip = containerEnv.fromFieldPath('IP', 'status.podIP'); - local proxy = - container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + - container.withArgs([ - '--logtostderr', - '--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port, - '--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites), - '--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/', - ]) + - // Keep `hostPort` here, rather than in the node-exporter container - // because Kubernetes mandates that if you define a `hostPort` then - // `containerPort` must match. In our case, we are splitting the - // host port and container port between the two containers. - // We'll keep the port specification here so that the named port - // used by the service is tied to the proxy container. We *could* - // forgo declaring the host port, however it is important to declare - // it so that the scheduler can decide if the pod is schedulable. - container.withPorts(containerPort.new($._config.nodeExporter.port) + containerPort.withHostPort($._config.nodeExporter.port) + containerPort.withName('https')) + - container.mixin.resources.withRequests($._config.resources['kube-rbac-proxy'].requests) + - container.mixin.resources.withLimits($._config.resources['kube-rbac-proxy'].limits) + - container.withEnv([ip]); - - local c = [nodeExporter, proxy]; - - daemonset.new() + - daemonset.mixin.metadata.withName('node-exporter') + - daemonset.mixin.metadata.withNamespace($._config.namespace) + - daemonset.mixin.metadata.withLabels(podLabels) + - daemonset.mixin.spec.selector.withMatchLabels(selectorLabels) + - daemonset.mixin.spec.updateStrategy.rollingUpdate.withMaxUnavailable('10%') + - daemonset.mixin.spec.template.metadata.withLabels(podLabels) + - daemonset.mixin.spec.template.spec.withTolerations([existsToleration]) + - daemonset.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - daemonset.mixin.spec.template.spec.withContainers(c) + - daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) + - daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) + - daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') + - daemonset.mixin.spec.template.spec.withHostPid(true) + - daemonset.mixin.spec.template.spec.withHostNetwork(true), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('node-exporter') + - serviceAccount.mixin.metadata.withNamespace($._config.namespace), - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'node-exporter', - namespace: $._config.namespace, - labels: $._config.nodeExporter.labels, - }, - spec: { - jobLabel: 'app.kubernetes.io/name', - selector: { - matchLabels: $._config.nodeExporter.selectorLabels, - }, - endpoints: [ - { - port: 'https', - scheme: 'https', - interval: '15s', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - action: 'replace', - regex: '(.*)', - replacement: '$1', - sourceLabels: ['__meta_kubernetes_pod_node_name'], - targetLabel: 'instance', - }, - ], - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - }, - }, - - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https'); - - service.new('node-exporter', $._config.nodeExporter.selectorLabels, nodeExporterPort) + - service.mixin.metadata.withNamespace($._config.namespace) + - service.mixin.metadata.withLabels($._config.nodeExporter.labels) + - service.mixin.spec.withClusterIp('None'), - }, -} diff --git a/jsonnet/kube-prometheus/platforms/README.md b/jsonnet/kube-prometheus/platforms/README.md new file mode 100644 index 0000000000000000000000000000000000000000..45eb76d17a7d091c710455b4d8364f3b4668f8b1 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/README.md @@ -0,0 +1,3 @@ +# Adding a new platform specific configuration + +Adding a new platform specific configuration requires to update the [README](../../../README.md#cluster-creation-tools) and the [platforms.jsonnet](./platform.jsonnet) file by adding the platform to the list of existing ones. This allow the new platform to be discoverable and easily configurable by the users. diff --git a/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet b/jsonnet/kube-prometheus/platforms/aws.libsonnet similarity index 72% rename from jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet rename to jsonnet/kube-prometheus/platforms/aws.libsonnet index ae8d364d2492cf6a05037430ff925187913d3b6b..27a61c2b87bc25e67c29bad4c4d88041a27d3fc3 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kube-aws.libsonnet +++ b/jsonnet/kube-prometheus/platforms/aws.libsonnet @@ -14,19 +14,19 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }], ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], ), }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet b/jsonnet/kube-prometheus/platforms/bootkube.libsonnet similarity index 70% rename from jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet rename to jsonnet/kube-prometheus/platforms/bootkube.libsonnet index 284544c12196e95179da8cef44329961bfe8421c..e4651ae92c14ab6df95ec37d0e906ec432187614 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-bootkube.libsonnet +++ b/jsonnet/kube-prometheus/platforms/bootkube.libsonnet @@ -14,28 +14,28 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+:: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }] ), kubeDnsPrometheusDiscoveryService: service( 'kube-dns-prometheus-discovery', - 'kube-syste', - { 'k8s-app': 'kube-dns' }, - { 'k8s-app': 'kube-dns' }, + 'kube-system', + { 'app.kubernetes.io/name': 'kube-dns' }, + { 'app.kubernetes.io/name': 'kube-dns' }, [{ name: 'http-metrics-skydns', port: 10055, targetPort: 10055 }, { name: 'http-metrics-dnsmasq', port: 10054, targetPort: 10054 }] ), }, diff --git a/jsonnet/kube-prometheus/platforms/eks.libsonnet b/jsonnet/kube-prometheus/platforms/eks.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..f46709f2faa24510eada52a1981bbb19a1fad493 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/eks.libsonnet @@ -0,0 +1,16 @@ +(import '../addons/aws-vpc-cni.libsonnet') + +(import '../addons/managed-cluster.libsonnet') + { + kubernetesControlPlane+: { + serviceMonitorCoreDNS+: { + spec+: { + endpoints: [ + { + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + interval: '15s', + targetPort: 9153, + }, + ], + }, + }, + }, +} diff --git a/jsonnet/kube-prometheus/platforms/gke.libsonnet b/jsonnet/kube-prometheus/platforms/gke.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..973eeffb86d4d05262b1ecfb5b8e81d2e372de98 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/gke.libsonnet @@ -0,0 +1,13 @@ +(import '../addons/managed-cluster.libsonnet') + { + values+:: { + prometheusAdapter+: { + config+: { + resourceRules:: null, + }, + }, + }, + + prometheusAdapter+:: { + apiService:: null, + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet b/jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet similarity index 65% rename from jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet rename to jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet index 7311c366d555bdae094899bc0ad1a03686008a8d..b9688173f19fa10291c9c89452c0310843d38ae3 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops-coredns.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kops-coredns.libsonnet @@ -1,19 +1,19 @@ { - prometheus+:: { + kubernetesControlPlane+: { kubeDnsPrometheusDiscoveryService: { apiVersion: 'v1', kind: 'Service', metadata: { name: 'kube-dns-prometheus-discovery', namespace: 'kube-system', - labels: { 'k8s-app': 'kube-dns' }, + labels: { 'app.kubernetes.io/name': 'kube-dns' }, }, spec: { ports: [ { name: 'metrics', port: 9153, targetPort: 9153 }, ], - selector: { 'k8s-app': 'kube-dns' }, - cluserAPI: 'None', + selector: { 'app.kubernetes.io/name': 'kube-dns' }, + clusterIP: 'None', }, }, }, diff --git a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet b/jsonnet/kube-prometheus/platforms/kops.libsonnet similarity index 70% rename from jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet rename to jsonnet/kube-prometheus/platforms/kops.libsonnet index 8db8c2990e139dae27b7e8d46f02ed875018c421..52eac362424ee00cecc337f4b11ffd41be001ac4 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kops.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kops.libsonnet @@ -14,25 +14,25 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+:: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-controller-manager' }, { 'k8s-app': 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', + 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-scheduler' }, { 'k8s-app': 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }] ), kubeDnsPrometheusDiscoveryService: service( - 'kube-controller-manager-prometheus-discovery', + 'kube-dns-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-dns' }, + { 'k8s-app': 'kube-controller-manager', 'app.kubernetes.io/name': 'kube-dns' }, { 'k8s-app': 'kube-dns' }, [{ name: 'metrics', port: 10055, targetPort: 10055 }, { name: 'http-metrics-dnsmasq', port: 10054, targetPort: 10054 }] ), diff --git a/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet b/jsonnet/kube-prometheus/platforms/kubeadm.libsonnet similarity index 74% rename from jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet rename to jsonnet/kube-prometheus/platforms/kubeadm.libsonnet index 0eccc939c2f3e5a6ef395cf6e40ed2bb0ec0f58d..dec785d9de9000d45687a84fe9ae936b16b664c3 100644 --- a/jsonnet/kube-prometheus/kube-prometheus-kubeadm.libsonnet +++ b/jsonnet/kube-prometheus/platforms/kubeadm.libsonnet @@ -14,19 +14,19 @@ local service(name, namespace, labels, selector, ports) = { }; { - prometheus+: { + kubernetesControlPlane+: { kubeControllerManagerPrometheusDiscoveryService: service( 'kube-controller-manager-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-controller-manager' }, - { 'k8s-app': 'kube-controller-manager' }, + { 'app.kubernetes.io/name': 'kube-controller-manager' }, + { component: 'kube-controller-manager' }, [{ name: 'https-metrics', port: 10257, targetPort: 10257 }] ), kubeSchedulerPrometheusDiscoveryService: service( 'kube-scheduler-prometheus-discovery', 'kube-system', - { 'k8s-app': 'kube-scheduler' }, - { 'k8s-app': 'kube-scheduler' }, + { 'app.kubernetes.io/name': 'kube-scheduler' }, + { component: 'kube-scheduler' }, [{ name: 'https-metrics', port: 10259, targetPort: 10259 }], ), }, diff --git a/jsonnet/kube-prometheus/platforms/kubespray.libsonnet b/jsonnet/kube-prometheus/platforms/kubespray.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..dabee25191cf430010ddd623f9719581ea90c173 --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/kubespray.libsonnet @@ -0,0 +1 @@ +(import './kubeadm.libsonnet') diff --git a/jsonnet/kube-prometheus/platforms/platforms.libsonnet b/jsonnet/kube-prometheus/platforms/platforms.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..a3978a6cad2d9afc259679325b04a02cf091764b --- /dev/null +++ b/jsonnet/kube-prometheus/platforms/platforms.libsonnet @@ -0,0 +1,41 @@ +local platforms = { + aws: import './aws.libsonnet', + bootkube: import './bootkube.libsonnet', + gke: import './gke.libsonnet', + eks: import './eks.libsonnet', + kops: import './kops.libsonnet', + kops_coredns: (import './kops.libsonnet') + (import './kops-coredns.libsonnet'), + kubeadm: import './kubeadm.libsonnet', + kubespray: import './kubespray.libsonnet', +}; + +// platformPatch returns the platform specific patch associated to the given +// platform. +local platformPatch(p) = if p != null && std.objectHas(platforms, p) then platforms[p] else {}; + +{ + // initialize the object to prevent "Indexed object has no field" lint errors + local p = { + alertmanager: {}, + blackboxExporter: {}, + grafana: {}, + kubeStateMetrics: {}, + nodeExporter: {}, + prometheus: {}, + prometheusAdapter: {}, + prometheusOperator: {}, + kubernetesControlPlane: {}, + kubePrometheus: {}, + } + platformPatch($.values.common.platform), + + alertmanager+: p.alertmanager, + blackboxExporter+: p.blackboxExporter, + grafana+: p.grafana, + kubeStateMetrics+: p.kubeStateMetrics, + nodeExporter+: p.nodeExporter, + prometheus+: p.prometheus, + prometheusAdapter+: p.prometheusAdapter, + prometheusOperator+: p.prometheusOperator, + kubernetesControlPlane+: p.kubernetesControlPlane, + kubePrometheus+: p.kubePrometheus, +} diff --git a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet b/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet deleted file mode 100644 index 2b4b3748f95dedf85f5e564bec3c353747520aaf..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/prometheus-adapter/prometheus-adapter.libsonnet +++ /dev/null @@ -1,262 +0,0 @@ -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - prometheusAdapter: 'v0.8.2', - }, - - imageRepos+:: { - prometheusAdapter: 'directxman12/k8s-prometheus-adapter', - }, - - prometheusAdapter+:: { - name: 'prometheus-adapter', - namespace: $._config.namespace, - labels: { name: $._config.prometheusAdapter.name }, - prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/', - config: { - resourceRules: { - cpu: { - containerQuery: 'sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)', - nodeQuery: 'sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)', - resources: { - overrides: { - node: { - resource: 'node' - }, - namespace: { - resource: 'namespace' - }, - pod: { - resource: 'pod' - }, - }, - }, - containerLabel: 'container' - }, - memory: { - containerQuery: 'sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)', - nodeQuery: 'sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)', - resources: { - overrides: { - instance: { - resource: 'node' - }, - namespace: { - resource: 'namespace' - }, - pod: { - resource: 'pod' - }, - }, - }, - containerLabel: 'container' - }, - window: '5m', - }, - } - }, - }, - - prometheusAdapter+:: { - apiService: - { - apiVersion: 'apiregistration.k8s.io/v1', - kind: 'APIService', - metadata: { - name: 'v1beta1.metrics.k8s.io', - }, - spec: { - service: { - name: $.prometheusAdapter.service.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }, - group: 'metrics.k8s.io', - version: 'v1beta1', - insecureSkipTLSVerify: true, - groupPriorityMinimum: 100, - versionPriority: 100, - }, - }, - - configMap: - local configmap = k.core.v1.configMap; - configmap.new('adapter-config', { 'config.yaml': std.manifestYamlDoc($._config.prometheusAdapter.config) }) + - - configmap.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace), - - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: $._config.prometheusAdapter.name, - namespace: $._config.prometheusAdapter.namespace, - labels: $._config.prometheusAdapter.labels, - }, - spec: { - selector: { - matchLabels: $._config.prometheusAdapter.labels, - }, - endpoints: [ - { - port: 'https', - interval: '30s', - scheme: 'https', - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - }, - ], - }, - }, - - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - service.new( - $._config.prometheusAdapter.name, - $._config.prometheusAdapter.labels, - servicePort.newNamed('https', 443, 6443), - ) + - service.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace) + - service.mixin.metadata.withLabels($._config.prometheusAdapter.labels), - - deployment: - local deployment = k.apps.v1.deployment; - local volume = deployment.mixin.spec.template.spec.volumesType; - local container = deployment.mixin.spec.template.spec.containersType; - local containerVolumeMount = container.volumeMountsType; - - local c = - container.new($._config.prometheusAdapter.name, $._config.imageRepos.prometheusAdapter + ':' + $._config.versions.prometheusAdapter) + - container.withArgs([ - '--cert-dir=/var/run/serving-cert', - '--config=/etc/adapter/config.yaml', - '--logtostderr=true', - '--metrics-relist-interval=1m', - '--prometheus-url=' + $._config.prometheusAdapter.prometheusURL, - '--secure-port=6443', - ]) + - container.withPorts([{ containerPort: 6443 }]) + - container.withVolumeMounts([ - containerVolumeMount.new('tmpfs', '/tmp'), - containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'), - containerVolumeMount.new('config', '/etc/adapter'), - ],); - - deployment.new($._config.prometheusAdapter.name, 1, c, $._config.prometheusAdapter.labels) + - deployment.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace) + - deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) + - deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) + - deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) + - deployment.mixin.spec.template.spec.withVolumes([ - volume.fromEmptyDir(name='tmpfs'), - volume.fromEmptyDir(name='volume-serving-cert'), - { name: 'config', configMap: { name: 'adapter-config' } }, - ]), - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new($._config.prometheusAdapter.name) + - serviceAccount.mixin.metadata.withNamespace($._config.prometheusAdapter.namespace), - - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources(['nodes', 'namespaces', 'pods', 'services']) + - policyRule.withVerbs(['get', 'list', 'watch']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName($._config.prometheusAdapter.name) + - clusterRole.withRules(rules), - - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - - clusterRoleBindingDelegator: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('resource-metrics:system:auth-delegator') + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('system:auth-delegator') + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - - clusterRoleServerResources: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['metrics.k8s.io']) + - policyRule.withResources(['*']) + - policyRule.withVerbs(['*']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName('resource-metrics-server-resources') + - clusterRole.withRules(rules), - - clusterRoleAggregatedMetricsReader: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local rules = - policyRule.new() + - policyRule.withApiGroups(['metrics.k8s.io']) + - policyRule.withResources(['pods', 'nodes']) + - policyRule.withVerbs(['get','list','watch']); - - clusterRole.new() + - clusterRole.mixin.metadata.withName('system:aggregated-metrics-reader') + - clusterRole.mixin.metadata.withLabels({ - "rbac.authorization.k8s.io/aggregate-to-admin": "true", - "rbac.authorization.k8s.io/aggregate-to-edit": "true", - "rbac.authorization.k8s.io/aggregate-to-view": "true", - }) + - clusterRole.withRules(rules), - - roleBindingAuthReader: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('resource-metrics-auth-reader') + - roleBinding.mixin.metadata.withNamespace('kube-system') + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('extension-apiserver-authentication-reader') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ - kind: 'ServiceAccount', - name: $.prometheusAdapter.serviceAccount.metadata.name, - namespace: $._config.prometheusAdapter.namespace, - }]), - }, -} diff --git a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet deleted file mode 100644 index 9ffd5f1fe214770e2159ae38c3f76294bb6aec72..0000000000000000000000000000000000000000 --- a/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ /dev/null @@ -1,502 +0,0 @@ -local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; -local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; - -{ - _config+:: { - namespace: 'default', - - versions+:: { - prometheus: 'v2.22.1', - }, - - imageRepos+:: { - prometheus: 'quay.io/prometheus/prometheus', - }, - - alertmanager+:: { - name: 'main', - }, - - prometheus+:: { - name: 'k8s', - replicas: 2, - rules: {}, - namespaces: ['default', 'kube-system', $._config.namespace], - }, - }, - - prometheus+:: { - local p = self, - - name:: $._config.prometheus.name, - namespace:: $._config.namespace, - roleBindingNamespaces:: $._config.prometheus.namespaces, - replicas:: $._config.prometheus.replicas, - prometheusRules:: $._config.prometheus.rules, - alertmanagerName:: $.alertmanager.service.metadata.name, - - serviceAccount: - local serviceAccount = k.core.v1.serviceAccount; - - serviceAccount.new('prometheus-' + p.name) + - serviceAccount.mixin.metadata.withNamespace(p.namespace), - service: - local service = k.core.v1.service; - local servicePort = k.core.v1.service.mixin.spec.portsType; - - local prometheusPort = servicePort.newNamed('web', 9090, 'web'); - - service.new('prometheus-' + p.name, { app: 'prometheus', prometheus: p.name }, prometheusPort) + - service.mixin.spec.withSessionAffinity('ClientIP') + - service.mixin.metadata.withNamespace(p.namespace) + - service.mixin.metadata.withLabels({ prometheus: p.name }), - - rules: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'PrometheusRule', - metadata: { - labels: { - prometheus: p.name, - role: 'alert-rules', - }, - name: 'prometheus-' + p.name + '-rules', - namespace: p.namespace, - }, - spec: { - groups: p.prometheusRules.groups, - }, - }, - - roleBindingSpecificNamespaces: - local roleBinding = k.rbac.v1.roleBinding; - - local newSpecificRoleBinding(namespace) = - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + p.name) + - roleBinding.mixin.metadata.withNamespace(namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + p.name) + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]); - - local roleBindingList = k3.rbac.v1.roleBindingList; - roleBindingList.new([newSpecificRoleBinding(x) for x in p.roleBindingNamespaces]), - clusterRole: - local clusterRole = k.rbac.v1.clusterRole; - local policyRule = clusterRole.rulesType; - - local nodeMetricsRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources(['nodes/metrics']) + - policyRule.withVerbs(['get']); - - local metricsRule = policyRule.new() + - policyRule.withNonResourceUrls('/metrics') + - policyRule.withVerbs(['get']); - - local rules = [nodeMetricsRule, metricsRule]; - - clusterRole.new() + - clusterRole.mixin.metadata.withName('prometheus-' + p.name) + - clusterRole.withRules(rules), - roleConfig: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - - local configmapRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'configmaps', - ]) + - policyRule.withVerbs(['get']); - - role.new() + - role.mixin.metadata.withName('prometheus-' + p.name + '-config') + - role.mixin.metadata.withNamespace(p.namespace) + - role.withRules(configmapRule), - roleBindingConfig: - local roleBinding = k.rbac.v1.roleBinding; - - roleBinding.new() + - roleBinding.mixin.metadata.withName('prometheus-' + p.name + '-config') + - roleBinding.mixin.metadata.withNamespace(p.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withName('prometheus-' + p.name + '-config') + - roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) + - roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]), - clusterRoleBinding: - local clusterRoleBinding = k.rbac.v1.clusterRoleBinding; - - clusterRoleBinding.new() + - clusterRoleBinding.mixin.metadata.withName('prometheus-' + p.name) + - clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - clusterRoleBinding.mixin.roleRef.withName('prometheus-' + p.name) + - clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) + - clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]), - roleSpecificNamespaces: - local role = k.rbac.v1.role; - local policyRule = role.rulesType; - local coreRule = policyRule.new() + - policyRule.withApiGroups(['']) + - policyRule.withResources([ - 'services', - 'endpoints', - 'pods', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - local ingressRule = policyRule.new() + - policyRule.withApiGroups(['extensions']) + - policyRule.withResources([ - 'ingresses', - ]) + - policyRule.withVerbs(['get', 'list', 'watch']); - - local newSpecificRole(namespace) = - role.new() + - role.mixin.metadata.withName('prometheus-' + p.name) + - role.mixin.metadata.withNamespace(namespace) + - role.withRules([coreRule, ingressRule]); - - local roleList = k3.rbac.v1.roleList; - roleList.new([newSpecificRole(x) for x in p.roleBindingNamespaces]), - prometheus: - local statefulSet = k.apps.v1.statefulSet; - local container = statefulSet.mixin.spec.template.spec.containersType; - local resourceRequirements = container.mixin.resourcesType; - local selector = statefulSet.mixin.spec.selectorType; - - - local resources = - resourceRequirements.new() + - resourceRequirements.withRequests({ memory: '400Mi' }); - - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'Prometheus', - metadata: { - name: p.name, - namespace: p.namespace, - labels: { - prometheus: p.name, - }, - }, - spec: { - replicas: p.replicas, - version: $._config.versions.prometheus, - image: $._config.imageRepos.prometheus + ':' + $._config.versions.prometheus, - serviceAccountName: 'prometheus-' + p.name, - serviceMonitorSelector: {}, - podMonitorSelector: {}, - probeSelector: {}, - serviceMonitorNamespaceSelector: {}, - podMonitorNamespaceSelector: {}, - probeNamespaceSelector: {}, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - ruleSelector: selector.withMatchLabels({ - role: 'alert-rules', - prometheus: p.name, - }), - resources: resources, - alerting: { - alertmanagers: [ - { - namespace: p.namespace, - name: p.alertmanagerName, - port: 'web', - }, - ], - }, - securityContext: { - runAsUser: 1000, - runAsNonRoot: true, - fsGroup: 2000, - }, - }, - }, - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'prometheus', - namespace: p.namespace, - labels: { - 'k8s-app': 'prometheus', - }, - }, - spec: { - selector: { - matchLabels: { - prometheus: p.name, - }, - }, - endpoints: [ - { - port: 'web', - interval: '30s', - }, - ], - }, - }, - serviceMonitorKubeScheduler: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-scheduler', - namespace: p.namespace, - labels: { - 'k8s-app': 'kube-scheduler', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - interval: '30s', - scheme: 'https', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kube-scheduler', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorKubelet: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kubelet', - namespace: p.namespace, - labels: { - 'k8s-app': 'kubelet', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - scheme: 'https', - interval: '30s', - honorLabels: true, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'), - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - }, - { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/cadvisor', - interval: '30s', - honorLabels: true, - honorTimestamps: false, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - metricRelabelings: [ - // Drop a bunch of metrics which are disabled but still sent, see - // https://github.com/google/cadvisor/issues/1925. - { - sourceLabels: ['__name__'], - regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', - action: 'drop', - }, - ], - }, - { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/probes', - interval: '30s', - honorLabels: true, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }, - ], - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kubelet', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorKubeControllerManager: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-controller-manager', - namespace: p.namespace, - labels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - spec: { - jobLabel: 'k8s-app', - endpoints: [ - { - port: 'https-metrics', - interval: '30s', - scheme: 'https', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [ - { - sourceLabels: ['__name__'], - regex: 'etcd_(debugging|disk|request|server).*', - action: 'drop', - }, - ], - }, - ], - selector: { - matchLabels: { - 'k8s-app': 'kube-controller-manager', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - }, - }, - serviceMonitorApiserver: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-apiserver', - namespace: p.namespace, - labels: { - 'k8s-app': 'apiserver', - }, - }, - spec: { - jobLabel: 'component', - selector: { - matchLabels: { - component: 'apiserver', - provider: 'kubernetes', - }, - }, - namespaceSelector: { - matchNames: [ - 'default', - ], - }, - endpoints: [ - { - port: 'https', - interval: '30s', - scheme: 'https', - tlsConfig: { - caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt', - serverName: 'kubernetes', - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [ - { - sourceLabels: ['__name__'], - regex: 'etcd_(debugging|disk|server).*', - action: 'drop', - }, - { - sourceLabels: ['__name__'], - regex: 'apiserver_admission_controller_admission_latencies_seconds_.*', - action: 'drop', - }, - { - sourceLabels: ['__name__'], - regex: 'apiserver_admission_step_admission_latencies_seconds_.*', - action: 'drop', - }, - { - sourceLabels: ['__name__', 'le'], - regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)', - action: 'drop', - }, - ], - }, - ], - }, - }, - serviceMonitorCoreDNS: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'coredns', - namespace: p.namespace, - labels: { - 'k8s-app': 'coredns', - }, - }, - spec: { - jobLabel: 'k8s-app', - selector: { - matchLabels: { - 'k8s-app': 'kube-dns', - }, - }, - namespaceSelector: { - matchNames: [ - 'kube-system', - ], - }, - endpoints: [ - { - port: 'metrics', - interval: '15s', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - }, - ], - }, - }, - }, -} diff --git a/jsonnet/kube-prometheus/versions.json b/jsonnet/kube-prometheus/versions.json new file mode 100644 index 0000000000000000000000000000000000000000..d27a9628bb39cc0b4d26ef6e55d5e1bada711324 --- /dev/null +++ b/jsonnet/kube-prometheus/versions.json @@ -0,0 +1,12 @@ +{ + "alertmanager": "0.23.0", + "blackboxExporter": "0.19.0", + "grafana": "8.1.3", + "kubeStateMetrics": "2.2.0", + "nodeExporter": "1.2.2", + "prometheus": "2.29.2", + "prometheusAdapter": "0.9.0", + "prometheusOperator": "0.50.0", + "kubeRbacProxy": "0.11.0", + "configmapReload": "0.5.0" +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 883c61e7370e44ffa48104679d2386f71033621b..9880c65456c150dc007158807d595f2b7dcc0c39 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,18 +8,18 @@ "subdir": "grafana" } }, - "version": "02ac326459f46d6f30a766ce2f9a45337a745db0", - "sum": "r7kj5f5w7aVB7vO++dI9vbHhzoW8PpyVLSA7gOiouZ0=" + "version": "c3b14b24b83cfe9abf1064649d19e2d679f033fb", + "sum": "YrE4DNQsWgYWs6h0j/FjQETt8xDXdYdsslb1WK7xQEk=" }, { "source": { "git": { "remote": "https://github.com/etcd-io/etcd.git", - "subdir": "Documentation/etcd-mixin" + "subdir": "contrib/mixin" } }, - "version": "7da5182f1d02c0baaefd52f361fd0459d5b6703e", - "sum": "L+PGlPK9mykGCJ9TIoEWdhMBjz+9lKuQ4YZ8fOeP9sk=" + "version": "c2937d78d2722d774f69dbf91a956f382d32f4d3", + "sum": "5XhYOigrKipOWDbIn9hlrz7JcbelzvJnormxSaup9JI=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "grafonnet" } }, - "version": "356bd73e4792ffe107725776ca8946895969c191", - "sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU=" + "version": "05fb200ee1a1816fc1b4c522071d5606d8dd71c1", + "sum": "mEoObbqbyVaXrHFEJSM2Nad31tOvadzIevWuyNHHBgI=" }, { "source": { @@ -38,19 +38,8 @@ "subdir": "grafana-builder" } }, - "version": "fe3e027c5a0d8311e1bd6cd9de2c295707c3ae76", - "sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc=" - }, - { - "source": { - "git": { - "remote": "https://github.com/ksonnet/ksonnet-lib.git", - "subdir": "" - } - }, - "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f", - "sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=", - "name": "ksonnet" + "version": "746874e4836a4bfbb7034d32de0c98ab1282aaae", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" }, { "source": { @@ -59,8 +48,8 @@ "subdir": "" } }, - "version": "8a98e9c6fab000ef090b8d313292043696a8b3bb", - "sum": "btFPZfE2paWZdvLtFwv4gfDoygj1axt7Q4ACGSdeuJ8=" + "version": "2b27a09a667091cef74776b690ccceaf55995e29", + "sum": "j2jPdrcM3iuaUK+6V9jWn2M3Fapr0KtI8FZ1KQoHIGA=" }, { "source": { @@ -69,7 +58,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "8a98e9c6fab000ef090b8d313292043696a8b3bb", + "version": "2b27a09a667091cef74776b690ccceaf55995e29", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -79,8 +68,8 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "89aaf6c524ee891140c4c8f2a05b1b16f5847309", - "sum": "zD/pbQLnQq+5hegEelaheHS8mn1h09GTktFO74iwlBI=" + "version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734", + "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE=" }, { "source": { @@ -89,8 +78,8 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "a4867d8809ba60a59013034646d0a4bc89576b9c", - "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" + "version": "d111b6d8e07f8dde1dfe7e688f44242e4aa4f734", + "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" }, { "source": { @@ -99,8 +88,9 @@ "subdir": "jsonnet/mixin" } }, - "version": "7134bc321bda7e69db0aacc9a41949167de7a56f", - "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=" + "version": "2c81b0cf6a5673e08057499a08ddce396b19dda4", + "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=", + "name": "prometheus-operator-mixin" }, { "source": { @@ -109,8 +99,19 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "b86ab77239f2a11ee69ad05b24122958d8b2df5b", - "sum": "Zof470kQY377VxlEH5MQJUSbtViNEdLyLPv/P7fX8QQ=" + "version": "2c81b0cf6a5673e08057499a08ddce396b19dda4", + "sum": "WUuFzKqxzxmTWLeic/IU1SMjdCV/zClt11MHucJ9MSc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager.git", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "44011410d7065487789c447ce55157ae6e0b917d", + "sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=", + "name": "alertmanager" }, { "source": { @@ -119,8 +120,8 @@ "subdir": "docs/node-mixin" } }, - "version": "f645d4924224f1f3abab7b20798ca8e24957724c", - "sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8=" + "version": "dc68e035a5b37a9a3b47e1547f07d96df29ba575", + "sum": "OFNs9Te1QMqSscXqNqMv0zwaJoJxaEg7NyQVNyT4VeA=" }, { "source": { @@ -129,10 +130,21 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "00f16d1ac3a4c94561e5133b821d8e4d9ef78ec2", - "sum": "CGxvaHkP7z/gnsLB/8Imvt/AnW+9nJUnTcL+fvIAZUs=", + "version": "46286cb6abfff961e8c257de091443e835ec444f", + "sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=", "name": "prometheus" }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos.git", + "subdir": "mixin" + } + }, + "version": "2dd8c22e8c15f5ec0daaa07ae20be44bed419aa5", + "sum": "X+060DnePPeN/87fgj0SrfxVitywTk8hZA9V4nHxl1g=", + "name": "thanos-mixin" + }, { "source": { "local": { diff --git a/kustomization.yaml b/kustomization.yaml index b067b22f5d0aada60a2eab82bf2baa581d408c6f..ffea5b17a1288716776944f7203c7a5e01ad20e5 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -2,10 +2,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ./manifests/alertmanager-alertmanager.yaml +- ./manifests/alertmanager-podDisruptionBudget.yaml +- ./manifests/alertmanager-prometheusRule.yaml - ./manifests/alertmanager-secret.yaml - ./manifests/alertmanager-service.yaml - ./manifests/alertmanager-serviceAccount.yaml - ./manifests/alertmanager-serviceMonitor.yaml +- ./manifests/blackbox-exporter-clusterRole.yaml +- ./manifests/blackbox-exporter-clusterRoleBinding.yaml +- ./manifests/blackbox-exporter-configuration.yaml +- ./manifests/blackbox-exporter-deployment.yaml +- ./manifests/blackbox-exporter-service.yaml +- ./manifests/blackbox-exporter-serviceAccount.yaml +- ./manifests/blackbox-exporter-serviceMonitor.yaml +- ./manifests/grafana-config.yaml - ./manifests/grafana-dashboardDatasources.yaml - ./manifests/grafana-dashboardDefinitions.yaml - ./manifests/grafana-dashboardSources.yaml @@ -13,15 +23,24 @@ resources: - ./manifests/grafana-service.yaml - ./manifests/grafana-serviceAccount.yaml - ./manifests/grafana-serviceMonitor.yaml +- ./manifests/kube-prometheus-prometheusRule.yaml - ./manifests/kube-state-metrics-clusterRole.yaml - ./manifests/kube-state-metrics-clusterRoleBinding.yaml - ./manifests/kube-state-metrics-deployment.yaml +- ./manifests/kube-state-metrics-prometheusRule.yaml - ./manifests/kube-state-metrics-service.yaml - ./manifests/kube-state-metrics-serviceAccount.yaml - ./manifests/kube-state-metrics-serviceMonitor.yaml +- ./manifests/kubernetes-prometheusRule.yaml +- ./manifests/kubernetes-serviceMonitorApiserver.yaml +- ./manifests/kubernetes-serviceMonitorCoreDNS.yaml +- ./manifests/kubernetes-serviceMonitorKubeControllerManager.yaml +- ./manifests/kubernetes-serviceMonitorKubeScheduler.yaml +- ./manifests/kubernetes-serviceMonitorKubelet.yaml - ./manifests/node-exporter-clusterRole.yaml - ./manifests/node-exporter-clusterRoleBinding.yaml - ./manifests/node-exporter-daemonset.yaml +- ./manifests/node-exporter-prometheusRule.yaml - ./manifests/node-exporter-service.yaml - ./manifests/node-exporter-serviceAccount.yaml - ./manifests/node-exporter-serviceMonitor.yaml @@ -33,27 +52,25 @@ resources: - ./manifests/prometheus-adapter-clusterRoleServerResources.yaml - ./manifests/prometheus-adapter-configMap.yaml - ./manifests/prometheus-adapter-deployment.yaml +- ./manifests/prometheus-adapter-podDisruptionBudget.yaml - ./manifests/prometheus-adapter-roleBindingAuthReader.yaml - ./manifests/prometheus-adapter-service.yaml - ./manifests/prometheus-adapter-serviceAccount.yaml - ./manifests/prometheus-adapter-serviceMonitor.yaml - ./manifests/prometheus-clusterRole.yaml - ./manifests/prometheus-clusterRoleBinding.yaml +- ./manifests/prometheus-operator-prometheusRule.yaml - ./manifests/prometheus-operator-serviceMonitor.yaml +- ./manifests/prometheus-podDisruptionBudget.yaml - ./manifests/prometheus-prometheus.yaml +- ./manifests/prometheus-prometheusRule.yaml - ./manifests/prometheus-roleBindingConfig.yaml - ./manifests/prometheus-roleBindingSpecificNamespaces.yaml - ./manifests/prometheus-roleConfig.yaml - ./manifests/prometheus-roleSpecificNamespaces.yaml -- ./manifests/prometheus-rules.yaml - ./manifests/prometheus-service.yaml - ./manifests/prometheus-serviceAccount.yaml - ./manifests/prometheus-serviceMonitor.yaml -- ./manifests/prometheus-serviceMonitorApiserver.yaml -- ./manifests/prometheus-serviceMonitorCoreDNS.yaml -- ./manifests/prometheus-serviceMonitorKubeControllerManager.yaml -- ./manifests/prometheus-serviceMonitorKubeScheduler.yaml -- ./manifests/prometheus-serviceMonitorKubelet.yaml - ./manifests/setup/0namespace-namespace.yaml - ./manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml - ./manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index 55b353a8f12b9302a2292b0db66832e5f841bcbb..42bed1e15154c071d092a01f0caff1b853629e2d 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -3,16 +3,33 @@ kind: Alertmanager metadata: labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: main namespace: monitoring spec: - image: quay.io/prometheus/alertmanager:v0.21.0 + image: quay.io/prometheus/alertmanager:v0.23.0 nodeSelector: kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 replicas: 3 + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 4m + memory: 100Mi securityContext: fsGroup: 2000 runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.21.0 + version: 0.23.0 diff --git a/manifests/alertmanager-podDisruptionBudget.yaml b/manifests/alertmanager-podDisruptionBudget.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b55b7cdb9c097c93bea53e4bfc27117d6c4c9aaa --- /dev/null +++ b/manifests/alertmanager-podDisruptionBudget.yaml @@ -0,0 +1,18 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 + name: alertmanager-main + namespace: monitoring +spec: + maxUnavailable: 1 + selector: + matchLabels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/alertmanager-prometheusRule.yaml b/manifests/alertmanager-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f749c998d08b639d89bf2ca25d6107e3d581cb3 --- /dev/null +++ b/manifests/alertmanager-prometheusRule.yaml @@ -0,0 +1,138 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 + prometheus: k8s + role: alert-rules + name: alertmanager-main-rules + namespace: monitoring +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 20c205fb1f4f27ec92895fc973baedd8ba3cd115..f265e096cb4caa90dfd9a3e13762e529e5625bf9 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,12 @@ apiVersion: v1 kind: Secret metadata: + labels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring stringData: diff --git a/manifests/alertmanager-service.yaml b/manifests/alertmanager-service.yaml index df4c9ff5d490296d14511caa63fe89cfa3f3c927..f3f6cf719cf900dbbc2affd1a4db6ea6b7c6654d 100644 --- a/manifests/alertmanager-service.yaml +++ b/manifests/alertmanager-service.yaml @@ -3,6 +3,10 @@ kind: Service metadata: labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring spec: @@ -13,4 +17,7 @@ spec: selector: alertmanager: main app: alertmanager + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus sessionAffinity: ClientIP diff --git a/manifests/alertmanager-serviceAccount.yaml b/manifests/alertmanager-serviceAccount.yaml index 5c06d5e40cb4b3d99ad5388af1bb401458b63213..ba806b5042a6c97786047c49475864847e39cfe3 100644 --- a/manifests/alertmanager-serviceAccount.yaml +++ b/manifests/alertmanager-serviceAccount.yaml @@ -1,5 +1,11 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager-main namespace: monitoring diff --git a/manifests/alertmanager-serviceMonitor.yaml b/manifests/alertmanager-serviceMonitor.yaml index 548af0d6dde9646f2add57c915d9fe25524971aa..070ef530252d52daf58a62eda46bb1a8116363a8 100644 --- a/manifests/alertmanager-serviceMonitor.yaml +++ b/manifests/alertmanager-serviceMonitor.yaml @@ -2,7 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: alertmanager + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.23.0 name: alertmanager namespace: monitoring spec: @@ -12,3 +15,6 @@ spec: selector: matchLabels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/blackbox-exporter-clusterRole.yaml b/manifests/blackbox-exporter-clusterRole.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7824058e1b24902c6eaf01dd8b48e2bb213a523 --- /dev/null +++ b/manifests/blackbox-exporter-clusterRole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: blackbox-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/manifests/blackbox-exporter-clusterRoleBinding.yaml b/manifests/blackbox-exporter-clusterRoleBinding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b3ae320903f9916cd2ed4191139142db3eb1558 --- /dev/null +++ b/manifests/blackbox-exporter-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: blackbox-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: blackbox-exporter +subjects: +- kind: ServiceAccount + name: blackbox-exporter + namespace: monitoring diff --git a/manifests/blackbox-exporter-configuration.yaml b/manifests/blackbox-exporter-configuration.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35bfad12fa2c07bac8849e3e29b4b7caa8bc0c64 --- /dev/null +++ b/manifests/blackbox-exporter-configuration.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +data: + config.yml: |- + "modules": + "http_2xx": + "http": + "preferred_ip_protocol": "ip4" + "prober": "http" + "http_post_2xx": + "http": + "method": "POST" + "preferred_ip_protocol": "ip4" + "prober": "http" + "irc_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "send": "NICK prober" + - "send": "USER prober prober prober :prober" + - "expect": "PING :([^ ]+)" + "send": "PONG ${1}" + - "expect": "^:[^ ]+ 001" + "pop3s_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^+OK" + "tls": true + "tls_config": + "insecure_skip_verify": false + "ssh_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^SSH-2.0-" + "tcp_connect": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter-configuration + namespace: monitoring diff --git a/manifests/blackbox-exporter-deployment.yaml b/manifests/blackbox-exporter-deployment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e47166bdb4e6c7ab47c260199e964192949aa714 --- /dev/null +++ b/manifests/blackbox-exporter-deployment.yaml @@ -0,0 +1,99 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: blackbox-exporter + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + spec: + containers: + - args: + - --config.file=/etc/blackbox_exporter/config.yml + - --web.listen-address=:19115 + image: quay.io/prometheus/blackbox-exporter:v0.19.0 + name: blackbox-exporter + ports: + - containerPort: 19115 + name: http + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --webhook-url=http://localhost:19115/-/reload + - --volume-dir=/etc/blackbox_exporter/ + image: jimmidyson/configmap-reload:v0.5.0 + name: module-configmap-reloader + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --logtostderr + - --secure-listen-address=:9115 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:19115/ + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 + name: kube-rbac-proxy + ports: + - containerPort: 9115 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: blackbox-exporter + volumes: + - configMap: + name: blackbox-exporter-configuration + name: config diff --git a/manifests/blackbox-exporter-service.yaml b/manifests/blackbox-exporter-service.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58ffb3941a0549b6a9f73395bcf6c38eb30c1d5c --- /dev/null +++ b/manifests/blackbox-exporter-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + ports: + - name: https + port: 9115 + targetPort: https + - name: probe + port: 19115 + targetPort: http + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/blackbox-exporter-serviceAccount.yaml b/manifests/blackbox-exporter-serviceAccount.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac2acefb2e3aa946f502a84628d47cb64217721a --- /dev/null +++ b/manifests/blackbox-exporter-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: blackbox-exporter + namespace: monitoring diff --git a/manifests/blackbox-exporter-serviceMonitor.yaml b/manifests/blackbox-exporter-serviceMonitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7c05825dedc3d62b1c1d85b0be34663380116ea --- /dev/null +++ b/manifests/blackbox-exporter-serviceMonitor.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.19.0 + name: blackbox-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + path: /metrics + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/grafana-config.yaml b/manifests/grafana-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eeece25e07b063e951f3b2a5ed692415cd56db11 --- /dev/null +++ b/manifests/grafana-config.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-config + namespace: monitoring +stringData: + grafana.ini: | + [date_formats] + default_timezone = UTC +type: Opaque diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml index 22d4748885add3680094a37704356ec9d8d70909..c3cce22fd388f7a3907dd624243ea91146ae5ba0 100644 --- a/manifests/grafana-dashboardDatasources.yaml +++ b/manifests/grafana-dashboardDatasources.yaml @@ -1,8 +1,27 @@ apiVersion: v1 -data: - datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0= kind: Secret metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-datasources namespace: monitoring +stringData: + datasources.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access": "proxy", + "editable": false, + "name": "prometheus", + "orgId": 1, + "type": "prometheus", + "url": "http://prometheus-k8s.monitoring.svc:9090", + "version": 1 + } + ] + } type: Opaque diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e11e95affad8180096eb5cbe755ba81113ddd735..a80d3de2f4f1997ec93f32223dfd02078aac7c00 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -2,7 +2,7 @@ apiVersion: v1 items: - apiVersion: v1 data: - apiserver.json: |- + alertmanager-overview.json: |- { "__inputs": [ @@ -17,122 +17,18 @@ items: }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "panels": [ - { - "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", - "datasource": null, - "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", - "gridPos": { - "h": 2, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "mode": "markdown", - "span": 12, - "title": "Notice", - "type": "text" - } - ], - "refresh": "10s", + "refresh": "30s", "rows": [ { "collapse": false, "collapsed": false, "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 3, - "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", - "format": "percentunit", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Availability (30d) > 99.000%", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, { "aliasColors": { @@ -141,14 +37,12 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "decimals": 3, - "description": "How much error budget is left looking at our 0.990% availability gurantees?", - "fill": 10, + "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 4, + "id": 2, "legend": { "alignAsTable": false, "avg": false, @@ -156,7 +50,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -176,15 +70,15 @@ items: ], "spaceLength": 10, - "span": 8, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "errorbudget", + "legendFormat": "{{instance}}", "refId": "A" } ], @@ -193,9 +87,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "ErrorBudget (30d) > 99.000%", + "title": "Alerts", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -211,8 +105,7 @@ items: }, "yaxes": [ { - "decimals": 3, - "format": "percentunit", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -220,8 +113,7 @@ items: "show": true }, { - "decimals": 3, - "format": "percentunit", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -229,105 +121,6 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 3, - "description": "How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?", - "format": "percentunit", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Read Availability (30d)", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" }, { "aliasColors": { @@ -337,13 +130,12 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", - "fill": 10, + "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 6, + "id": 3, "legend": { "alignAsTable": false, "avg": false, @@ -351,7 +143,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -368,34 +160,26 @@ items: "renderer": "flot", "repeat": null, "seriesOverrides": [ - { - "alias": "/2../i", - "color": "#56A64B" - }, - { - "alias": "/3../i", - "color": "#F2CC0C" - }, - { - "alias": "/4../i", - "color": "#3274D9" - }, - { - "alias": "/5../i", - "color": "#E02F44" - } + ], "spaceLength": 10, - "span": 3, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ code }}", + "legendFormat": "{{instance}} Received", "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Invalid", + "refId": "B" } ], "thresholds": [ @@ -403,9 +187,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Requests", + "title": "Alerts receive rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -421,7 +205,7 @@ items: }, "yaxes": [ { - "format": "reqps", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -429,7 +213,7 @@ items: "show": true }, { - "format": "reqps", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -437,7 +221,20 @@ items: "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -446,13 +243,12 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 7, + "id": 4, "legend": { "alignAsTable": false, "avg": false, @@ -460,7 +256,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -475,21 +271,27 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 3, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "{{instance}} Total", "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Failed", + "refId": "B" } ], "thresholds": [ @@ -497,9 +299,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Errors", + "title": "$integration: Notifications Send Rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -515,19 +317,19 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "percentunit", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -540,13 +342,12 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 8, + "id": 5, "legend": { "alignAsTable": false, "avg": false, @@ -554,7 +355,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -569,21 +370,34 @@ items: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "{{instance}} 99th Percentile", "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Average", + "refId": "C" } ], "thresholds": [ @@ -591,9 +405,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Duration", + "title": "$integration: Notification Duration", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -630,209 +444,293 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Notifications", "titleSize": "h6", "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 3, - "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", - "format": "percentunit", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 9, - "interval": null, - "links": [ + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Write Availability (30d)", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" }, - { - "aliasColors": { + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", - "fill": 10, - "fillGradient": 0, - "gridPos": { + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ - }, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - { - "alias": "/2../i", - "color": "#56A64B" - }, + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-alertmanager-overview + namespace: monitoring +- apiVersion: v1 + data: + apiserver.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "panels": [ + { + "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "datasource": null, + "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "mode": "markdown", + "span": 12, + "title": "Notice", + "type": "text" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ { - "alias": "/3../i", - "color": "#F2CC0C" + "name": "value to text", + "value": 1 }, { - "alias": "/4../i", - "color": "#3274D9" - }, + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ { - "alias": "/5../i", - "color": "#E02F44" + "from": "null", + "text": "N/A", + "to": "null" } ], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ code }}", + "legendFormat": "", "refId": "A" } ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Write SLI - Requests", + "thresholds": "", + "title": "Availability (30d) > 99.000%", "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "shared": false }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] + ], + "valueName": "avg" }, { "aliasColors": { @@ -842,13 +740,14 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", - "fill": 1, + "decimals": 3, + "description": "How much error budget is left looking at our 0.990% availability guarantees?", + "fill": 10, "fillGradient": 0, "gridPos": { }, - "id": 11, + "id": 4, "legend": { "alignAsTable": false, "avg": false, @@ -876,15 +775,15 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 8, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "errorbudget", "refId": "A" } ], @@ -893,7 +792,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Write SLI - Errors", + "title": "ErrorBudget (30d) > 99.000%", "tooltip": { "shared": false, "sort": 0, @@ -911,130 +810,124 @@ items: }, "yaxes": [ { + "decimals": 3, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { + "decimals": 3, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], "datasource": "$datasource", - "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", - "fill": 1, - "fillGradient": 0, + "decimals": 3, + "description": "How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, "gridPos": { }, - "id": 12, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, + "id": 5, + "interval": null, "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } ], - "spaceLength": 10, "span": 3, - "stack": false, - "steppedLine": false, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", + "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "", "refId": "A" } ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Write SLI - Duration", + "thresholds": "", + "title": "Read Availability (30d)", "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "shared": false }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + ], + "valueName": "avg" + }, { "aliasColors": { @@ -1043,12 +936,13 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", + "fill": 10, "fillGradient": 0, "gridPos": { }, - "id": 13, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -1056,7 +950,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": false, + "show": true, "sideWidth": null, "total": false, "values": false @@ -1073,18 +967,33 @@ items: "renderer": "flot", "repeat": null, "seriesOverrides": [ - + { + "alias": "/2../i", + "color": "#56A64B" + }, + { + "alias": "/3../i", + "color": "#F2CC0C" + }, + { + "alias": "/4../i", + "color": "#3274D9" + }, + { + "alias": "/5../i", + "color": "#E02F44" + } ], "spaceLength": 10, - "span": 6, - "stack": false, + "span": 3, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ code }}", "refId": "A" } ], @@ -1093,7 +1002,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Add Rate", + "title": "Read SLI - Requests", "tooltip": { "shared": false, "sort": 0, @@ -1111,19 +1020,19 @@ items: }, "yaxes": [ { - "format": "ops", + "format": "reqps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "ops", + "format": "reqps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -1136,12 +1045,13 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 14, + "id": 7, "legend": { "alignAsTable": false, "avg": false, @@ -1149,7 +1059,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": false, + "show": true, "sideWidth": null, "total": false, "values": false @@ -1169,15 +1079,15 @@ items: ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -1186,7 +1096,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Depth", + "title": "Read SLI - Errors", "tooltip": { "shared": false, "sort": 0, @@ -1204,7 +1114,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -1212,7 +1122,7 @@ items: "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -1229,23 +1139,24 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 15, + "id": 8, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -1262,15 +1173,15 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))", + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -1279,7 +1190,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Latency", + "title": "Read SLI - Duration", "tooltip": { "shared": false, "sort": 0, @@ -1328,19 +1239,106 @@ items: "collapsed": false, "panels": [ { - "aliasColors": { - + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 9, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Write Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", + "fill": 10, "fillGradient": 0, "gridPos": { }, - "id": 16, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -1365,18 +1363,33 @@ items: "renderer": "flot", "repeat": null, "seriesOverrides": [ - + { + "alias": "/2../i", + "color": "#56A64B" + }, + { + "alias": "/3../i", + "color": "#F2CC0C" + }, + { + "alias": "/4../i", + "color": "#3274D9" + }, + { + "alias": "/5../i", + "color": "#E02F44" + } ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 3, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{ code }}", "refId": "A" } ], @@ -1385,7 +1398,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory", + "title": "Write SLI - Requests", "tooltip": { "shared": false, "sort": 0, @@ -1403,7 +1416,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "reqps", "label": null, "logBase": 1, "max": null, @@ -1411,7 +1424,7 @@ items: "show": true }, { - "format": "bytes", + "format": "reqps", "label": null, "logBase": 1, "max": null, @@ -1428,12 +1441,13 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 17, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -1461,15 +1475,15 @@ items: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])", + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -1478,7 +1492,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU usage", + "title": "Write SLI - Errors", "tooltip": { "shared": false, "sort": 0, @@ -1496,7 +1510,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -1504,7 +1518,7 @@ items: "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -1521,12 +1535,13 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 18, + "id": 12, "legend": { "alignAsTable": false, "avg": false, @@ -1554,15 +1569,15 @@ items: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -1571,7 +1586,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Goroutines", + "title": "Write SLI - Duration", "tooltip": { "shared": false, "sort": 0, @@ -1589,7 +1604,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -1597,7 +1612,7 @@ items: "show": true }, { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -1614,784 +1629,447 @@ items: "title": "Dashboard Row", "titleSize": "h6", "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "query": "label_values(apiserver_request_total, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Add Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ + { + "aliasColors": { - ], - "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / API server", - "uid": "09ec8aa1e996d6ffcd6817bbaff4db1b", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-apiserver - namespace: monitoring -- apiVersion: v1 - data: - cluster-total.json: |- - { - "__inputs": [ + }, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "__requires": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "panels": [ - { - "collapse": false, - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "panels": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Depth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Current Bandwidth", - "titleSize": "h6", - "type": "row" - }, - { - "aliasColors": { + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [ - - ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{namespace}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Current Rate of Bytes Received", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - }, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [ + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{namespace}}", - "refId": "A", - "step": 10 + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Current Rate of Bytes Transmitted", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" }, { - "columns": [ - { - "text": "Time", - "value": "Time" - }, - { - "text": "Value #A", - "value": "Value #A" - }, - { - "text": "Value #B", - "value": "Value #B" - }, - { - "text": "Value #C", - "value": "Value #C" - }, - { - "text": "Value #D", - "value": "Value #D" - }, - { - "text": "Value #E", - "value": "Value #E" - }, - { - "text": "Value #F", - "value": "Value #F" - }, - { - "text": "Value #G", - "value": "Value #G" - }, - { - "text": "Value #H", - "value": "Value #H" - }, + "collapse": false, + "collapsed": false, + "panels": [ { - "text": "namespace", - "value": "namespace" - } - ], - "datasource": "$datasource", - "fill": 1, - "fontSize": "90%", - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 5, - "lines": true, - "linewidth": 1, - "links": [ + "aliasColors": { - ], - "minSpan": 24, - "nullPointMode": "null as zero", - "renderer": "flot", - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": false - }, - "spaceLength": 10, - "span": 24, - "styles": [ - { - "alias": "Time", - "colorMode": null, - "colors": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Time", - "thresholds": [ + }, + "id": 16, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Current Bandwidth Received", - "colorMode": null, - "colors": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], "thresholds": [ ], - "type": "number", - "unit": "Bps" + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "alias": "Current Bandwidth Transmitted", - "colorMode": null, - "colors": [ + "aliasColors": { - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Average Bandwidth Received", - "colorMode": null, - "colors": [ + }, + "id": 17, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Average Bandwidth Transmitted", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Rate of Received Packets", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" - }, - { - "alias": "Rate of Transmitted Packets", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" - }, - { - "alias": "Rate of Received Packets Dropped", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" - }, - { - "alias": "Rate of Transmitted Packets Dropped", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #H", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "d/8b7a8b326d7a6f1f04244066368c67af/kubernetes-networking-namespace-pods?orgId=1&refresh=30s&var-namespace=$__cell", - "pattern": "namespace", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "H", - "step": 10 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Current Status", - "type": "table" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 6, - "panels": [ - { - "aliasColors": { - - }, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 11 - }, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [ - - ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], "spaceLength": 10, - "span": 24, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{namespace}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -2399,25 +2077,25 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Rate of Bytes Received", + "title": "CPU usage", "tooltip": { - "shared": true, - "sort": 2, + "shared": false, + "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, - "mode": "series", + "mode": "time", "name": null, - "show": false, + "show": true, "values": [ - "current" + ] }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -2425,7 +2103,7 @@ items: "show": true }, { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -2438,43 +2116,34 @@ items: "aliasColors": { }, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 11 + }, - "id": 8, + "id": 18, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, - "lines": false, + "lines": true, "linewidth": 1, "links": [ ], - "minSpan": 24, "nullPointMode": "null", - "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -2484,17 +2153,16 @@ items: ], "spaceLength": 10, - "span": 24, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{namespace}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -2502,37 +2170,37 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Rate of Bytes Transmitted", + "title": "Goroutines", "tooltip": { - "shared": true, - "sort": 2, + "shared": false, + "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, - "mode": "series", + "mode": "time", "name": null, - "show": false, + "show": true, "values": [ - "current" + ] }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -2541,11 +2209,164 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Average Bandwidth", + "showTitle": false, + "title": "Dashboard Row", "titleSize": "h6", "type": "row" - }, + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(apiserver_request_total, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / API server", + "uid": "09ec8aa1e996d6ffcd6817bbaff4db1b", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-apiserver + namespace: monitoring +- apiVersion: v1 + data: + cluster-total.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "panels": [ { "collapse": false, "collapsed": false, @@ -2553,9 +2374,9 @@ items: "h": 1, "w": 24, "x": 0, - "y": 11 + "y": 0 }, - "id": 9, + "id": 2, "panels": [ ], @@ -2563,7 +2384,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Bandwidth History", + "title": "Current Bandwidth", "titleSize": "h6", "type": "row" }, @@ -2571,7 +2392,7 @@ items: "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", @@ -2579,32 +2400,34 @@ items: "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 12 + "y": 1 }, - "id": 10, + "id": 3, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, "hideZero": true, - "max": true, - "min": true, + "max": false, + "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, - "linewidth": 2, + "lines": false, + "linewidth": 1, "links": [ ], "minSpan": 24, - "nullPointMode": "connected", + "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, @@ -2616,11 +2439,11 @@ items: ], "spaceLength": 10, "span": 24, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2633,7 +2456,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Receive Bandwidth", + "title": "Current Rate of Bytes Received", "tooltip": { "shared": true, "sort": 2, @@ -2642,11 +2465,11 @@ items: "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ @@ -2672,7 +2495,7 @@ items: "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", @@ -2680,32 +2503,34 @@ items: "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, - "x": 0, - "y": 21 + "w": 12, + "x": 12, + "y": 1 }, - "id": 11, + "id": 4, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, "hideZero": true, - "max": true, - "min": true, + "max": false, + "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, - "linewidth": 2, + "lines": false, + "linewidth": 1, "links": [ ], "minSpan": 24, - "nullPointMode": "connected", + "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, @@ -2717,11 +2542,11 @@ items: ], "spaceLength": 10, "span": 24, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2734,7 +2559,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Current Rate of Bytes Transmitted", "tooltip": { "shared": true, "sort": 2, @@ -2743,11 +2568,11 @@ items: "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ @@ -2769,6 +2594,336 @@ items: } ] }, + { + "columns": [ + { + "text": "Time", + "value": "Time" + }, + { + "text": "Value #A", + "value": "Value #A" + }, + { + "text": "Value #B", + "value": "Value #B" + }, + { + "text": "Value #C", + "value": "Value #C" + }, + { + "text": "Value #D", + "value": "Value #D" + }, + { + "text": "Value #E", + "value": "Value #E" + }, + { + "text": "Value #F", + "value": "Value #F" + }, + { + "text": "Value #G", + "value": "Value #G" + }, + { + "text": "Value #H", + "value": "Value #H" + }, + { + "text": "namespace", + "value": "namespace" + } + ], + "datasource": "$datasource", + "fill": 1, + "fontSize": "90%", + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 5, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "null as zero", + "renderer": "flot", + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": false + }, + "spaceLength": 10, + "span": 24, + "styles": [ + { + "alias": "Time", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Time", + "thresholds": [ + + ], + "type": "hidden", + "unit": "short" + }, + { + "alias": "Current Bandwidth Received", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Current Bandwidth Transmitted", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Average Bandwidth Received", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Average Bandwidth Transmitted", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Rate of Received Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Received Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "d/8b7a8b326d7a6f1f04244066368c67af/kubernetes-networking-namespace-pods?orgId=1&refresh=30s&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "H", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Status", + "type": "table" + }, { "collapse": true, "collapsed": true, @@ -2776,15 +2931,15 @@ items: "h": 1, "w": 24, "x": 0, - "y": 30 + "y": 10 }, - "id": 12, + "id": 6, "panels": [ { "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", @@ -2792,32 +2947,34 @@ items: "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 31 + "y": 11 }, - "id": 13, + "id": 7, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, "hideZero": true, - "max": true, - "min": true, + "max": false, + "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, - "linewidth": 2, + "lines": false, + "linewidth": 1, "links": [ ], "minSpan": 24, - "nullPointMode": "connected", + "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, @@ -2829,11 +2986,11 @@ items: ], "spaceLength": 10, "span": 24, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2846,7 +3003,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Average Rate of Bytes Received", "tooltip": { "shared": true, "sort": 2, @@ -2855,16 +3012,16 @@ items: "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ { - "format": "pps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -2872,7 +3029,7 @@ items: "show": true }, { - "format": "pps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -2885,7 +3042,7 @@ items: "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", @@ -2893,32 +3050,34 @@ items: "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, - "x": 0, - "y": 40 + "w": 12, + "x": 12, + "y": 11 }, - "id": 14, + "id": 8, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, "hideZero": true, - "max": true, - "min": true, + "max": false, + "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, - "linewidth": 2, + "lines": false, + "linewidth": 1, "links": [ ], "minSpan": 24, - "nullPointMode": "connected", + "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, @@ -2930,11 +3089,11 @@ items: ], "spaceLength": 10, "span": 24, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -2947,7 +3106,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "Average Rate of Bytes Transmitted", "tooltip": { "shared": true, "sort": 2, @@ -2956,16 +3115,16 @@ items: "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ { - "format": "pps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -2973,7 +3132,7 @@ items: "show": true }, { - "format": "pps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -2987,82 +3146,527 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Packets", + "title": "Average Bandwidth", "titleSize": "h6", "type": "row" }, { - "collapse": true, - "collapsed": true, + "collapse": false, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 31 + "y": 11 }, - "id": 15, + "id": 9, "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 50 - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [ - ], - "minSpan": 24, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bandwidth History", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { - ], - "spaceLength": 10, - "span": 24, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{namespace}}", - "refId": "A", - "step": 10 - } - ], + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Receive Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 12, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Received Packets", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Packets", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 15, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 10 + } + ], "thresholds": [ ], @@ -3156,7 +3760,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\".+\"}[$interval:$resolution])) by (namespace))", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\".+\"}[$interval:$resolution])) by (namespace))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{namespace}}", @@ -3261,7 +3865,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(rate(node_netstat_Tcp_RetransSegs[$interval:$resolution]) / rate(node_netstat_Tcp_OutSegs[$interval:$resolution])) by (instance))", + "expr": "sort_desc(sum(rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$interval:$resolution]) / rate(node_netstat_Tcp_OutSegs{cluster=\"$cluster\"}[$interval:$resolution])) by (instance))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -3366,7 +3970,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(rate(node_netstat_TcpExt_TCPSynRetrans[$interval:$resolution]) / rate(node_netstat_Tcp_RetransSegs[$interval:$resolution])) by (instance))", + "expr": "sort_desc(sum(rate(node_netstat_TcpExt_TCPSynRetrans{cluster=\"$cluster\"}[$interval:$resolution]) / rate(node_netstat_Tcp_RetransSegs{cluster=\"$cluster\"}[$interval:$resolution])) by (instance))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -3530,30 +4134,56 @@ items: "refresh": 1, "regex": "", "type": "datasource" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", "6h", "12h", "24h", @@ -3569,6 +4199,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-cluster-total namespace: monitoring - apiVersion: v1 @@ -3661,7 +4296,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-controller-manager\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-controller-manager\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -3730,10 +4365,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -3836,10 +4471,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -3942,10 +4577,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -4162,7 +4797,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -4268,7 +4903,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -4374,7 +5009,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4467,7 +5102,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4560,7 +5195,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -4644,6 +5279,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kube-controller-manager\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -4654,7 +5315,7 @@ items: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-controller-manager\"}, instance)", + "query": "label_values(up{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -4704,6 +5365,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-controller-manager namespace: monitoring - apiVersion: v1 @@ -4768,7 +5434,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__interval]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -4852,7 +5518,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -4936,7 +5602,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5020,7 +5686,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5104,7 +5770,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5188,7 +5854,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -5283,7 +5949,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -5574,7 +6240,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5583,7 +6249,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5592,7 +6258,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5601,7 +6267,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5610,7 +6276,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6010,7 +6676,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6019,7 +6685,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6028,7 +6694,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6037,7 +6703,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6294,7 +6960,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6303,7 +6969,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6312,7 +6978,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6321,7 +6987,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6330,7 +6996,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6339,7 +7005,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6394,7 +7060,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -6434,12 +7100,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6486,13 +7152,99 @@ items: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -6508,7 +7260,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 13, + "id": 14, "legend": { "avg": false, "current": false, @@ -6532,12 +7284,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6550,7 +7302,93 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Average Container Bandwidth by Namespace: Received", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Average Container Bandwidth by Namespace: Transmitted", "tooltip": { "shared": false, "sort": 0, @@ -6590,7 +7428,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Namespace", "titleSize": "h6" }, { @@ -6606,7 +7444,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 14, + "id": 16, "legend": { "avg": false, "current": false, @@ -6630,12 +7468,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6648,7 +7486,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Namespace: Received", + "title": "Rate of Received Packets", "tooltip": { "shared": false, "sort": 0, @@ -6666,7 +7504,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6682,19 +7520,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6704,7 +7530,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 15, + "id": 17, "legend": { "avg": false, "current": false, @@ -6728,12 +7554,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6746,7 +7572,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Namespace: Transmitted", + "title": "Rate of Transmitted Packets", "tooltip": { "shared": false, "sort": 0, @@ -6764,7 +7590,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6786,7 +7612,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -6802,7 +7628,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 16, + "id": 18, "legend": { "avg": false, "current": false, @@ -6826,12 +7652,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6844,7 +7670,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Rate of Received Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -6862,7 +7688,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6878,19 +7704,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -6900,7 +7714,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 17, + "id": 19, "legend": { "avg": false, "current": false, @@ -6924,12 +7738,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -6942,7 +7756,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "Rate of Transmitted Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -6960,7 +7774,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -6982,7 +7796,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -6997,8 +7811,9 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 18, + "id": 20, "legend": { "avg": false, "current": false, @@ -7022,12 +7837,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -7040,7 +7855,93 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -7080,7 +7981,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", "titleSize": "h6" }, { @@ -7095,8 +7996,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 19, + "fill": 1, + "id": 22, "legend": { "avg": false, "current": false, @@ -7107,7 +8008,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -7119,17 +8020,223 @@ items: "seriesOverrides": [ ], + "sort": { + "col": 4, + "desc": true + }, "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", - "format": "time_series", + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{namespace}}", - "legendLink": null, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", "step": 10 } ], @@ -7138,13 +8245,14 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Current Storage IO", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -7156,7 +8264,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7178,7 +8286,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -7220,7 +8328,7 @@ items: "options": [ ], - "query": "label_values(node_cpu_seconds_total, cluster)", + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", "refresh": 2, "regex": "", "sort": 1, @@ -7270,6 +8378,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-cluster namespace: monitoring - apiVersion: v1 @@ -7333,7 +8446,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7417,7 +8530,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7501,7 +8614,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7511,7 +8624,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -7585,7 +8698,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -7677,8 +8790,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -7687,8 +8801,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -7699,7 +8814,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -7950,7 +9065,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7959,7 +9074,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7968,7 +9083,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7977,7 +9092,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -7986,7 +9101,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8083,8 +9198,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -8093,8 +9209,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -8422,7 +9539,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8431,7 +9548,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8440,7 +9557,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8449,7 +9566,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8733,7 +9850,7 @@ items: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8742,7 +9859,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8751,7 +9868,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8760,7 +9877,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8769,7 +9886,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8778,7 +9895,7 @@ items: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8833,7 +9950,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -8873,12 +9990,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8925,13 +10042,99 @@ items: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -8947,7 +10150,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 11, + "id": 12, "legend": { "avg": false, "current": false, @@ -8971,12 +10174,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -8989,7 +10192,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Rate of Received Packets", "tooltip": { "shared": false, "sort": 0, @@ -9007,7 +10210,93 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -9029,7 +10318,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -9045,7 +10334,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 12, + "id": 14, "legend": { "avg": false, "current": false, @@ -9069,12 +10358,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9087,7 +10376,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Rate of Received Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -9105,7 +10394,93 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -9127,7 +10502,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -9142,8 +10517,9 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 13, + "id": 16, "legend": { "avg": false, "current": false, @@ -9167,12 +10543,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9185,7 +10561,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "IOPS(Reads+Writes)", "tooltip": { "shared": false, "sort": 0, @@ -9203,7 +10579,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9219,19 +10595,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -9241,7 +10605,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 14, + "id": 17, "legend": { "avg": false, "current": false, @@ -9265,12 +10629,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9283,7 +10647,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -9323,7 +10687,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", "titleSize": "h6" }, { @@ -9338,8 +10702,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 15, + "fill": 1, + "id": 18, "legend": { "avg": false, "current": false, @@ -9350,7 +10714,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -9362,17 +10726,223 @@ items: "seriesOverrides": [ ], + "sort": { + "col": 4, + "desc": true + }, "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", - "format": "time_series", + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", "step": 10 } ], @@ -9381,13 +10951,14 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Current Storage IO", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -9399,7 +10970,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9421,7 +10992,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -9464,7 +11035,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -9491,7 +11062,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -9540,6 +11111,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-namespace namespace: monitoring - apiVersion: v1 @@ -9602,7 +11178,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9837,7 +11413,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9846,7 +11422,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9855,7 +11431,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9864,7 +11440,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9873,7 +11449,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10274,7 +11850,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10283,7 +11859,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10292,7 +11868,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10301,7 +11877,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10426,7 +12002,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -10453,7 +12029,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -10502,6 +12078,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-node namespace: monitoring - apiVersion: v1 @@ -10581,7 +12162,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10589,7 +12170,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -10597,7 +12178,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -10695,7 +12276,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10937,7 +12518,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10946,7 +12527,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10955,7 +12536,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10964,7 +12545,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10973,7 +12554,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11071,7 +12652,7 @@ items: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -11081,7 +12662,7 @@ items: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -11092,7 +12673,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -11100,7 +12681,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -11108,7 +12689,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -11121,7 +12702,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -11212,7 +12793,7 @@ items: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -11400,7 +12981,7 @@ items: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11409,7 +12990,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11418,7 +12999,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11427,7 +13008,7 @@ items: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11436,7 +13017,7 @@ items: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -11559,12 +13140,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11611,19 +13192,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -11658,12 +13227,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11716,7 +13285,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -11757,12 +13326,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11793,7 +13362,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -11809,19 +13378,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -11856,12 +13413,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11892,7 +13449,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -11914,7 +13471,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -11955,12 +13512,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -11991,7 +13548,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -12007,19 +13564,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -12054,12 +13599,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12090,7 +13635,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -12112,172 +13657,9 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / Compute Resources / Pod", - "uid": "6581e46e4e5c7ba40a07646395ef7b23", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-k8s-resources-pod - namespace: monitoring -- apiVersion: v1 - data: - k8s-resources-workload.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ + }, { "collapse": false, "height": "250px", @@ -12290,8 +13672,9 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 1, + "id": 12, "legend": { "avg": false, "current": false, @@ -12315,15 +13698,23 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", "legendLink": null, "step": 10 } @@ -12333,7 +13724,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "IOPS", "tooltip": { "shared": false, "sort": 0, @@ -12367,19 +13758,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -12388,8 +13767,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 2, + "fill": 10, + "id": 13, "legend": { "avg": false, "current": false, @@ -12400,7 +13779,7 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], @@ -12413,190 +13792,24 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "D", + "legendFormat": "Reads", + "legendLink": null, "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "E", + "legendFormat": "Writes", + "legendLink": null, "step": 10 } ], @@ -12605,14 +13818,13 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Quota", + "title": "ThroughPut", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "transform": "table", - "type": "table", + "type": "graph", "xaxis": { "buckets": null, "mode": "time", @@ -12624,7 +13836,7 @@ items: }, "yaxes": [ { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -12646,7 +13858,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "CPU Quota", + "title": "Storage IO - Distribution(Pod - Read & Writes)", "titleSize": "h6" }, { @@ -12661,8 +13873,9 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 3, + "id": 14, "legend": { "avg": false, "current": false, @@ -12686,15 +13899,15 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -12704,7 +13917,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "IOPS(Reads+Writes)", "tooltip": { "shared": false, "sort": 0, @@ -12722,7 +13935,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -12738,19 +13951,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -12759,8 +13960,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 4, + "fill": 10, + "id": 15, "legend": { "avg": false, "current": false, @@ -12771,7 +13972,7 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], @@ -12784,43 +13985,145 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, + "targets": [ { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "type": "number", - "unit": "bytes" - }, + ] + }, + "yaxes": [ { - "alias": "Memory Requests", + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution(Containers)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", @@ -12830,16 +14133,16 @@ items: ], "type": "number", - "unit": "bytes" + "unit": "short" }, { - "alias": "Memory Requests %", + "alias": "IOPS(Reads + Writes)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, + "decimals": -1, "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", @@ -12849,10 +14152,10 @@ items: ], "type": "number", - "unit": "percentunit" + "unit": "short" }, { - "alias": "Memory Limits", + "alias": "Throughput(Read)", "colorMode": null, "colors": [ @@ -12868,10 +14171,10 @@ items: ], "type": "number", - "unit": "bytes" + "unit": "Bps" }, { - "alias": "Memory Limits %", + "alias": "Throughput(Write)", "colorMode": null, "colors": [ @@ -12887,21 +14190,40 @@ items: ], "type": "number", - "unit": "percentunit" + "unit": "Bps" }, { - "alias": "Pod", + "alias": "Throughput(Read + Write)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, - "link": true, + "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", "thresholds": [ ], @@ -12926,7 +14248,7 @@ items: ], "targets": [ { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -12935,7 +14257,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -12944,7 +14266,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -12953,7 +14275,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -12962,13 +14284,22 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "E", "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 } ], "thresholds": [ @@ -12976,7 +14307,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Quota", + "title": "Current Storage IO", "tooltip": { "shared": false, "sort": 0, @@ -13017,7 +14348,273 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Memory Quota", + "title": "Storage IO - Distribution", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / Compute Resources / Pod", + "uid": "6581e46e4e5c7ba40a07646395ef7b23", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-k8s-resources-pod + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-workload.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", "titleSize": "h6" }, { @@ -13033,8 +14630,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, - "interval": "1m", + "id": 2, "legend": { "avg": false, "current": false, @@ -13069,7 +14665,7 @@ items: "type": "hidden" }, { - "alias": "Current Receive Bandwidth", + "alias": "CPU Usage", "colorMode": null, "colors": [ @@ -13085,10 +14681,10 @@ items: ], "type": "number", - "unit": "Bps" + "unit": "short" }, { - "alias": "Current Transmit Bandwidth", + "alias": "CPU Requests", "colorMode": null, "colors": [ @@ -13104,10 +14700,10 @@ items: ], "type": "number", - "unit": "Bps" + "unit": "short" }, { - "alias": "Rate of Received Packets", + "alias": "CPU Requests %", "colorMode": null, "colors": [ @@ -13123,10 +14719,10 @@ items: ], "type": "number", - "unit": "pps" + "unit": "percentunit" }, { - "alias": "Rate of Transmitted Packets", + "alias": "CPU Limits", "colorMode": null, "colors": [ @@ -13142,10 +14738,10 @@ items: ], "type": "number", - "unit": "pps" + "unit": "short" }, { - "alias": "Rate of Received Packets Dropped", + "alias": "CPU Limits %", "colorMode": null, "colors": [ @@ -13161,26 +14757,7 @@ items: ], "type": "number", - "unit": "pps" - }, - { - "alias": "Rate of Transmitted Packets Dropped", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" + "unit": "percentunit" }, { "alias": "Pod", @@ -13219,7 +14796,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13228,7 +14805,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13237,7 +14814,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13246,7 +14823,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -13255,22 +14832,13 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "E", "step": 10 - }, - { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 } ], "thresholds": [ @@ -13278,7 +14846,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Current Network Usage", + "title": "CPU Quota", "tooltip": { "shared": false, "sort": 0, @@ -13319,7 +14887,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "CPU Quota", "titleSize": "h6" }, { @@ -13335,7 +14903,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 3, "legend": { "avg": false, "current": false, @@ -13364,7 +14932,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13377,7 +14945,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Receive Bandwidth", + "title": "Memory Usage", "tooltip": { "shared": false, "sort": 0, @@ -13395,7 +14963,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -13417,7 +14985,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Memory Usage", "titleSize": "h6" }, { @@ -13432,8 +15000,8 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 7, + "fill": 1, + "id": 4, "legend": { "avg": false, "current": false, @@ -13444,7 +15012,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -13458,42 +15026,217 @@ items: ], "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, - "targets": [ + "styles": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Transmit Bandwidth", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, { - "format": "Bps", + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", "label": null, "logBase": 1, "max": null, @@ -13515,7 +15258,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Memory Quota", "titleSize": "h6" }, { @@ -13530,8 +15273,9 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 8, + "fill": 1, + "id": 5, + "interval": "1m", "legend": { "avg": false, "current": false, @@ -13542,7 +15286,7 @@ items: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], @@ -13556,15 +15300,217 @@ items: ], "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Current Receive Bandwidth", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Current Transmit Bandwidth", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Rate of Received Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Received Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets Dropped", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", - "format": "time_series", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", "step": 10 } ], @@ -13573,13 +15519,14 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Pod: Received", + "title": "Current Network Usage", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -13591,7 +15538,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -13613,7 +15560,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -13629,7 +15576,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 6, "legend": { "avg": false, "current": false, @@ -13653,12 +15600,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13671,7 +15618,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Pod: Transmitted", + "title": "Receive Bandwidth", "tooltip": { "shared": false, "sort": 0, @@ -13705,19 +15652,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -13727,7 +15662,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 7, "legend": { "avg": false, "current": false, @@ -13751,12 +15686,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13769,7 +15704,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Transmit Bandwidth", "tooltip": { "shared": false, "sort": 0, @@ -13809,7 +15744,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -13825,7 +15760,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 11, + "id": 8, "legend": { "avg": false, "current": false, @@ -13849,12 +15784,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13867,7 +15802,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "Average Container Bandwidth by Pod: Received", "tooltip": { "shared": false, "sort": 0, @@ -13901,19 +15836,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -13923,7 +15846,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 12, + "id": 9, "legend": { "avg": false, "current": false, @@ -13947,12 +15870,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -13965,7 +15888,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "Average Container Bandwidth by Pod: Transmitted", "tooltip": { "shared": false, "sort": 0, @@ -14005,7 +15928,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Pod", "titleSize": "h6" }, { @@ -14021,7 +15944,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 13, + "id": 10, "legend": { "avg": false, "current": false, @@ -14045,12 +15968,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -14063,7 +15986,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Rate of Received Packets", "tooltip": { "shared": false, "sort": 0, @@ -14081,7 +16004,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -14097,36 +16020,306 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ + { + "aliasColors": { - ], - "query": "prometheus", - "refresh": 1, + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rate of Packets", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Received Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rate of Packets Dropped", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, "regex": "", "type": "datasource" }, @@ -14146,7 +16339,7 @@ items: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14173,7 +16366,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14200,7 +16393,7 @@ items: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14227,7 +16420,7 @@ items: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -14276,6 +16469,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-workload namespace: monitoring - apiVersion: v1 @@ -14335,8 +16533,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -14345,8 +16544,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -14357,7 +16557,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -14655,7 +16855,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14664,7 +16864,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14673,7 +16873,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14682,7 +16882,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14691,7 +16891,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14788,8 +16988,9 @@ items: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -14798,8 +16999,9 @@ items: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -15117,7 +17319,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15126,7 +17328,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15135,7 +17337,7 @@ items: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15144,7 +17346,7 @@ items: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15420,7 +17622,7 @@ items: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15429,7 +17631,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15438,7 +17640,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15447,7 +17649,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15456,7 +17658,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15465,7 +17667,7 @@ items: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -15520,7 +17722,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -15560,12 +17762,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15612,13 +17814,99 @@ items: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -15634,7 +17922,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -15658,12 +17946,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15676,7 +17964,93 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Average Container Bandwidth by Workload: Received", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Average Container Bandwidth by Workload: Transmitted", "tooltip": { "shared": false, "sort": 0, @@ -15716,7 +18090,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Workload", "titleSize": "h6" }, { @@ -15732,7 +18106,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 10, "legend": { "avg": false, "current": false, @@ -15756,12 +18130,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -15774,7 +18148,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Workload: Received", + "title": "Rate of Received Packets", "tooltip": { "shared": false, "sort": 0, @@ -15792,7 +18166,93 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -15814,7 +18274,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -15830,7 +18290,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 12, "legend": { "avg": false, "current": false, @@ -15854,306 +18314,12 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{workload}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Average Container Bandwidth by Workload: Transmitted", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{workload}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Received Packets", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{workload}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Transmitted Packets", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -16184,7 +18350,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16200,19 +18366,7 @@ items: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -16246,12 +18400,12 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -16282,7 +18436,7 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -16304,7 +18458,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -16333,28 +18487,23 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "deployment", - "value": "deployment" + "text": "", + "value": "" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "hide": 0, + "hide": 2, "includeAll": false, "label": null, "multi": false, - "name": "type", + "name": "cluster", "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -16365,23 +18514,28 @@ items: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "", - "value": "" + "text": "deployment", + "value": "deployment" }, "datasource": "$datasource", - "hide": 2, + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "cluster", + "name": "type", "options": [ ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, "regex": "", - "sort": 1, + "skipUrlSync": false, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -16406,7 +18560,7 @@ items: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -16455,6 +18609,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-k8s-resources-workloads-namespace namespace: monitoring - apiVersion: v1 @@ -16480,3095 +18639,1714 @@ items: "links": [ ], - "refresh": "10s", - "rows": [ + "panels": [ { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } + "mappings": [ + ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Kubelets", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "id": 3, - "interval": null, - "links": [ + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" + "mappings": [ + ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "id": 5, - "interval": null, - "links": [ + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } + "mappings": [ + ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } + "mappings": [ + ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "title": "Config Error Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": { - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": { - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": { - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": { - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{verb}} {{url}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, + ] + }, + "yaxes": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 25, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ + ] + }, + { + "aliasColors": { - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" }, - { - "allValue": null, - "current": { + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "allValue": null, - "current": { + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "query": "label_values(kubelet_runtime_operations_total{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, instance)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / Kubelet", - "uid": "3138fa155d5915769fbded898ac09fd9", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-kubelet - namespace: monitoring -- apiVersion: v1 - data: - namespace-by-pod.json: |- - { - "__inputs": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "__requires": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "panels": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, { - "collapse": false, - "collapsed": false, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 1, - "w": 24, + "h": 7, + "w": 12, "x": 0, - "y": 0 + "y": 42 }, - "id": 2, - "panels": [ + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Current Bandwidth", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 0, - "format": "time_series", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 9, + "h": 7, "w": 12, - "x": 0, - "y": 1 + "x": 12, + "y": 42 }, - "height": 9, - "id": 3, - "interval": null, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "range to text", - "value": 2 + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" } ], - "maxDataPoints": 100, - "minSpan": 12, - "nullPointMode": "connected", - "nullText": null, - "options": { - "fieldOptions": { - "calcs": [ - "last" - ], - "defaults": { - "max": 10000000000, - "min": 0, - "title": "$namespace", - "unit": "Bps" - }, - "mappings": [ + "thresholds": [ - ], - "override": { + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - }, - "thresholds": [ - { - "color": "dark-green", - "index": 0, - "value": null - }, - { - "color": "dark-yellow", - "index": 1, - "value": 5000000000 - }, - { - "color": "dark-red", - "index": 2, - "value": 7000000000 - } - ], - "values": false + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true }, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } + "lines": true, + "linewidth": 1, + "links": [ + ], - "span": 12, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution]))", + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", "format": "time_series", - "instant": null, - "intervalFactor": 1, - "legendFormat": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", "refId": "A" } ], - "thresholds": "", + "thresholds": [ + + ], "timeFrom": null, "timeShift": null, - "title": "Current Rate of Bytes Received", - "type": "gauge", - "valueFontSize": "80%", - "valueMaps": [ + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "N/A", - "value": "null" + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "valueName": "current" + ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 0, - "format": "time_series", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 9, + "h": 7, "w": 12, "x": 12, - "y": 1 + "y": 49 }, - "height": 9, - "id": 4, - "interval": null, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 12, - "nullPointMode": "connected", - "nullText": null, - "options": { - "fieldOptions": { - "calcs": [ - "last" - ], - "defaults": { - "max": 10000000000, - "min": 0, - "title": "$namespace", - "unit": "Bps" - }, - "mappings": [ - - ], - "override": { + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - }, - "thresholds": [ - { - "color": "dark-green", - "index": 0, - "value": null - }, - { - "color": "dark-yellow", - "index": 1, - "value": 5000000000 - }, - { - "color": "dark-red", - "index": 2, - "value": 7000000000 - } - ], - "values": false - } - }, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } ], - "span": 12, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution]))", + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", - "instant": null, - "intervalFactor": 1, - "legendFormat": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", "refId": "A" } ], - "thresholds": "", + "thresholds": [ + + ], "timeFrom": null, "timeShift": null, - "title": "Current Rate of Bytes Transmitted", - "type": "gauge", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "columns": [ - { - "text": "Time", - "value": "Time" - }, - { - "text": "Value #A", - "value": "Value #A" - }, - { - "text": "Value #B", - "value": "Value #B" - }, - { - "text": "Value #C", - "value": "Value #C" - }, - { - "text": "Value #D", - "value": "Value #D" - }, - { - "text": "Value #E", - "value": "Value #E" - }, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "text": "Value #F", - "value": "Value #F" + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "text": "pod", - "value": "pod" + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$datasource", "fill": 1, - "fontSize": "100%", + "fillGradient": 0, "gridPos": { - "h": 9, + "h": 7, "w": 24, "x": 0, - "y": 10 + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true }, - "id": 5, "lines": true, "linewidth": 1, "links": [ ], - "minSpan": 24, - "nullPointMode": "null as zero", + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, "renderer": "flot", - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": false - }, - "spaceLength": 10, - "span": 24, - "styles": [ - { - "alias": "Time", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Time", - "thresholds": [ - - ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Bandwidth Received", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Bandwidth Transmitted", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Rate of Received Packets", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ + "repeat": null, + "seriesOverrides": [ - ], - "type": "number", - "unit": "pps" - }, + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "alias": "Rate of Transmitted Packets", - "colorMode": null, - "colors": [ + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "type": "number", - "unit": "pps" - }, + ] + }, + "yaxes": [ { - "alias": "Rate of Received Packets Dropped", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "pps" + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "alias": "Rate of Transmitted Packets Dropped", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "type": "number", - "unit": "pps" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "d/7a18067ce943a40ae25454675c19ff5c/kubernetes-networking-pod?orgId=1&refresh=30s&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "type": "number", - "unit": "short" - } ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 + "legendFormat": "2xx", + "refId": "A" }, { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 + "legendFormat": "3xx", + "refId": "B" }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 + "legendFormat": "4xx", + "refId": "C" }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "table", - "instant": true, + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 + "legendFormat": "5xx", + "refId": "D" } + ], + "thresholds": [ + ], "timeFrom": null, "timeShift": null, - "title": "Current Status", - "type": "table" - }, - { - "collapse": false, - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 19 + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - "id": 6, - "panels": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Bandwidth", - "titleSize": "h6", - "type": "row" + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { "aliasColors": { @@ -19578,37 +20356,33 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 9, - "w": 12, + "h": 7, + "w": 24, "x": 0, - "y": 20 + "y": 70 }, - "id": 7, + "id": 22, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, + "current": true, "max": false, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, "total": false, - "values": false + "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -19618,17 +20392,15 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{instance}} {{verb}} {{url}}", + "refId": "A" } ], "thresholds": [ @@ -19636,10 +20408,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Receive Bandwidth", + "title": "Request duration 99th quantile", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -19654,19 +20426,19 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "Bps", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -19679,21 +20451,19 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 20 + "h": 7, + "w": 8, + "x": 0, + "y": 77 }, - "id": 8, + "id": 23, "legend": { "alignAsTable": false, "avg": false, "current": false, - "hideEmpty": true, - "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -19703,13 +20473,11 @@ items: "values": false }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -19719,17 +20487,15 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -19737,10 +20503,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Memory", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -19755,473 +20521,219 @@ items: }, "yaxes": [ { - "format": "Bps", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "Bps", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] }, { - "collapse": true, - "collapsed": true, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 + "h": 7, + "w": 8, + "x": 8, + "y": 77 }, - "id": 9, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 30 - }, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Received Packets", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 30 - }, - "id": 11, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Transmitted Packets", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Packets", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 30 - }, - "id": 12, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 40 - }, - "id": 13, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ - - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Received Packets Dropped", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 40 - }, - "id": 14, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "pps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Errors", - "titleSize": "h6", - "type": "row" + ] } ], "refresh": "10s", "rows": [ ], - "schemaVersion": 18, + "schemaVersion": 14, "style": "dark", "tags": [ "kubernetes-mixin" @@ -20245,28 +20757,22 @@ items: "type": "datasource" }, { - "allValue": ".+", - "auto": false, - "auto_count": 30, - "auto_min": "10s", + "allValue": null, "current": { - "text": "kube-system", - "value": "kube-system" + }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", - "hide": 0, - "includeAll": true, - "label": null, + "hide": 2, + "includeAll": false, + "label": "cluster", "multi": false, - "name": "namespace", + "name": "cluster", "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ @@ -20278,82 +20784,28 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "5m", - "value": "5m" + }, "datasource": "$datasource", "hide": 0, - "includeAll": false, + "includeAll": true, "label": null, "multi": false, - "name": "resolution", + "name": "instance", "options": [ - { - "selected": false, - "text": "30s", - "value": "30s" - }, - { - "selected": true, - "text": "5m", - "value": "5m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - } - ], - "query": "30s,5m,1h", - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "interval", - "useTags": false - }, - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "5m", - "value": "5m" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "interval", - "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } - ], - "query": "4h", + "query": "label_values(kubelet_runtime_operations_total{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, instance)", "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] @@ -20388,17 +20840,22 @@ items: ] }, "timezone": "UTC", - "title": "Kubernetes / Networking / Namespace (Pods)", - "uid": "8b7a8b326d7a6f1f04244066368c67af", + "title": "Kubernetes / Kubelet", + "uid": "3138fa155d5915769fbded898ac09fd9", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-namespace-by-pod + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-kubelet namespace: monitoring - apiVersion: v1 data: - namespace-by-workload.json: |- + namespace-by-pod.json: |- { "__inputs": [ @@ -20450,210 +20907,258 @@ items: "type": "row" }, { - "aliasColors": { - - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, + "decimals": 0, + "format": "time_series", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 1 }, + "height": 9, "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, + "interval": null, "links": [ ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "minSpan": 12, + "nullPointMode": "connected", + "nullText": null, + "options": { + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "max": 10000000000, + "min": 0, + "title": "$namespace", + "unit": "Bps" + }, + "mappings": [ + + ], + "override": { + }, + "thresholds": [ + { + "color": "dark-green", + "index": 0, + "value": null + }, + { + "color": "dark-yellow", + "index": 1, + "value": 5000000000 + }, + { + "color": "dark-red", + "index": 2, + "value": 7000000000 + } + ], + "values": false + } + }, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, + "span": 12, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution]))", "format": "time_series", + "instant": null, "intervalFactor": 1, - "legendFormat": "{{ workload }}", - "refId": "A", - "step": 10 + "legendFormat": "", + "refId": "A" } ], - "thresholds": [ - - ], + "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Current Rate of Bytes Received", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, + "type": "gauge", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] + ], + "valueName": "current" }, { - "aliasColors": { - - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, + "decimals": 0, + "format": "time_series", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 1 }, + "height": 9, "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, + "interval": null, "links": [ ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "minSpan": 12, + "nullPointMode": "connected", + "nullText": null, + "options": { + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "max": 10000000000, + "min": 0, + "title": "$namespace", + "unit": "Bps" + }, + "mappings": [ + + ], + "override": { + }, + "thresholds": [ + { + "color": "dark-green", + "index": 0, + "value": null + }, + { + "color": "dark-yellow", + "index": 1, + "value": 5000000000 + }, + { + "color": "dark-red", + "index": 2, + "value": 7000000000 + } + ], + "values": false + } + }, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, + "span": 12, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution]))", "format": "time_series", + "instant": null, "intervalFactor": 1, - "legendFormat": "{{ workload }}", - "refId": "A", - "step": 10 + "legendFormat": "", + "refId": "A" } ], - "thresholds": [ - - ], + "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Current Rate of Bytes Transmitted", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, + "type": "gauge", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } - ] + ], + "valueName": "current" }, { "columns": [ @@ -20686,21 +21191,13 @@ items: "value": "Value #F" }, { - "text": "Value #G", - "value": "Value #G" - }, - { - "text": "Value #H", - "value": "Value #H" - }, - { - "text": "workload", - "value": "workload" + "text": "pod", + "value": "pod" } ], "datasource": "$datasource", "fill": 1, - "fontSize": "90%", + "fontSize": "100%", "gridPos": { "h": 9, "w": 24, @@ -20744,7 +21241,7 @@ items: "unit": "short" }, { - "alias": "Current Bandwidth Received", + "alias": "Bandwidth Received", "colorMode": null, "colors": [ @@ -20762,7 +21259,7 @@ items: "unit": "Bps" }, { - "alias": "Current Bandwidth Transmitted", + "alias": "Bandwidth Transmitted", "colorMode": null, "colors": [ @@ -20779,42 +21276,6 @@ items: "type": "number", "unit": "Bps" }, - { - "alias": "Average Bandwidth Received", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, - { - "alias": "Average Bandwidth Transmitted", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "Bps" - }, { "alias": "Rate of Received Packets", "colorMode": null, @@ -20826,7 +21287,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #E", + "pattern": "Value #C", "thresholds": [ ], @@ -20844,7 +21305,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #F", + "pattern": "Value #D", "thresholds": [ ], @@ -20862,7 +21323,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #G", + "pattern": "Value #E", "thresholds": [ ], @@ -20880,7 +21341,7 @@ items: "link": false, "linkTooltip": "Drill down", "linkUrl": "", - "pattern": "Value #H", + "pattern": "Value #F", "thresholds": [ ], @@ -20888,7 +21349,7 @@ items: "unit": "pps" }, { - "alias": "Workload", + "alias": "Pod", "colorMode": null, "colors": [ @@ -20897,8 +21358,8 @@ items: "decimals": 2, "link": true, "linkTooltip": "Drill down", - "linkUrl": "d/728bf77cc1166d2f3133bf25846876cc/kubernetes-networking-workload?orgId=1&refresh=30s&var-namespace=$namespace&var-type=$type&var-workload=$__cell", - "pattern": "workload", + "linkUrl": "d/7a18067ce943a40ae25454675c19ff5c/kubernetes-networking-pod?orgId=1&refresh=30s&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", "thresholds": [ ], @@ -20908,7 +21369,7 @@ items: ], "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -20917,7 +21378,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -20926,7 +21387,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -20935,7 +21396,7 @@ items: "step": 10 }, { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -20944,7 +21405,7 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -20953,31 +21414,13 @@ items: "step": 10 }, { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "", "refId": "F", "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - }, - { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "H", - "step": 10 } ], "timeFrom": null, @@ -20985,232 +21428,6 @@ items: "title": "Current Status", "type": "table" }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 19 - }, - "id": 6, - "panels": [ - { - "aliasColors": { - - }, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 20 - }, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [ - - ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ workload }}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Average Rate of Bytes Received", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 20 - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [ - - ], - "minSpan": 24, - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 24, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ workload }}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Average Rate of Bytes Transmitted", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "series", - "name": null, - "show": false, - "values": [ - "current" - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Average Bandwidth", - "titleSize": "h6", - "type": "row" - }, { "collapse": false, "collapsed": false, @@ -21218,9 +21435,9 @@ items: "h": 1, "w": 24, "x": 0, - "y": 29 + "y": 19 }, - "id": 9, + "id": 6, "panels": [ ], @@ -21228,7 +21445,7 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Bandwidth HIstory", + "title": "Bandwidth", "titleSize": "h6", "type": "row" }, @@ -21246,9 +21463,9 @@ items: "h": 9, "w": 12, "x": 0, - "y": 38 + "y": 20 }, - "id": 10, + "id": 7, "legend": { "alignAsTable": false, "avg": false, @@ -21285,10 +21502,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21347,9 +21564,9 @@ items: "h": 9, "w": 12, "x": 12, - "y": 38 + "y": 20 }, - "id": 11, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -21386,10 +21603,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21441,9 +21658,9 @@ items: "h": 1, "w": 24, "x": 0, - "y": 39 + "y": 29 }, - "id": 12, + "id": 9, "panels": [ { "aliasColors": { @@ -21456,12 +21673,12 @@ items: "fill": 2, "fillGradient": 0, "gridPos": { - "h": 9, + "h": 10, "w": 12, "x": 0, - "y": 40 + "y": 30 }, - "id": 13, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -21498,10 +21715,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21557,12 +21774,12 @@ items: "fill": 2, "fillGradient": 0, "gridPos": { - "h": 9, + "h": 10, "w": 12, "x": 12, - "y": 40 + "y": 30 }, - "id": 14, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -21599,10 +21816,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21663,9 +21880,9 @@ items: "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 30 }, - "id": 15, + "id": 12, "panels": [ { "aliasColors": { @@ -21678,12 +21895,12 @@ items: "fill": 2, "fillGradient": 0, "gridPos": { - "h": 9, + "h": 10, "w": 12, "x": 0, - "y": 41 + "y": 40 }, - "id": 16, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -21720,10 +21937,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21779,12 +21996,12 @@ items: "fill": 2, "fillGradient": 0, "gridPos": { - "h": 9, + "h": 10, "w": 12, "x": 12, - "y": 41 + "y": 40 }, - "id": 17, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -21821,10 +22038,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])) by (pod)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{workload}}", + "legendFormat": "{{pod}}", "refId": "A", "step": 10 } @@ -21908,28 +22125,22 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "kube-system", - "value": "kube-system" + }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", - "hide": 0, + "hide": 2, "includeAll": false, "label": null, "multi": false, - "name": "namespace", + "name": "cluster", "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, - "sort": 1, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -21939,29 +22150,29 @@ items: "useTags": false }, { - "allValue": null, + "allValue": ".+", "auto": false, "auto_count": 30, "auto_min": "10s", "current": { - "text": "deployment", - "value": "deployment" + "text": "kube-system", + "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, - "includeAll": false, + "includeAll": true, "label": null, "multi": false, - "name": "type", + "name": "namespace", "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -22082,349 +22293,679 @@ items: ] }, "timezone": "UTC", - "title": "Kubernetes / Networking / Namespace (Workload)", - "uid": "bbb2a765a623ae38130206c7d94a160f", + "title": "Kubernetes / Networking / Namespace (Pods)", + "uid": "8b7a8b326d7a6f1f04244066368c67af", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-namespace-by-workload + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-namespace-by-pod namespace: monitoring - apiVersion: v1 data: - node-cluster-rsrc-use.json: |- + namespace-by-workload.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ - + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, + "id": null, "links": [ ], - "refresh": "10s", - "rows": [ + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ + + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Current Bandwidth", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ workload }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Rate of Bytes Received", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": false, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "minSpan": 24, + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 24, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ workload }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Rate of Bytes Transmitted", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": false, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, { - "collapse": false, - "height": "250px", - "panels": [ + "columns": [ { - "aliasColors": { + "text": "Time", + "value": "Time" + }, + { + "text": "Value #A", + "value": "Value #A" + }, + { + "text": "Value #B", + "value": "Value #B" + }, + { + "text": "Value #C", + "value": "Value #C" + }, + { + "text": "Value #D", + "value": "Value #D" + }, + { + "text": "Value #E", + "value": "Value #E" + }, + { + "text": "Value #F", + "value": "Value #F" + }, + { + "text": "Value #G", + "value": "Value #G" + }, + { + "text": "Value #H", + "value": "Value #H" + }, + { + "text": "workload", + "value": "workload" + } + ], + "datasource": "$datasource", + "fill": 1, + "fontSize": "90%", + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 5, + "lines": true, + "linewidth": 1, + "links": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ + ], + "minSpan": 24, + "nullPointMode": "null as zero", + "renderer": "flot", + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": false + }, + "spaceLength": 10, + "span": 24, + "styles": [ + { + "alias": "Time", + "colorMode": null, + "colors": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Time", + "thresholds": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } + "type": "hidden", + "unit": "short" + }, + { + "alias": "Current Bandwidth Received", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + "type": "number", + "unit": "Bps" }, { - "aliasColors": { + "alias": "Current Bandwidth Transmitted", + "colorMode": null, + "colors": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "type": "number", + "unit": "Bps" + }, + { + "alias": "Average Bandwidth Received", + "colorMode": null, + "colors": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Average Bandwidth Transmitted", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (load1 per CPU)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "type": "number", + "unit": "Bps" + }, + { + "alias": "Rate of Received Packets", + "colorMode": null, + "colors": [ - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, { - "aliasColors": { + "alias": "Rate of Transmitted Packets", + "colorMode": null, + "colors": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Received Packets Dropped", + "colorMode": null, + "colors": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - } + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #G", + "thresholds": [ + + ], + "type": "number", + "unit": "pps" + }, + { + "alias": "Rate of Transmitted Packets Dropped", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #H", "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "type": "number", + "unit": "pps" + }, + { + "alias": "Workload", + "colorMode": null, + "colors": [ - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "d/728bf77cc1166d2f3133bf25846876cc/kubernetes-networking-workload?orgId=1&refresh=30s&var-namespace=$namespace&var-type=$type&var-workload=$__cell", + "pattern": "workload", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + }, + { + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "G", + "step": 10 }, + { + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "H", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Status", + "type": "table" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 6, + "panels": [ { "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 4, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 7, "legend": { + "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": true, - "linewidth": 0, + "lines": false, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 24, + "nullPointMode": "null", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 24, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{ workload }}", + "refId": "A", "step": 10 } ], @@ -22433,25 +22974,25 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Saturation (Major Page Faults)", + "title": "Average Rate of Bytes Received", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ { - "format": "rps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -22459,86 +23000,75 @@ items: "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 5, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 8, "legend": { + "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": true, - "linewidth": 0, + "lines": false, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 24, + "nullPointMode": "null", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ - { - "alias": "/ Receive/", - "stack": "A" - }, - { - "alias": "/ Transmit/", - "stack": "B", - "transform": "negative-Y" - } + ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 24, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Receive", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - }, - { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Transmit", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{ workload }}", + "refId": "A", "step": 10 } ], @@ -22547,20 +23077,20 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Utilisation (Bytes Receive/Transmit)", + "title": "Average Rate of Bytes Transmitted", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, - "mode": "time", + "mode": "series", "name": null, - "show": true, + "show": false, "values": [ - + "current" ] }, "yaxes": [ @@ -22569,19 +23099,262 @@ items: "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Average Bandwidth", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 9, + "panels": [ + + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bandwidth HIstory", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Receive Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 12, + "panels": [ { "aliasColors": { @@ -22590,57 +23363,56 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 6, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 13, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 2, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ - { - "alias": "/ Receive/", - "stack": "A" - }, - { - "alias": "/ Transmit/", - "stack": "B", - "transform": "negative-Y" - } + ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Receive", - "legendLink": "/dashboard/file/node-rsrc-use.json", - "step": 10 - }, - { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Transmit", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", "step": 10 } ], @@ -22649,10 +23421,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Saturation (Drops Receive/Transmit)", + "title": "Rate of Received Packets", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -22667,35 +23439,23 @@ items: }, "yaxes": [ { - "format": "rps", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -22704,41 +23464,56 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 7, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 14, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 2, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{device}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", "step": 10 } ], @@ -22747,10 +23522,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Disk IO Utilisation", + "title": "Rate of Transmitted Packets", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -22765,23 +23540,43 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "pps", "label": null, "logBase": 1, - "max": 1, + "max": null, "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Packets", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 15, + "panels": [ { "aliasColors": { @@ -22790,41 +23585,56 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 8, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 16, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 2, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 12, "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{device}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", "step": 10 } ], @@ -22833,10 +23643,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Disk IO Saturation", + "title": "Rate of Received Packets Dropped", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -22851,35 +23661,23 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "pps", "label": null, "logBase": 1, - "max": 1, + "max": null, "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk IO", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -22888,27 +23686,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 9, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 17, "legend": { + "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 2, "links": [ ], - "nullPointMode": "null as zero", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], @@ -22918,11 +23731,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "legendLink": "/dashboard/file/node-rsrc-use.json", + "intervalFactor": 1, + "legendFormat": "{{workload}}", + "refId": "A", "step": 10 } ], @@ -22931,10 +23744,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Disk Space Utilisation", + "title": "Rate of Transmitted Packets Dropped", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -22949,20 +23762,20 @@ items: }, "yaxes": [ { - "format": "percentunit", + "format": "pps", "label": null, "logBase": 1, - "max": 1, + "max": null, "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, - "show": false + "min": 0, + "show": true } ] } @@ -22971,14 +23784,19 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Disk Space", - "titleSize": "h6" + "title": "Errors", + "titleSize": "h6", + "type": "row" } ], - "schemaVersion": 14, + "refresh": "10s", + "rows": [ + + ], + "schemaVersion": 18, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -22997,6 +23815,176 @@ items: "refresh": 1, "regex": "", "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "kube-system", + "value": "kube-system" + }, + "datasource": "$datasource", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "deployment", + "value": "deployment" + }, + "datasource": "$datasource", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "type", + "options": [ + + ], + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": true, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "30s,5m,1h", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "4h", + "value": "4h" + } + ], + "query": "4h", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false } ] }, @@ -23030,35 +24018,47 @@ items: ] }, "timezone": "UTC", - "title": "USE Method / Cluster", - "uid": "3e97d1d02672cdd0861f4c97c64f89b2", + "title": "Kubernetes / Networking / Namespace (Workload)", + "uid": "bbb2a765a623ae38130206c7d94a160f", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-node-cluster-rsrc-use + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-namespace-by-workload namespace: monitoring - apiVersion: v1 data: - node-rsrc-use.json: |- + node-cluster-rsrc-use.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ ] }, - "editable": true, + "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, + "id": null, "links": [ ], - "refresh": "10s", + "refresh": "30s", "rows": [ { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23068,14 +24068,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 1, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23084,26 +24091,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "((\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n *\n instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}\n) != 0 )\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\", cluster=\"$cluster\"}))\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 + "legendFormat": "{{ instance }}", + "refId": "A" } ], "thresholds": [ @@ -23113,8 +24120,8 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23133,16 +24140,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23154,14 +24161,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 2, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23170,26 +24184,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23199,8 +24213,8 @@ items: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23219,16 +24233,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23238,11 +24252,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "CPU", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23252,14 +24267,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 3, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23268,26 +24290,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\", job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Memory", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23297,8 +24319,8 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23317,16 +24339,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23338,14 +24360,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 4, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23354,26 +24383,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Major page faults", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23383,8 +24412,8 @@ items: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23399,20 +24428,20 @@ items: }, "yaxes": [ { - "format": "short", + "format": "rds", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "rds", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23422,11 +24451,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Memory", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23436,14 +24466,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 5, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23452,11 +24489,12 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ { "alias": "/Receive/", @@ -23470,24 +24508,22 @@ items: ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Receive", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Receive", + "refId": "A" }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Transmit", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Transmit", + "refId": "B" } ], "thresholds": [ @@ -23495,10 +24531,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Utilisation (Bytes Receive/Transmit)", + "title": "Network Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23521,12 +24557,12 @@ items: "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23538,14 +24574,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 6, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23554,42 +24597,41 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ { - "alias": "/Receive/", + "alias": "/ Receive/", "stack": "A" }, { - "alias": "/Transmit/", + "alias": "/ Transmit/", "stack": "B", "transform": "negative-Y" } ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Receive drops", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Receive", + "refId": "A" }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Transmit drops", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} Transmit", + "refId": "B" } ], "thresholds": [ @@ -23597,10 +24639,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Net Saturation (Drops Receive/Transmit)", + "title": "Network Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23615,7 +24657,7 @@ items: }, "yaxes": [ { - "format": "rps", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -23623,12 +24665,12 @@ items: "show": true }, { - "format": "short", + "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23637,12 +24679,13 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Net", - "titleSize": "h6" + "title": "Network", + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23652,14 +24695,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 7, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23668,26 +24718,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} {{device}}", + "refId": "A" } ], "thresholds": [ @@ -23697,8 +24747,8 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23717,16 +24767,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -23738,14 +24788,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 8, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23754,26 +24811,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}\n / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", cluster=\"$cluster\"}))\n) != 0\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}} {{device}}", + "refId": "A" } ], "thresholds": [ @@ -23783,8 +24840,8 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23803,16 +24860,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23822,11 +24879,12 @@ items: "repeatRowId": null, "showTitle": true, "title": "Disk IO", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -23836,14 +24894,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 9, + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -23852,26 +24917,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "1 -\n(\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\"})\n/\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\"})\n)\n", + "expr": "sum without (device) (\n max without (fstype, mountpoint) ((\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"}\n ) != 0)\n)\n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", cluster=\"$cluster\"})))\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{device}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -23881,8 +24946,8 @@ items: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -23901,16 +24966,16 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -23920,20 +24985,21 @@ items: "repeatRowId": null, "showTitle": true, "title": "Disk Space", - "titleSize": "h6" + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, "style": "dark", "tags": [ - + "node-exporter-mixin" ], "templating": { "list": [ { "current": { - "text": "default", - "value": "default" + "text": "Prometheus", + "value": "Prometheus" }, "hide": 0, "label": null, @@ -23949,22 +25015,22 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", - "hide": 0, + "hide": 2, "includeAll": false, - "label": "instance", + "label": null, "multi": false, - "name": "instance", + "name": "cluster", "options": [ ], - "query": "label_values(up{job=\"node-exporter\"}, instance)", - "refresh": 1, + "query": "label_values(node_time_seconds, cluster)", + "refresh": 2, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -24004,18 +25070,22 @@ items: "30d" ] }, - "timezone": "UTC", - "title": "USE Method / Node", - "uid": "fac67cfbe174d3ef53eb473d73d9212f", + "timezone": "utc", + "title": "Node Exporter / USE Method / Cluster", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-node-rsrc-use + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-node-cluster-rsrc-use namespace: monitoring - apiVersion: v1 data: - nodes.json: |- + node-rsrc-use.json: |- { "__inputs": [ @@ -24030,13 +25100,13 @@ items: }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "refresh": "", + "refresh": "30s", "rows": [ { "collapse": false, @@ -24050,7 +25120,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24063,7 +25133,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24088,11 +25158,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", - "interval": "1m", - "intervalFactor": 5, - "legendFormat": "{{cpu}}", + "intervalFactor": 2, + "legendFormat": "Utilisation", "refId": "A" } ], @@ -24101,10 +25170,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "CPU Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24122,16 +25191,16 @@ items: "format": "percentunit", "label": null, "logBase": 1, - "max": 1, - "min": 0, + "max": null, + "min": null, "show": true }, { "format": "percentunit", "label": null, "logBase": 1, - "max": 1, - "min": 0, + "max": null, + "min": null, "show": true } ] @@ -24144,7 +25213,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 0, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24157,7 +25226,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24178,36 +25247,15 @@ items: ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "1m load average", + "legendFormat": "Saturation", "refId": "A" - }, - { - "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5m load average", - "refId": "B" - }, - { - "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "15m load average", - "refId": "C" - }, - { - "expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "logical cores", - "refId": "D" } ], "thresholds": [ @@ -24215,10 +25263,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Load Average", + "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24233,19 +25281,19 @@ items: }, "yaxes": [ { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -24254,8 +25302,8 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "CPU", "titleSize": "h6", "type": "row" }, @@ -24271,7 +25319,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24284,7 +25332,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24304,37 +25352,16 @@ items: ], "spaceLength": 10, - "span": 9, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "memory used", + "legendFormat": "Utilisation", "refId": "A" - }, - { - "expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "refId": "B" - }, - { - "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory cached", - "refId": "C" - }, - { - "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory free", - "refId": "D" } ], "thresholds": [ @@ -24342,10 +25369,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24360,113 +25387,122 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, "gridPos": { }, "id": 5, - "interval": null, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } + ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, "targets": [ { - "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "Major page Faults", "refId": "A" } ], - "thresholds": "80, 90", - "title": "Memory Usage", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": false + "shared": true, + "sort": 2, + "value_type": "individual" }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "N/A", - "value": "null" + "format": "rds", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "rds", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "valueName": "current" + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Memory", "titleSize": "h6", "type": "row" }, @@ -24482,7 +25518,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 0, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24495,7 +25531,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24513,42 +25549,33 @@ items: "repeat": null, "seriesOverrides": [ { - "alias": "/ read| written/", - "yaxis": 1 + "alias": "/Receive/", + "stack": "A" }, { - "alias": "/ io time/", - "yaxis": 2 + "alias": "/Transmit/", + "stack": "B", + "transform": "negative-Y" } ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", - "interval": "1m", "intervalFactor": 2, - "legendFormat": "{{device}} read", + "legendFormat": "Receive", "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", - "interval": "1m", "intervalFactor": 2, - "legendFormat": "{{device}} written", + "legendFormat": "Transmit", "refId": "B" - }, - { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", - "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{device}} io time", - "refId": "C" } ], "thresholds": [ @@ -24556,10 +25583,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Disk I/O", + "title": "Network Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24574,7 +25601,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -24582,7 +25609,7 @@ items: "show": true }, { - "format": "s", + "format": "Bps", "label": null, "logBase": 1, "max": null, @@ -24599,7 +25626,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24612,7 +25639,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24630,12 +25657,13 @@ items: "repeat": null, "seriesOverrides": [ { - "alias": "used", - "color": "#E0B400" + "alias": "/ Receive/", + "stack": "A" }, { - "alias": "available", - "color": "#73BF69" + "alias": "/ Transmit/", + "stack": "B", + "transform": "negative-Y" } ], "spaceLength": 10, @@ -24644,17 +25672,17 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "used", + "legendFormat": "Receive", "refId": "A" }, { - "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", "intervalFactor": 2, - "legendFormat": "available", + "legendFormat": "Transmit", "refId": "B" } ], @@ -24663,10 +25691,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Disk Space Usage", + "title": "Network Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24681,19 +25709,19 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "Bps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "Bps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -24702,8 +25730,8 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Network", "titleSize": "h6", "type": "row" }, @@ -24719,7 +25747,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 0, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24732,7 +25760,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24753,13 +25781,12 @@ items: ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -24770,10 +25797,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Network Received", + "title": "Disk IO Utilisation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24788,19 +25815,19 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -24813,7 +25840,7 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 0, + "fill": 10, "fillGradient": 0, "gridPos": { @@ -24826,7 +25853,7 @@ items: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -24847,13 +25874,12 @@ items: ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\", cluster=\"$cluster\"} != 0", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -24864,10 +25890,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Network Transmitted", + "title": "Disk IO Saturation", "tooltip": { - "shared": false, - "sort": 0, + "shared": true, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -24882,19 +25908,19 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -24903,8 +25929,114 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Disk IO", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(1 -\n (\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n /\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n ) != 0\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Space Utilisation", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk Space", "titleSize": "h6", "type": "row" } @@ -24912,7 +26044,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "node-exporter-mixin" ], "templating": { "list": [ @@ -24932,6 +26064,33 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_time_seconds, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -24946,10 +26105,10 @@ items: "options": [ ], - "query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)", + "query": "label_values(node_exporter_build_info{job=\"node-exporter\", cluster=\"$cluster\"}, instance)", "refresh": 2, "regex": "", - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -24989,18 +26148,22 @@ items: "30d" ] }, - "timezone": "UTC", - "title": "Nodes", - "uid": "fa49a4706d07a042595b664c87fb33ea", + "timezone": "utc", + "title": "Node Exporter / USE Method / Node", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-nodes + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-node-rsrc-use namespace: monitoring - apiVersion: v1 data: - persistentvolumesusage.json: |- + nodes.json: |- { "__inputs": [ @@ -25015,13 +26178,13 @@ items: }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "refresh": "10s", + "refresh": "30s", "rows": [ { "collapse": false, @@ -25042,16 +26205,16 @@ items: }, "id": 2, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -25068,23 +26231,16 @@ items: ], "spaceLength": 10, - "span": 9, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Used Space", + "intervalFactor": 5, + "legendFormat": "{{cpu}}", "refId": "A" - }, - { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Free Space", - "refId": "B" } ], "thresholds": [ @@ -25092,9 +26248,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Volume Space Usage", + "title": "CPU Usage", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -25110,106 +26266,136 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, - "max": null, + "max": 1, "min": 0, "show": true }, { - "format": "bytes", + "format": "percentunit", "label": null, "logBase": 1, - "max": null, + "max": 1, "min": 0, "show": true } ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 0, + "fillGradient": 0, "gridPos": { }, "id": 3, - "interval": null, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "1m load average", + "refId": "A" }, { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5m load average", + "refId": "B" + }, { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ + "expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "15m load average", + "refId": "C" + }, { - "expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "A" + "legendFormat": "logical cores", + "refId": "D" } ], - "thresholds": "80, 90", - "title": "Volume Space Usage", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Load Average", "tooltip": { - "shared": false + "shared": true, + "sort": 0, + "value_type": "individual" }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, { - "op": "=", - "text": "N/A", - "value": "null" + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true } - ], - "valueName": "current" + ] } ], "repeat": null, @@ -25239,16 +26425,16 @@ items: }, "id": 4, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -25270,18 +26456,32 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Used inodes", + "intervalFactor": 2, + "legendFormat": "memory used", "refId": "A" }, { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", - "intervalFactor": 1, - "legendFormat": " Free inodes", + "intervalFactor": 2, + "legendFormat": "memory buffers", "refId": "B" + }, + { + "expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" } ], "thresholds": [ @@ -25289,9 +26489,9 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Volume inodes Usage", + "title": "Memory Usage", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -25307,7 +26507,7 @@ items: }, "yaxes": [ { - "format": "none", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -25315,7 +26515,7 @@ items: "show": true }, { - "format": "none", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -25349,756 +26549,308 @@ items: "interval": null, "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "Volume inodes Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "PersistentVolumeClaim", - "multi": false, - "name": "volume", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\"}, persistentvolumeclaim)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-7d", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / Persistent Volumes", - "uid": "919b92a8e8041bd567af9edab12c840c", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-persistentvolumesusage - namespace: monitoring -- apiVersion: v1 - data: - pod-total.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "panels": [ - { - "collapse": false, - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "panels": [ - - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Current Bandwidth", - "titleSize": "h6", - "type": "row" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 0, - "format": "time_series", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 1 - }, - "height": 9, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 12, - "nullPointMode": "connected", - "nullText": null, - "options": { - "fieldOptions": { - "calcs": [ - "last" - ], - "defaults": { - "max": 10000000000, - "min": 0, - "title": "$namespace: $pod", - "unit": "Bps" - }, - "mappings": [ - - ], - "override": { - - }, - "thresholds": [ - { - "color": "dark-green", - "index": 0, - "value": null - }, - { - "color": "dark-yellow", - "index": 1, - "value": 5000000000 - }, - { - "color": "dark-red", - "index": 2, - "value": 7000000000 - } - ], - "values": false - } - }, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 12, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", - "format": "time_series", - "instant": null, - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Current Rate of Bytes Received", - "type": "gauge", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 0, - "format": "time_series", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 1 - }, - "height": 9, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 12, - "nullPointMode": "connected", - "nullText": null, - "options": { - "fieldOptions": { - "calcs": [ - "last" - ], - "defaults": { - "max": 10000000000, - "min": 0, - "title": "$namespace: $pod", - "unit": "Bps" - }, - "mappings": [ - - ], - "override": { - - }, - "thresholds": [ + ], + "mappingType": 1, + "mappingTypes": [ { - "color": "dark-green", - "index": 0, - "value": null + "name": "value to text", + "value": 1 }, { - "color": "dark-yellow", - "index": 1, - "value": 5000000000 - }, + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ { - "color": "dark-red", - "index": 2, - "value": 7000000000 + "from": "null", + "text": "N/A", + "to": "null" } ], - "values": false - } - }, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 12, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", - "format": "time_series", - "instant": null, - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Current Rate of Bytes Transmitted", - "type": "gauge", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" } - ], - "valueName": "current" - }, - { - "collapse": false, - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 5, - "panels": [ - ], "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Bandwidth", + "showTitle": false, + "title": "Dashboard Row", "titleSize": "h6", "type": "row" }, { - "aliasColors": { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 11 - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 0, + "fillGradient": 0, + "gridPos": { - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/ read| written/", + "yaxis": 1 + }, + { + "alias": "/ io time/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} read", + "refId": "A" + }, + { + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} written", + "refId": "B" + }, + { + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} io time", + "refId": "C" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Receive Bandwidth", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 11 - }, - "id": 7, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { - ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "used", + "color": "#E0B400" + }, + { + "alias": "available", + "color": "#73BF69" + } + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "used", + "refId": "A" + }, + { + "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B" + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Transmit Bandwidth", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] } - ] + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" }, { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 8, + "collapse": false, + "collapsed": false, "panels": [ { "aliasColors": { @@ -26108,21 +26860,16 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 0, "fillGradient": 0, "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 21 + }, - "id": 9, + "id": 8, "legend": { "alignAsTable": false, "avg": false, "current": false, - "hideEmpty": true, - "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -26132,13 +26879,11 @@ items: "values": false }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -26148,17 +26893,16 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "span": 6, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" } ], "thresholds": [ @@ -26166,10 +26910,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Network Received", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -26184,7 +26928,7 @@ items: }, "yaxes": [ { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26192,7 +26936,7 @@ items: "show": true }, { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26209,21 +26953,16 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 0, "fillGradient": 0, "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 21 + }, - "id": 10, + "id": 9, "legend": { "alignAsTable": false, "avg": false, "current": false, - "hideEmpty": true, - "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -26233,13 +26972,11 @@ items: "values": false }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -26249,17 +26986,16 @@ items: ], "spaceLength": 10, - "span": 12, - "stack": true, + "span": 6, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" } ], "thresholds": [ @@ -26267,10 +27003,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "Network Transmitted", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -26285,7 +27021,7 @@ items: }, "yaxes": [ { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26293,7 +27029,7 @@ items: "show": true }, { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26306,21 +27042,133 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Packets", + "showTitle": false, + "title": "Dashboard Row", "titleSize": "h6", "type": "row" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "node-exporter-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" }, - "id": 11, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Node Exporter / Nodes", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-nodes + namespace: monitoring +- apiVersion: v1 + data: + persistentvolumesusage.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, "panels": [ { "aliasColors": { @@ -26330,37 +27178,30 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 32 + }, - "id": 12, + "id": 2, "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": false + "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -26370,17 +27211,23 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 9, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "legendFormat": "Used Space", + "refId": "A" + }, + { + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Free Space", + "refId": "B" } ], "thresholds": [ @@ -26388,10 +27235,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "Volume Space Usage", "tooltip": { - "shared": true, - "sort": 2, + "shared": false, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -26406,7 +27253,7 @@ items: }, "yaxes": [ { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26414,7 +27261,7 @@ items: "show": true }, { - "format": "pps", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -26423,6 +27270,103 @@ items: } ] }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Volume Space Usage", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -26431,37 +27375,30 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 2, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 32 + }, - "id": 13, + "id": 4, "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": false + "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [ ], - "minSpan": 12, - "nullPointMode": "connected", - "paceLength": 10, + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -26471,17 +27408,23 @@ items: ], "spaceLength": 10, - "span": 12, + "span": 9, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{pod}}", - "refId": "A", - "step": 10 + "legendFormat": "Used inodes", + "refId": "A" + }, + { + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": " Free inodes", + "refId": "B" } ], "thresholds": [ @@ -26489,10 +27432,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Volume inodes Usage", "tooltip": { - "shared": true, - "sort": 2, + "shared": false, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -26507,7 +27450,7 @@ items: }, "yaxes": [ { - "format": "pps", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -26515,7 +27458,7 @@ items: "show": true }, { - "format": "pps", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -26523,22 +27466,102 @@ items: "show": true } ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Volume inodes Usage", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": true, - "title": "Errors", + "showTitle": false, + "title": "Dashboard Row", "titleSize": "h6", "type": "row" } ], - "refresh": "10s", - "rows": [ - - ], - "schemaVersion": 18, + "schemaVersion": 14, "style": "dark", "tags": [ "kubernetes-mixin" @@ -26562,60 +27585,22 @@ items: "type": "datasource" }, { - "allValue": ".+", - "auto": false, - "auto_count": 30, - "auto_min": "10s", + "allValue": null, "current": { - "text": "kube-system", - "value": "kube-system" - }, - "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tags": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "", - "value": "" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", - "hide": 0, + "hide": 2, "includeAll": false, - "label": null, + "label": "cluster", "multi": false, - "name": "pod", + "name": "cluster", "options": [ ], - "query": "label_values(container_network_receive_packets_total{namespace=~\"$namespace\"}, pod)", - "refresh": 1, + "query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ @@ -26627,88 +27612,60 @@ items: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "5m", - "value": "5m" + }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": null, + "label": "Namespace", "multi": false, - "name": "resolution", + "name": "namespace", "options": [ - { - "selected": false, - "text": "30s", - "value": "30s" - }, - { - "selected": true, - "text": "5m", - "value": "5m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - } + ], - "query": "30s,5m,1h", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, namespace)", "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "5m", - "value": "5m" + }, "datasource": "$datasource", - "hide": 2, + "hide": 0, "includeAll": false, - "label": null, + "label": "PersistentVolumeClaim", "multi": false, - "name": "interval", + "name": "volume", "options": [ - { - "selected": true, - "text": "4h", - "value": "4h" - } + ], - "query": "4h", + "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\"}, persistentvolumeclaim)", "refresh": 2, "regex": "", - "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [ ], "tagsQuery": "", - "type": "interval", + "type": "query", "useTags": false } ] }, "time": { - "from": "now-1h", + "from": "now-7d", "to": "now" }, "timepicker": { @@ -26737,17 +27694,22 @@ items: ] }, "timezone": "UTC", - "title": "Kubernetes / Networking / Pod", - "uid": "7a18067ce943a40ae25454675c19ff5c", + "title": "Kubernetes / Persistent Volumes", + "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-pod-total + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-persistentvolumesusage namespace: monitoring - apiVersion: v1 data: - prometheus-remote-write.json: |- + pod-total.json: |- { "__inputs": [ @@ -26757,7 +27719,15 @@ items: ], "annotations": { "list": [ - + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } ] }, "editable": true, @@ -26768,504 +27738,516 @@ items: "links": [ ], - "refresh": "", - "rows": [ + "panels": [ { "collapse": false, "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Highest Timestamp In vs. Highest Timestamp Sent", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Current Bandwidth", + "titleSize": "h6", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 0, + "format": "time_series", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "height": 9, + "id": 3, + "interval": null, + "links": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" - } + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "minSpan": 12, + "nullPointMode": "connected", + "nullText": null, + "options": { + "fieldOptions": { + "calcs": [ + "last" ], - "thresholds": [ + "defaults": { + "max": 10000000000, + "min": 0, + "title": "$namespace: $pod", + "unit": "Bps" + }, + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Rate[5m]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "override": { - ] }, - "yaxes": [ + "thresholds": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "color": "dark-green", + "index": 0, + "value": null }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "color": "dark-yellow", + "index": 1, + "value": 5000000000 + }, + { + "color": "dark-red", + "index": 2, + "value": 7000000000 } - ] + ], + "values": false + } + }, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Timestamps", - "titleSize": "h6", - "type": "row" + "span": 12, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", + "format": "time_series", + "instant": null, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Current Rate of Bytes Received", + "type": "gauge", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 0, + "format": "time_series", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "height": 9, + "id": 4, + "interval": null, + "links": [ + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "minSpan": 12, + "nullPointMode": "connected", + "nullText": null, + "options": { + "fieldOptions": { + "calcs": [ + "last" ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" - } - ], - "thresholds": [ + "defaults": { + "max": 10000000000, + "min": 0, + "title": "$namespace: $pod", + "unit": "Bps" + }, + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Rate, in vs. succeeded or dropped [5m]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "override": { - ] }, - "yaxes": [ + "thresholds": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "color": "dark-green", + "index": 0, + "value": null }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "color": "dark-yellow", + "index": 1, + "value": 5000000000 + }, + { + "color": "dark-red", + "index": 2, + "value": 7000000000 } - ] + ], + "values": false + } + }, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 12, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution]))", + "format": "time_series", + "instant": null, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Current Rate of Bytes Transmitted", + "type": "gauge", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" } + ], + "valueName": "current" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 5, + "panels": [ + ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Samples", + "title": "Bandwidth", "titleSize": "h6", "type": "row" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": { - }, - "id": 5, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ - ], - "minSpan": 6, - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Current Shards", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Receive Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Max Shards", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 8, + "panels": [ { "aliasColors": { @@ -27274,16 +28256,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "fillGradient": 0, "gridPos": { - + "h": 10, + "w": 12, + "x": 0, + "y": 21 }, - "id": 7, + "id": 9, "legend": { "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -27293,11 +28280,13 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -27307,16 +28296,17 @@ items: ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 12, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -27324,10 +28314,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Min Shards", + "title": "Rate of Received Packets", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -27342,19 +28332,19 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -27367,16 +28357,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "fillGradient": 0, "gridPos": { - + "h": 10, + "w": 12, + "x": 12, + "y": 21 }, - "id": 8, + "id": 10, "legend": { "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -27386,11 +28381,13 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -27400,16 +28397,17 @@ items: ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 12, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -27417,10 +28415,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Desired Shards", + "title": "Rate of Transmitted Packets", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -27435,19 +28433,19 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -27457,13 +28455,20 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Shards", + "title": "Packets", "titleSize": "h6", "type": "row" }, { - "collapse": false, - "collapsed": false, + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 11, "panels": [ { "aliasColors": { @@ -27473,16 +28478,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "fillGradient": 0, "gridPos": { - + "h": 10, + "w": 12, + "x": 0, + "y": 32 }, - "id": 9, + "id": 12, "legend": { "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -27492,11 +28502,13 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -27506,16 +28518,17 @@ items: ], "spaceLength": 10, - "span": 6, - "stack": false, + "span": 12, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_shard_capacity{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -27523,10 +28536,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Shard Capacity", + "title": "Rate of Received Packets Dropped", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -27541,19 +28554,19 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -27566,16 +28579,21 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 2, "fillGradient": 0, "gridPos": { - + "h": 10, + "w": 12, + "x": 12, + "y": 32 }, - "id": 10, + "id": 13, "legend": { "alignAsTable": false, "avg": false, "current": false, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": false, @@ -27585,11 +28603,13 @@ items: "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [ ], - "nullPointMode": "null", + "minSpan": 12, + "nullPointMode": "connected", + "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, @@ -27599,16 +28619,17 @@ items: ], "spaceLength": 10, - "span": 6, - "stack": false, + "span": 12, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[$interval:$resolution])) by (pod)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 10 } ], "thresholds": [ @@ -27616,10 +28637,10 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Pending Samples", + "title": "Rate of Transmitted Packets Dropped", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -27634,19 +28655,19 @@ items: }, "yaxes": [ { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "short", + "format": "pps", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -27656,107 +28677,282 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Shard Details", + "title": "Errors", "titleSize": "h6", "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + } + ], + "refresh": "10s", + "rows": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + ], + "schemaVersion": 18, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ - }, - "id": 11, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "TSDB Current Segment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "kube-system", + "value": "kube-system" + }, + "datasource": "$datasource", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "namespace", + "options": [ - ] + ], + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "30s", + "value": "30s" }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + { + "selected": true, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "30s,5m,1h", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "4h", + "value": "4h" + } + ], + "query": "4h", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / Networking / Pod", + "uid": "7a18067ce943a40ae25454675c19ff5c", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-pod-total + namespace: monitoring +- apiVersion: v1 + data: + prometheus-remote-write.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "60s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -27770,7 +28966,7 @@ items: "gridPos": { }, - "id": 12, + "id": 2, "legend": { "alignAsTable": false, "avg": false, @@ -27803,10 +28999,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} != 0)\n)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}} {{consumer}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27815,7 +29011,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Remote Write Current Segment", + "title": "Highest Timestamp In vs. Highest Timestamp Sent", "tooltip": { "shared": true, "sort": 0, @@ -27833,7 +29029,7 @@ items: }, "yaxes": [ { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -27849,20 +29045,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Segments", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -27876,7 +29059,7 @@ items: "gridPos": { }, - "id": 13, + "id": 3, "legend": { "alignAsTable": false, "avg": false, @@ -27904,12 +29087,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "clamp_min(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n, 0)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -27921,7 +29104,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Dropped Samples", + "title": "Rate[5m]", "tooltip": { "shared": true, "sort": 0, @@ -27955,7 +29138,20 @@ items: "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Timestamps", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -27969,7 +29165,7 @@ items: "gridPos": { }, - "id": 14, + "id": 4, "legend": { "alignAsTable": false, "avg": false, @@ -27997,12 +29193,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n- \n (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28014,7 +29210,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Failed Samples", + "title": "Rate, in vs. succeeded or dropped [5m]", "tooltip": { "shared": true, "sort": 0, @@ -28048,7 +29244,20 @@ items: "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Samples", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -28062,7 +29271,7 @@ items: "gridPos": { }, - "id": 15, + "id": 5, "legend": { "alignAsTable": false, "avg": false, @@ -28080,6 +29289,7 @@ items: "links": [ ], + "minSpan": 6, "nullPointMode": "null", "percentage": false, "pointradius": 5, @@ -28090,12 +29300,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28107,7 +29317,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Retried Samples", + "title": "Current Shards", "tooltip": { "shared": true, "sort": 0, @@ -28155,7 +29365,7 @@ items: "gridPos": { }, - "id": 16, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -28183,12 +29393,12 @@ items: ], "spaceLength": 10, - "span": 3, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_enqueue_retries_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -28200,7 +29410,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Enqueue Retries", + "title": "Max Shards", "tooltip": { "shared": true, "sort": 0, @@ -28234,193 +29444,7 @@ items: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Misc. Rates", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "value": { - "selected": true, - "text": "All", - "value": "$__all" - } - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ - - ], - "query": "label_values(prometheus_build_info, instance)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "value": { - "selected": true, - "text": "All", - "value": "$__all" - } }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_container_info{image=~\".*prometheus.*\"}, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "url", - "options": [ - - ], - "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, url)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Prometheus Remote Write", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-prometheus-remote-write - namespace: monitoring -- apiVersion: v1 - data: - prometheus.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ { "aliasColors": { @@ -28430,13 +29454,20 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -28445,154 +29476,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 4, "stack": false, "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Count", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Uptime", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Instance", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "instance", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Job", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "job", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Version", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTargetBlank": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "version", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], "targets": [ { - "expr": "count by (job, instance, version) (prometheus_build_info{job=~\"$job\", instance=~\"$instance\"})", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "max by (job, instance) (time() - process_start_time_seconds{job=~\"$job\", instance=~\"$instance\"})", - "format": "table", - "instant": true, + "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -28600,14 +29503,13 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Prometheus Stats", + "title": "Min Shards", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, - "transform": "table", - "type": "table", + "type": "graph", "xaxis": { "buckets": null, "mode": "time", @@ -28623,7 +29525,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -28632,22 +29534,10 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Prometheus Stats", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -28657,13 +29547,20 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -28672,26 +29569,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[5m])) by (scrape_job) * 1e3", + "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{scrape_job}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -28699,7 +29596,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Target Sync", + "title": "Desired Shards", "tooltip": { "shared": true, "sort": 0, @@ -28717,11 +29614,11 @@ items: }, "yaxes": [ { - "format": "ms", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -28730,10 +29627,23 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Shards", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -28742,42 +29652,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 3, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(prometheus_sd_discovered_targets{job=~\"$job\",instance=~\"$instance\"})", + "expr": "prometheus_remote_storage_shard_capacity{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Targets", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -28785,7 +29702,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Targets", + "title": "Shard Capacity", "tooltip": { "shared": true, "sort": 0, @@ -28807,7 +29724,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -28816,22 +29733,10 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Discovery", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -28841,13 +29746,20 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, @@ -28856,26 +29768,26 @@ items: "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_target_interval_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m]) * 1e3", + "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{interval}} configured", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -28883,7 +29795,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Average Scrape Interval Duration", + "title": "Pending Samples", "tooltip": { "shared": true, "sort": 0, @@ -28901,11 +29813,11 @@ items: }, "yaxes": [ { - "format": "ms", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -28914,10 +29826,23 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Shard Details", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { @@ -28926,66 +29851,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 5, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 11, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, - "stack": true, + "span": 6, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "exceeded sample limit: {{job}}", - "legendLink": null, - "step": 10 - }, - { - "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "duplicate timestamp: {{job}}", - "legendLink": null, - "step": 10 - }, - { - "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "out of bounds: {{job}}", - "legendLink": null, - "step": 10 - }, - { - "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))", + "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "out of order: {{job}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -28993,7 +29901,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Scrape failures", + "title": "TSDB Current Segment", "tooltip": { "shared": true, "sort": 0, @@ -29011,11 +29919,11 @@ items: }, "yaxes": [ { - "format": "short", + "format": "none", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29024,7 +29932,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -29036,42 +29944,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 6, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 12, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, - "stack": true, + "span": 6, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[5m])", + "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{instance}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{consumer}}", + "refId": "A" } ], "thresholds": [ @@ -29079,7 +29994,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Appended Samples", + "title": "Remote Write Current Segment", "tooltip": { "shared": true, "sort": 0, @@ -29097,11 +30012,11 @@ items: }, "yaxes": [ { - "format": "short", + "format": "none", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29110,7 +30025,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -29119,12 +30034,13 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Retrieval", - "titleSize": "h6" + "title": "Segments", + "titleSize": "h6", + "type": "row" }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "aliasColors": { @@ -29134,42 +30050,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 7, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 13, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}", + "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{instance}} head series", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -29177,7 +30100,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Head Series", + "title": "Dropped Samples", "tooltip": { "shared": true, "sort": 0, @@ -29199,7 +30122,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29208,7 +30131,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -29220,42 +30143,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 8, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 14, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}", + "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{instance}} head chunks", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -29263,7 +30193,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Head Chunks", + "title": "Failed Samples", "tooltip": { "shared": true, "sort": 0, @@ -29285,7 +30215,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29294,22 +30224,10 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Storage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -29318,42 +30236,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 9, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 15, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_engine_query_duration_seconds_count{job=~\"$job\",instance=~\"$instance\",slice=\"inner_eval\"}[5m])", + "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{job}} {{instance}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -29361,7 +30286,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Query Rate", + "title": "Retried Samples", "tooltip": { "shared": true, "sort": 0, @@ -29383,7 +30308,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29392,7 +30317,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] }, @@ -29404,42 +30329,49 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 10, + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 16, "legend": { + "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, + "rightSide": false, "show": true, + "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], - "nullPointMode": "null as zero", + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, - "stack": true, + "span": 3, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "max by (slice) (prometheus_engine_query_duration_seconds{quantile=\"0.9\",job=~\"$job\",instance=~\"$instance\"}) * 1e3", + "expr": "rate(prometheus_remote_storage_enqueue_retries_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{slice}}", - "legendLink": null, - "step": 10 + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", + "refId": "A" } ], "thresholds": [ @@ -29447,7 +30379,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Stage Duration", + "title": "Enqueue Retries", "tooltip": { "shared": true, "sort": 0, @@ -29465,11 +30397,11 @@ items: }, "yaxes": [ { - "format": "ms", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -29478,7 +30410,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ] } @@ -29487,22 +30419,19 @@ items: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Query", - "titleSize": "h6" + "title": "Misc. Rates", + "titleSize": "h6", + "type": "row" } ], "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ { - "current": { - "text": "default", - "value": "default" - }, "hide": 0, "label": null, "name": "datasource", @@ -29517,23 +30446,65 @@ items: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(prometheus_build_info, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "value": { + "selected": true, + "text": "All", + "value": "$__all" + } }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "job", - "multi": true, - "name": "job", + "label": null, + "multi": false, + "name": "cluster", "options": [ ], - "query": "label_values(prometheus_build_info, job)", - "refresh": 1, + "query": "label_values(kube_pod_container_info{image=~\".*prometheus.*\"}, cluster)", + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -29545,23 +30516,21 @@ items: { "allValue": null, "current": { - "selected": true, - "text": "All", - "value": "$__all" + }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": "instance", - "multi": true, - "name": "instance", + "label": null, + "multi": false, + "name": "url", "options": [ ], - "query": "label_values(prometheus_build_info, instance)", - "refresh": 1, + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, url)", + "refresh": 2, "regex": "", - "sort": 2, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -29573,7 +30542,7 @@ items: ] }, "time": { - "from": "now-1h", + "from": "now-6h", "to": "now" }, "timepicker": { @@ -29601,128 +30570,41 @@ items: "30d" ] }, - "timezone": "utc", - "title": "Prometheus Overview", - "uid": "", + "timezone": "browser", + "title": "Prometheus / Remote Write", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-prometheus + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-prometheus-remote-write namespace: monitoring - apiVersion: v1 data: - proxy.json: |- + prometheus.json: |- { - "__inputs": [ - - ], - "__requires": [ - - ], "annotations": { "list": [ ] }, - "editable": false, + "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, - "id": null, "links": [ ], - "refresh": "10s", + "refresh": "60s", "rows": [ { "collapse": false, - "collapsed": false, + "height": "250px", "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{job=\"kube-proxy\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, { "aliasColors": { @@ -29732,20 +30614,13 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 3, + "id": 1, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, @@ -29754,26 +30629,154 @@ items: "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 5, + "span": 12, "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Count", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "hidden", + "unit": "short" + }, + { + "alias": "Uptime", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Instance", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "instance", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Job", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "job", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Version", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "version", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", - "format": "time_series", + "expr": "count by (job, instance, version) (prometheus_build_info{job=~\"$job\", instance=~\"$instance\"})", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "rate", - "refId": "A" + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "max by (job, instance) (time() - process_start_time_seconds{job=~\"$job\", instance=~\"$instance\"})", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 } ], "thresholds": [ @@ -29781,13 +30784,14 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rules Sync Rate", + "title": "Prometheus Stats", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -29799,7 +30803,7 @@ items: }, "yaxes": [ { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -29807,15 +30811,27 @@ items: "show": true }, { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Prometheus Stats", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ { "aliasColors": { @@ -29825,48 +30841,41 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 4, + "id": 2, "legend": { - "alignAsTable": true, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 5, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[5m])) by (scrape_job) * 1e3", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{scrape_job}}", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -29874,7 +30883,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Rule Sync Latency 99th Quantile", + "title": "Target Sync", "tooltip": { "shared": false, "sort": 0, @@ -29892,7 +30901,7 @@ items: }, "yaxes": [ { - "format": "s", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -29900,28 +30909,15 @@ items: "show": true }, { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -29930,49 +30926,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 5, + "fill": 10, + "id": 3, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 6, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(prometheus_sd_discovered_targets{job=~\"$job\",instance=~\"$instance\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "rate", - "refId": "A" + "legendFormat": "Targets", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -29980,7 +30969,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Network Programming Rate", + "title": "Targets", "tooltip": { "shared": false, "sort": 0, @@ -29998,7 +30987,7 @@ items: }, "yaxes": [ { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -30006,15 +30995,27 @@ items: "show": true }, { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Discovery", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ { "aliasColors": { @@ -30024,48 +31025,41 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 6, + "id": 4, "legend": { - "alignAsTable": true, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 4, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "rate(prometheus_target_interval_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m]) * 1e3", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{interval}} configured", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30073,7 +31067,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Network Programming Latency 99th Quantile", + "title": "Average Scrape Interval Duration", "tooltip": { "shared": false, "sort": 0, @@ -30091,7 +31085,7 @@ items: }, "yaxes": [ { - "format": "s", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -30099,28 +31093,15 @@ items: "show": true }, { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -30129,70 +31110,74 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 7, + "fill": 10, + "id": 5, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "span": 4, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" + "legendFormat": "exceeded body size limit: {{job}}", + "legendLink": null, + "step": 10 }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" + "legendFormat": "exceeded sample limit: {{job}}", + "legendLink": null, + "step": 10 }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" + "legendFormat": "duplicate timestamp: {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "out of bounds: {{job}}", + "legendLink": null, + "step": 10 }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" + "legendFormat": "out of order: {{job}}", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30200,7 +31185,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Kube API Request Rate", + "title": "Scrape failures", "tooltip": { "shared": false, "sort": 0, @@ -30218,20 +31203,20 @@ items: }, "yaxes": [ { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] }, @@ -30243,49 +31228,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, + "fill": 10, + "id": 6, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 8, - "stack": false, + "span": 4, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{verb}} {{url}}", - "refId": "A" + "legendFormat": "{{job}} {{instance}}", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30293,7 +31271,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Post Request Latency 99th Quantile", + "title": "Appended Samples", "tooltip": { "shared": false, "sort": 0, @@ -30311,7 +31289,7 @@ items: }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -30319,12 +31297,12 @@ items: "show": true }, { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] } @@ -30332,14 +31310,13 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "showTitle": true, + "title": "Retrieval", + "titleSize": "h6" }, { "collapse": false, - "collapsed": false, + "height": "250px", "panels": [ { "aliasColors": { @@ -30349,49 +31326,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 9, + "fill": 10, + "id": 7, "legend": { - "alignAsTable": true, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{verb}} {{url}}", - "refId": "A" + "legendFormat": "{{job}} {{instance}} head series", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30399,7 +31369,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Get Request Latency 99th Quantile", + "title": "Head Series", "tooltip": { "shared": false, "sort": 0, @@ -30417,7 +31387,7 @@ items: }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -30425,28 +31395,15 @@ items: "show": true }, { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -30455,49 +31412,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 10, + "fill": 10, + "id": 8, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{job}} {{instance}} head chunks", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30505,7 +31455,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory", + "title": "Head Chunks", "tooltip": { "shared": false, "sort": 0, @@ -30523,23 +31473,35 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ { "aliasColors": { @@ -30548,49 +31510,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 11, + "fill": 10, + "id": 9, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-proxy\",instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_engine_query_duration_seconds_count{job=~\"$job\",instance=~\"$instance\",slice=\"inner_eval\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{job}} {{instance}}", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30598,7 +31553,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU usage", + "title": "Query Rate", "tooltip": { "shared": false, "sort": 0, @@ -30628,8 +31583,8 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, - "show": true + "min": null, + "show": false } ] }, @@ -30641,49 +31596,42 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 12, + "fill": 10, + "id": 10, "legend": { - "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, "total": false, "values": false }, "lines": true, - "linewidth": 1, + "linewidth": 0, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "max by (slice) (prometheus_engine_query_duration_seconds{quantile=\"0.9\",job=~\"$job\",instance=~\"$instance\"}) * 1e3", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{slice}}", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -30691,7 +31639,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Goroutines", + "title": "Stage Duration", "tooltip": { "shared": false, "sort": 0, @@ -30709,11 +31657,11 @@ items: }, "yaxes": [ { - "format": "short", + "format": "ms", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { @@ -30722,7 +31670,7 @@ items: "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ] } @@ -30730,16 +31678,15 @@ items: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "showTitle": true, + "title": "Query", + "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [ - "kubernetes-mixin" + "prometheus-mixin" ], "templating": { "list": [ @@ -30760,23 +31707,53 @@ items: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ + + ], + "query": "label_values(prometheus_build_info{job=\"prometheus-k8s\",namespace=\"monitoring\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" }, "datasource": "$datasource", "hide": 0, "includeAll": true, - "label": null, - "multi": false, + "label": "instance", + "multi": true, "name": "instance", "options": [ ], - "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\"}, instance)", - "refresh": 2, + "query": "label_values(prometheus_build_info{job=~\"$job\"}, instance)", + "refresh": 1, "regex": "", - "sort": 1, + "sort": 2, "tagValuesQuery": "", "tags": [ @@ -30816,18 +31793,23 @@ items: "30d" ] }, - "timezone": "UTC", - "title": "Kubernetes / Proxy", - "uid": "632e265de029684c40b21cb76bca4f94", + "timezone": "utc", + "title": "Prometheus / Overview", + "uid": "", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-proxy + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-prometheus namespace: monitoring - apiVersion: v1 data: - scheduler.json: |- + proxy.json: |- { "__inputs": [ @@ -30915,7 +31897,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-scheduler\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-proxy\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -30953,16 +31935,16 @@ items: }, "id": 3, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -30984,32 +31966,11 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "rate", "refId": "A" - }, - { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} binding", - "refId": "B" - }, - { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", - "refId": "C" - }, - { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} volume", - "refId": "D" } ], "thresholds": [ @@ -31017,7 +31978,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Scheduling Rate", + "title": "Rules Sync Rate", "tooltip": { "shared": false, "sort": 0, @@ -31098,32 +32059,210 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "{{instance}}", "refId": "A" - }, + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Sync Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} binding", - "refId": "B" + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", - "refId": "C" + "legendFormat": "rate", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Programming Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} volume", - "refId": "D" + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -31131,7 +32270,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Scheduling latency 99th Quantile", + "title": "Network Programming Latency 99th Quantile", "tooltip": { "shared": false, "sort": 0, @@ -31192,7 +32331,7 @@ items: "gridPos": { }, - "id": 5, + "id": 7, "legend": { "alignAsTable": false, "avg": false, @@ -31225,28 +32364,28 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -31280,7 +32419,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { @@ -31288,7 +32427,7 @@ items: "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -31306,7 +32445,7 @@ items: "gridPos": { }, - "id": 6, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -31339,7 +32478,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -31412,7 +32551,7 @@ items: "gridPos": { }, - "id": 7, + "id": 9, "legend": { "alignAsTable": true, "avg": false, @@ -31445,7 +32584,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -31518,7 +32657,7 @@ items: "gridPos": { }, - "id": 8, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -31551,7 +32690,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-scheduler\", instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31611,7 +32750,7 @@ items: "gridPos": { }, - "id": 9, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -31644,7 +32783,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31674,7 +32813,7 @@ items: }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -31682,7 +32821,7 @@ items: "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -31704,7 +32843,7 @@ items: "gridPos": { }, - "id": 10, + "id": 12, "legend": { "alignAsTable": false, "avg": false, @@ -31737,7 +32876,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-scheduler\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -31821,6 +32960,32 @@ items: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -31831,7 +32996,7 @@ items: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-scheduler\"}, instance)", + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -31870,310 +33035,52 @@ items: "12h", "24h", "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / Scheduler", - "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", - "version": 0 - } - kind: ConfigMap - metadata: - name: grafana-dashboard-scheduler - namespace: monitoring -- apiVersion: v1 - data: - statefulset.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "CPU", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Memory", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "Bps", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Network", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / Proxy", + "uid": "632e265de029684c40b21cb76bca4f94", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-proxy + namespace: monitoring +- apiVersion: v1 + data: + scheduler.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ { "collapse": false, "collapsed": false, - "height": "100px", "panels": [ { "cacheTimeout": null, @@ -32196,7 +33103,7 @@ items: "gridPos": { }, - "id": 5, + "id": 2, "interval": null, "links": [ @@ -32226,7 +33133,7 @@ items: "to": "null" } ], - "span": 3, + "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, @@ -32236,7 +33143,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-scheduler\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -32244,7 +33151,7 @@ items: } ], "thresholds": "", - "title": "Desired Replicas", + "title": "Up", "tooltip": { "shared": false }, @@ -32253,277 +33160,367 @@ items: "valueMaps": [ { "op": "=", - "text": "0", + "text": "N/A", "value": "null" } ], - "valueName": "current" + "valueName": "min" }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { }, - "id": 6, - "interval": null, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" }, { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "A" + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" } ], - "thresholds": "", - "title": "Replicas of current version", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling Rate", "tooltip": { - "shared": false + "shared": false, + "sort": 0, + "value_type": "individual" }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "0", - "value": "null" + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true } - ], - "valueName": "current" + ] }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { }, - "id": 7, - "interval": null, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" }, { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "A" + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" } ], - "thresholds": "", - "title": "Observed Generation", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling latency 99th Quantile", "tooltip": { - "shared": false + "shared": false, + "sort": 0, + "value_type": "individual" }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "0", - "value": "null" + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true } - ], - "valueName": "current" - }, + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, "gridPos": { }, - "id": 8, - "interval": null, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, "links": [ ], - "mappingType": 1, - "mappingTypes": [ + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ { - "name": "value to text", - "value": 1 + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" }, { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "", - "refId": "A" + "legendFormat": "5xx", + "refId": "D" } ], - "thresholds": "", - "title": "Metadata Generation", + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", "tooltip": { - "shared": false + "shared": false, + "sort": 0, + "value_type": "individual" }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ { - "op": "=", - "text": "0", - "value": "null" + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + ] + }, { "aliasColors": { @@ -32537,7 +33534,7 @@ items: "gridPos": { }, - "id": 9, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -32565,43 +33562,414 @@ items: ], "spaceLength": 10, + "span": 8, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "replicas specified", + "legendFormat": "{{verb}} {{url}}", "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "replicas created", - "refId": "B" + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "ready", - "refId": "C" + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "replicas of current version", - "refId": "D" + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true }, { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "updated", - "refId": "E" + "legendFormat": "{{instance}}", + "refId": "A" } ], "thresholds": [ @@ -32609,7 +33977,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Replicas", + "title": "Goroutines", "tooltip": { "shared": false, "sort": 0, @@ -32691,33 +34059,7 @@ items: "options": [ ], - "query": "label_values(kube_statefulset_metadata_generation, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", + "query": "label_values(up{job=\"kube-scheduler\"}, cluster)", "refresh": 2, "regex": "", "sort": 1, @@ -32736,14 +34078,14 @@ items: }, "datasource": "$datasource", "hide": 0, - "includeAll": false, - "label": "Name", + "includeAll": true, + "label": null, "multi": false, - "name": "statefulset", + "name": "instance", "options": [ ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -32787,13 +34129,18 @@ items: ] }, "timezone": "UTC", - "title": "Kubernetes / StatefulSets", - "uid": "a31c1f46e6f727cb37c0d731a7245005", + "title": "Kubernetes / Scheduler", + "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", "version": 0 } kind: ConfigMap metadata: - name: grafana-dashboard-statefulset + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 + name: grafana-dashboard-scheduler namespace: monitoring - apiVersion: v1 data: @@ -32903,7 +34250,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33006,7 +34353,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33120,7 +34467,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33223,7 +34570,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ pod }}", @@ -33354,7 +34701,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33455,7 +34802,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33567,7 +34914,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33668,7 +35015,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33789,7 +35136,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33890,7 +35237,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "sort_desc(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\",namespace=~\"$namespace\"}[$interval:$resolution])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", @@ -33975,6 +35322,32 @@ items: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": ".+", "auto": false, @@ -33985,7 +35358,7 @@ items: "value": "kube-system" }, "datasource": "$datasource", - "definition": "label_values(container_network_receive_packets_total, namespace)", + "definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", "hide": 0, "includeAll": true, "label": null, @@ -33994,8 +35367,8 @@ items: "options": [ ], - "query": "label_values(container_network_receive_packets_total, namespace)", - "refresh": 1, + "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -34017,7 +35390,7 @@ items: "value": "" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\"}, workload)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", "hide": 0, "includeAll": false, "label": null, @@ -34026,8 +35399,8 @@ items: "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\"}, workload)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -34049,7 +35422,7 @@ items: "value": "deployment" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", "hide": 0, "includeAll": false, "label": null, @@ -34058,8 +35431,8 @@ items: "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -34189,6 +35562,11 @@ items: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboard-workload-total namespace: monitoring kind: ConfigMapList diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml index fffec986bb45dcfc685e2cae8ea9a8be170ecc3b..f6e281ebf4553673d38e864c80aa644a9a1801ec 100644 --- a/manifests/grafana-dashboardSources.yaml +++ b/manifests/grafana-dashboardSources.yaml @@ -6,6 +6,7 @@ data: "providers": [ { "folder": "Default", + "folderUid": "", "name": "0", "options": { "path": "/grafana-dashboard-definitions/0" @@ -17,5 +18,10 @@ data: } kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana-dashboards namespace: monitoring diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 589b1ade56cc53f4e2d0a174427919b1418df86d..4b23928096d4fe46b9ca14664ca4d43d402431e2 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -2,24 +2,34 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: replicas: 1 selector: matchLabels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus template: metadata: annotations: - checksum/grafana-datasources: 48faab41f579fc8efde6034391496f6a + checksum/grafana-config: e1f5b84a1d40edb8a6527c98d24ff656 + checksum/grafana-dashboardproviders: 2c7c248e5512bb5576d633004725159c + checksum/grafana-datasources: b2cbbea3079b8634b7bdf42cb56c1537 labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 spec: containers: - env: [] - image: grafana/grafana:7.1.0 + image: grafana/grafana:8.1.3 name: grafana ports: - containerPort: 3000 @@ -45,6 +55,9 @@ spec: - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false + - mountPath: /grafana-dashboard-definitions/0/alertmanager-overview + name: grafana-dashboard-alertmanager-overview + readOnly: false - mountPath: /grafana-dashboard-definitions/0/apiserver name: grafana-dashboard-apiserver readOnly: false @@ -108,14 +121,14 @@ spec: - mountPath: /grafana-dashboard-definitions/0/scheduler name: grafana-dashboard-scheduler readOnly: false - - mountPath: /grafana-dashboard-definitions/0/statefulset - name: grafana-dashboard-statefulset - readOnly: false - mountPath: /grafana-dashboard-definitions/0/workload-total name: grafana-dashboard-workload-total readOnly: false + - mountPath: /etc/grafana + name: grafana-config + readOnly: false nodeSelector: - beta.kubernetes.io/os: linux + kubernetes.io/os: linux securityContext: fsGroup: 65534 runAsNonRoot: true @@ -130,6 +143,9 @@ spec: - configMap: name: grafana-dashboards name: grafana-dashboards + - configMap: + name: grafana-dashboard-alertmanager-overview + name: grafana-dashboard-alertmanager-overview - configMap: name: grafana-dashboard-apiserver name: grafana-dashboard-apiserver @@ -193,9 +209,9 @@ spec: - configMap: name: grafana-dashboard-scheduler name: grafana-dashboard-scheduler - - configMap: - name: grafana-dashboard-statefulset - name: grafana-dashboard-statefulset - configMap: name: grafana-dashboard-workload-total name: grafana-dashboard-workload-total + - name: grafana-config + secret: + secretName: grafana-config diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 5e7e1453ea6698bd65a4faab5b508e5fe3696daf..258a972070c275a9a907c975abf420c1a1c26bef 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -2,7 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: - app: grafana + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: @@ -11,5 +14,6 @@ spec: port: 3000 targetPort: http selector: - app: grafana - type: NodePort + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/grafana-serviceMonitor.yaml b/manifests/grafana-serviceMonitor.yaml index 7ede266a5cfcf38e76818aa5571e009870a01e96..fedfd40cda1a6646c6156671d74e35a2c0fc8244 100644 --- a/manifests/grafana-serviceMonitor.yaml +++ b/manifests/grafana-serviceMonitor.yaml @@ -1,6 +1,11 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.1.3 name: grafana namespace: monitoring spec: @@ -9,4 +14,4 @@ spec: port: http selector: matchLabels: - app: grafana + app.kubernetes.io/name: grafana diff --git a/manifests/kube-prometheus-prometheusRule.yaml b/manifests/kube-prometheus-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f97781540e6c39c152b53ea0359fceb2d8c291a1 --- /dev/null +++ b/manifests/kube-prometheus-prometheusRule.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index c04db2900015e53ff0ad3e4475d6d7754c9de364..9b7d81f123a5a5f01bea9480a3af6128df79cca2 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics rules: - apiGroups: @@ -24,16 +26,6 @@ rules: verbs: - list - watch -- apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - - ingresses - verbs: - - list - - watch - apiGroups: - apps resources: @@ -105,6 +97,14 @@ rules: - networking.k8s.io resources: - networkpolicies + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases verbs: - list - watch diff --git a/manifests/kube-state-metrics-clusterRoleBinding.yaml b/manifests/kube-state-metrics-clusterRoleBinding.yaml index c8f9434da7b873ee1ef2a14601db301b5b0641a7..43243f70bdc654b8319e1f41407cc037a33dbd05 100644 --- a/manifests/kube-state-metrics-clusterRoleBinding.yaml +++ b/manifests/kube-state-metrics-clusterRoleBinding.yaml @@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index b54e64148c4ea9bde9e2863b44dde5f37daf51ba..caf97030142db616280357364d6eeae483efab58 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -2,20 +2,28 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: replicas: 1 selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus template: metadata: + annotations: + kubectl.kubernetes.io/default-container: kube-state-metrics labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 spec: containers: - args: @@ -23,32 +31,59 @@ spec: - --port=8081 - --telemetry-host=127.0.0.1 - --telemetry-port=8082 - image: quay.io/coreos/kube-state-metrics:v1.9.7 + image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.2.0 name: kube-state-metrics + resources: + limits: + cpu: 100m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + runAsUser: 65534 - args: - --logtostderr - --secure-listen-address=:8443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8081/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy-main ports: - containerPort: 8443 name: https-main + resources: + limits: + cpu: 40m + memory: 40Mi + requests: + cpu: 20m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 - args: - --logtostderr - --secure-listen-address=:9443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8082/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy-self ports: - containerPort: 9443 name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 nodeSelector: kubernetes.io/os: linux serviceAccountName: kube-state-metrics diff --git a/manifests/kube-state-metrics-prometheusRule.yaml b/manifests/kube-state-metrics-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f90f6343c8aa8ee30bced1f65fbe8e61733add71 --- /dev/null +++ b/manifests/kube-state-metrics-prometheusRule.yaml @@ -0,0 +1,65 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: monitoring +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: | + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: | + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical diff --git a/manifests/kube-state-metrics-service.yaml b/manifests/kube-state-metrics-service.yaml index d734a19ba110f225e45afe6255c9e664072fc13b..80bf3bdf8316fd54806f6120893ebf1b90a663ee 100644 --- a/manifests/kube-state-metrics-service.yaml +++ b/manifests/kube-state-metrics-service.yaml @@ -2,8 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: @@ -16,4 +18,6 @@ spec: port: 9443 targetPort: https-self selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/kube-state-metrics-serviceAccount.yaml b/manifests/kube-state-metrics-serviceAccount.yaml index c23b36c1f180a60be837862d768219bbaab14549..248654d012caae85fa2dc651765f5f57963a0508 100644 --- a/manifests/kube-state-metrics-serviceAccount.yaml +++ b/manifests/kube-state-metrics-serviceAccount.yaml @@ -2,7 +2,9 @@ apiVersion: v1 kind: ServiceAccount metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: v1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index b860f4c3d34d39b5f455fa8b96bcd682dad8e6df..052e6b2284a7c45ed4763abf596881122925e4bc 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -2,8 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.7 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.2.0 name: kube-state-metrics namespace: monitoring spec: @@ -28,4 +30,6 @@ spec: jobLabel: app.kubernetes.io/name selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82a67f1ad66251fe41b81b4d6c807656edd2457c --- /dev/null +++ b/manifests/kubernetes-prometheusRule.yaml @@ -0,0 +1,1380 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + role: alert-rules + name: kubernetes-monitoring-rules + namespace: monitoring +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) > 0 + and + kube_pod_container_status_waiting{job="kube-state-metrics"} == 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod) ( + max by(namespace, pod) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} + ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( + 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobcompletion + summary: Job did not complete in time + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 12h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched descired number of replicas. + expr: | + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + and + (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{ $value }} bytes and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + and + (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(kube_node_status_allocatable{resource="cpu"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + / + sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - alert: AggregatedAPIErrors + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors + summary: An aggregated API has reported errors. + expr: | + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown + summary: An aggregated API is down. + expr: | + (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(node) ( + kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + ) > 0.95 + for: 15m + labels: + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - name: kube-apiserver-burnrate.rules + rules: + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h + - name: kube-apiserver-histogram.rules + rules: + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules + rules: + - expr: | + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h + - expr: | + sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d + - expr: | + sum by (cluster, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h + - expr: | + sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d + - expr: | + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="40"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - name: k8s.rules + rules: + - expr: | + sum by (cluster, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + - expr: | + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: | + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules + rules: + - expr: | + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) + )) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/manifests/kubernetes-serviceMonitorApiserver.yaml b/manifests/kubernetes-serviceMonitorApiserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cce20bf7db945c9ceb4692faca255e1d2b082c9f --- /dev/null +++ b/manifests/kubernetes-serviceMonitorApiserver.yaml @@ -0,0 +1,74 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: apiserver + name: kube-apiserver + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|server).* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) + sourceLabels: + - __name__ + - le + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/kubernetes-serviceMonitorCoreDNS.yaml similarity index 83% rename from manifests/prometheus-serviceMonitorCoreDNS.yaml rename to manifests/kubernetes-serviceMonitorCoreDNS.yaml index 633aa18cf1d4d352e4f527b726a1127393da6f4d..38b602d6e72105ef8110689eb369fbc25120d462 100644 --- a/manifests/prometheus-serviceMonitorCoreDNS.yaml +++ b/manifests/kubernetes-serviceMonitorCoreDNS.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: coredns + app.kubernetes.io/name: coredns name: coredns namespace: monitoring spec: @@ -10,7 +10,7 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 15s port: metrics - jobLabel: k8s-app + jobLabel: app.kubernetes.io/name namespaceSelector: matchNames: - kube-system diff --git a/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml b/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4aab77016ea21c37094ab020d5fef596c4369610 --- /dev/null +++ b/manifests/kubernetes-serviceMonitorKubeControllerManager.yaml @@ -0,0 +1,59 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kube-controller-manager + name: kube-controller-manager + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kube-controller-manager diff --git a/manifests/prometheus-serviceMonitorKubeScheduler.yaml b/manifests/kubernetes-serviceMonitorKubeScheduler.yaml similarity index 76% rename from manifests/prometheus-serviceMonitorKubeScheduler.yaml rename to manifests/kubernetes-serviceMonitorKubeScheduler.yaml index 8073eaca160709fb041b9f5764eec1abcfc9240a..ca30352e16bdc14ca4b70dc8dd5e203908e8cb38 100644 --- a/manifests/prometheus-serviceMonitorKubeScheduler.yaml +++ b/manifests/kubernetes-serviceMonitorKubeScheduler.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: kube-scheduler + app.kubernetes.io/name: kube-scheduler name: kube-scheduler namespace: monitoring spec: @@ -13,10 +13,10 @@ spec: scheme: https tlsConfig: insecureSkipVerify: true - jobLabel: k8s-app + jobLabel: app.kubernetes.io/name namespaceSelector: matchNames: - kube-system selector: matchLabels: - k8s-app: kube-scheduler + app.kubernetes.io/name: kube-scheduler diff --git a/manifests/kubernetes-serviceMonitorKubelet.yaml b/manifests/kubernetes-serviceMonitorKubelet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c6fc6ff4fdfb108c1c1df202ac80f50733e11ed --- /dev/null +++ b/manifests/kubernetes-serviceMonitorKubelet.yaml @@ -0,0 +1,96 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kubelet + name: kubelet + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + - action: drop + regex: (container_fs_.*|container_spec_.*|container_blkio_device_usage_total|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; + sourceLabels: + - __name__ + - pod + - namespace + path: /metrics/cadvisor + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet diff --git a/manifests/node-exporter-clusterRole.yaml b/manifests/node-exporter-clusterRole.yaml index ad783ae9bfdb3eb2c57e8d7bc9be580e75880bbf..fe5db25b631269817850191b81749715d23bd708 100644 --- a/manifests/node-exporter-clusterRole.yaml +++ b/manifests/node-exporter-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter rules: - apiGroups: diff --git a/manifests/node-exporter-clusterRoleBinding.yaml b/manifests/node-exporter-clusterRoleBinding.yaml index a5a2050810d0976538a2aa08d4565d4a84bc4a07..be1016beeff20b388e4f000357fd8bb8969a2f9d 100644 --- a/manifests/node-exporter-clusterRoleBinding.yaml +++ b/manifests/node-exporter-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 275d275306ff23d347aee2c1a36e301caabf8cb6..69e14810cf1b9711a4dd9288fb32443097b2de59 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -2,30 +2,37 @@ apiVersion: apps/v1 kind: DaemonSet metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus template: metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 spec: containers: - args: - --web.listen-address=127.0.0.1:9100 - - --path.procfs=/host/proc - --path.sysfs=/host/sys - --path.rootfs=/host/root - --no-collector.wifi - --no-collector.hwmon - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - image: quay.io/prometheus/node-exporter:v1.0.1 + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: quay.io/prometheus/node-exporter:v1.2.2 name: node-exporter resources: limits: @@ -35,10 +42,6 @@ spec: cpu: 102m memory: 180Mi volumeMounts: - - mountPath: /host/proc - mountPropagation: HostToContainer - name: proc - readOnly: true - mountPath: /host/sys mountPropagation: HostToContainer name: sys @@ -57,7 +60,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy ports: - containerPort: 9100 @@ -70,6 +73,10 @@ spec: requests: cpu: 10m memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 hostNetwork: true hostPID: true nodeSelector: @@ -81,9 +88,6 @@ spec: tolerations: - operator: Exists volumes: - - hostPath: - path: /proc - name: proc - hostPath: path: /sys name: sys @@ -93,3 +97,4 @@ spec: updateStrategy: rollingUpdate: maxUnavailable: 10% + type: RollingUpdate diff --git a/manifests/node-exporter-prometheusRule.yaml b/manifests/node-exporter-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a39b89035f2599914524f5e305f591a2b4ec405 --- /dev/null +++ b/manifests/node-exporter-prometheusRule.yaml @@ -0,0 +1,304 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status[5m]) == 0 + and + node_timex_maxerror_seconds >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + node_memory_MemAvailable_bytes{job="node-exporter"} + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/manifests/node-exporter-service.yaml b/manifests/node-exporter-service.yaml index cb9666013e982be081f15ce8d1e4e5a7dd88395c..c18c7c75396bf8b22b7724fa2f5a6c4f4c4d5c1f 100644 --- a/manifests/node-exporter-service.yaml +++ b/manifests/node-exporter-service.yaml @@ -2,8 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: @@ -13,4 +15,6 @@ spec: port: 9100 targetPort: https selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/node-exporter-serviceAccount.yaml b/manifests/node-exporter-serviceAccount.yaml index 8a03ac1641f68cbca54468aae86c79a20156058a..343fe6daa3ad5053d002b2cfd9f9f81453f737c2 100644 --- a/manifests/node-exporter-serviceAccount.yaml +++ b/manifests/node-exporter-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring diff --git a/manifests/node-exporter-serviceMonitor.yaml b/manifests/node-exporter-serviceMonitor.yaml index 8e5a97c9ba0786445bad9fd577e591cd12472b9c..6d2edd3a5cb6cf325078b8f36f0ef2aa2427799f 100644 --- a/manifests/node-exporter-serviceMonitor.yaml +++ b/manifests/node-exporter-serviceMonitor.yaml @@ -2,8 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v1.0.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 1.2.2 name: node-exporter namespace: monitoring spec: @@ -24,4 +26,6 @@ spec: jobLabel: app.kubernetes.io/name selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-apiService.yaml b/manifests/prometheus-adapter-apiService.yaml index a215efe4554c583ec42ec99df08c4b581972c23b..ea4476dca87d6132c160e607eda54232e170b49b 100644 --- a/manifests/prometheus-adapter-apiService.yaml +++ b/manifests/prometheus-adapter-apiService.yaml @@ -1,6 +1,11 @@ apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: v1beta1.metrics.k8s.io spec: group: metrics.k8s.io diff --git a/manifests/prometheus-adapter-clusterRole.yaml b/manifests/prometheus-adapter-clusterRole.yaml index a02d2bb050365e23d178dca0bd580ed6f6c94584..091d981834644e640f9491971f27c8ccc3c905cd 100644 --- a/manifests/prometheus-adapter-clusterRole.yaml +++ b/manifests/prometheus-adapter-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter rules: - apiGroups: diff --git a/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml b/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml index 9f0dbb3495d748de8be167007643c9e8696d4fed..4dc8e7a32c91f512232bf7acb2f0855e11198372 100644 --- a/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml +++ b/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml @@ -2,6 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 rbac.authorization.k8s.io/aggregate-to-admin: "true" rbac.authorization.k8s.io/aggregate-to-edit: "true" rbac.authorization.k8s.io/aggregate-to-view: "true" diff --git a/manifests/prometheus-adapter-clusterRoleBinding.yaml b/manifests/prometheus-adapter-clusterRoleBinding.yaml index 7e8f3da92b6b7b48ee08481d0a77b850c633c514..dc8bfbb19a45d0504749e814aeb11a4b6fbdad1c 100644 --- a/manifests/prometheus-adapter-clusterRoleBinding.yaml +++ b/manifests/prometheus-adapter-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml index 4295b50f008f4a7ce5841445ab71f150dc8c4334..86e7517d0ae9ea7cdb78ad2e5789598b40eef624 100644 --- a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml +++ b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics:system:auth-delegator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-adapter-clusterRoleServerResources.yaml b/manifests/prometheus-adapter-clusterRoleServerResources.yaml index fcb914c364dbdef9bf57defdccfe2f0ead709eca..655efb9bf7c1e7bf1afc3a0dda571f9cfa0fd1f3 100644 --- a/manifests/prometheus-adapter-clusterRoleServerResources.yaml +++ b/manifests/prometheus-adapter-clusterRoleServerResources.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics-server-resources rules: - apiGroups: diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml index b2bde3cd1fe3d702149c586831c4ce53d1e8a194..b5329689845d6556e82798e719c7ef1c500e5878 100644 --- a/manifests/prometheus-adapter-configMap.yaml +++ b/manifests/prometheus-adapter-configMap.yaml @@ -4,8 +4,26 @@ data: "resourceRules": "cpu": "containerLabel": "container" - "containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)" - "nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)" + "containerQuery": | + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s] + ) + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[60s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m] + ) + ) "resources": "overrides": "namespace": @@ -16,8 +34,21 @@ data: "resource": "pod" "memory": "containerLabel": "container" - "containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)" - "nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)" + "containerQuery": | + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) "resources": "overrides": "instance": @@ -29,5 +60,10 @@ data: "window": "5m" kind: ConfigMap metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: adapter-config namespace: monitoring diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index bb85e2519f34db7a48908875f72ba1cf4e65f7f7..e6c97afc648549b505f5b00763fc4c1877a47a4f 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -1,21 +1,31 @@ apiVersion: apps/v1 kind: Deployment metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: - replicas: 1 + replicas: 2 selector: matchLabels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus strategy: rollingUpdate: maxSurge: 1 - maxUnavailable: 0 + maxUnavailable: 1 template: metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 spec: containers: - args: @@ -25,10 +35,18 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090/ - --secure-port=6443 - image: directxman12/k8s-prometheus-adapter:v0.8.2 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.0 name: prometheus-adapter ports: - containerPort: 6443 + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi volumeMounts: - mountPath: /tmp name: tmpfs diff --git a/manifests/prometheus-adapter-podDisruptionBudget.yaml b/manifests/prometheus-adapter-podDisruptionBudget.yaml new file mode 100644 index 0000000000000000000000000000000000000000..639eefd4deaba300e91a78d49f991bd2eaf1e628 --- /dev/null +++ b/manifests/prometheus-adapter-podDisruptionBudget.yaml @@ -0,0 +1,17 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 + name: prometheus-adapter + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-roleBindingAuthReader.yaml b/manifests/prometheus-adapter-roleBindingAuthReader.yaml index 48c8f3253d484a15d96df4fc1265e03e6c273390..3bdf4ad947cb5ba9ebfbcb12922b64f15918ca08 100644 --- a/manifests/prometheus-adapter-roleBindingAuthReader.yaml +++ b/manifests/prometheus-adapter-roleBindingAuthReader.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: resource-metrics-auth-reader namespace: kube-system roleRef: diff --git a/manifests/prometheus-adapter-service.yaml b/manifests/prometheus-adapter-service.yaml index e786e01c4aff3ce0e9ca3c3266b1dda817ca8935..be8c44b6662a60a0917c036f2b6a2cd0d5ac0390 100644 --- a/manifests/prometheus-adapter-service.yaml +++ b/manifests/prometheus-adapter-service.yaml @@ -2,7 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: @@ -11,4 +14,6 @@ spec: port: 443 targetPort: 6443 selector: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-adapter-serviceAccount.yaml b/manifests/prometheus-adapter-serviceAccount.yaml index d7e7050391eecfc48a05e98fcd95d5b50f88a18e..2ddbe4604f786bec2296287daafbc49dc2602621 100644 --- a/manifests/prometheus-adapter-serviceAccount.yaml +++ b/manifests/prometheus-adapter-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring diff --git a/manifests/prometheus-adapter-serviceMonitor.yaml b/manifests/prometheus-adapter-serviceMonitor.yaml index 91a8d51a3a8014e741db041b9aa1b1902a05f201..a75158526ca84b834d2155b9442f85018ba23fcb 100644 --- a/manifests/prometheus-adapter-serviceMonitor.yaml +++ b/manifests/prometheus-adapter-serviceMonitor.yaml @@ -2,7 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.9.0 name: prometheus-adapter namespace: monitoring spec: @@ -15,4 +18,6 @@ spec: insecureSkipVerify: true selector: matchLabels: - name: prometheus-adapter + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/prometheus-clusterRole.yaml b/manifests/prometheus-clusterRole.yaml index d5c4598304eaf7fe0c183e34969ca734f793aa75..8fac941b811b7be5c60e82b5e15cb3f414c26b1a 100644 --- a/manifests/prometheus-clusterRole.yaml +++ b/manifests/prometheus-clusterRole.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s rules: - apiGroups: diff --git a/manifests/prometheus-clusterRoleBinding.yaml b/manifests/prometheus-clusterRoleBinding.yaml index 554bb6f8767c0a8a346614c685649321a176b640..cafde39036266841e50c226e63a62fa63f615760 100644 --- a/manifests/prometheus-clusterRoleBinding.yaml +++ b/manifests/prometheus-clusterRoleBinding.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/prometheus-operator-prometheusRule.yaml b/manifests/prometheus-operator-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4921fe6848b7759bfaa9ff13e2148cefe6e7c099 --- /dev/null +++ b/manifests/prometheus-operator-prometheusRule.yaml @@ -0,0 +1,86 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 + prometheus: k8s + role: alert-rules + name: prometheus-operator-rules + namespace: monitoring +spec: + groups: + - name: prometheus-operator + rules: + - alert: PrometheusOperatorListErrors + annotations: + description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors + summary: Errors while performing list operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors + summary: Errors while performing watch operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed + summary: Last controller reconciliation failed + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors + summary: Errors while reconciling controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors + summary: Errors while reconciling Prometheus. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready + summary: Prometheus operator not ready + expr: | + min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) + for: 5m + labels: + severity: warning + - alert: PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources + summary: Resources rejected by Prometheus operator + expr: | + min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + for: 5m + labels: + severity: warning diff --git a/manifests/prometheus-operator-serviceMonitor.yaml b/manifests/prometheus-operator-serviceMonitor.yaml index 9bae609a017adb975ceaf8c100f22a73caad7915..dddeb0ac753946378721babdfe0e642ca08e4fb9 100644 --- a/manifests/prometheus-operator-serviceMonitor.yaml +++ b/manifests/prometheus-operator-serviceMonitor.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -19,4 +20,5 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 diff --git a/manifests/prometheus-podDisruptionBudget.yaml b/manifests/prometheus-podDisruptionBudget.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7adef54bae3dd920770067b7bc7626a2f5961a0 --- /dev/null +++ b/manifests/prometheus-podDisruptionBudget.yaml @@ -0,0 +1,18 @@ +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + name: prometheus-k8s + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index e50cc9bfefa6e10c1904b263335865c4f9b67196..8731891f39877e6baaf056ab99c8f1425da6acd2 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -2,18 +2,31 @@ apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 prometheus: k8s name: k8s namespace: monitoring spec: alerting: alertmanagers: - - name: alertmanager-main + - apiVersion: v2 + name: alertmanager-main namespace: monitoring port: web - image: quay.io/prometheus/prometheus:v2.22.1 + enableFeatures: [] + externalLabels: {} + image: quay.io/prometheus/prometheus:v2.29.2 nodeSelector: kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 podMonitorNamespaceSelector: {} podMonitorSelector: {} probeNamespaceSelector: {} @@ -22,10 +35,8 @@ spec: resources: requests: memory: 400Mi - ruleSelector: - matchLabels: - prometheus: k8s - role: alert-rules + ruleNamespaceSelector: {} + ruleSelector: {} securityContext: fsGroup: 2000 runAsNonRoot: true @@ -33,4 +44,4 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - version: v2.22.1 + version: 2.29.2 diff --git a/manifests/prometheus-prometheusRule.yaml b/manifests/prometheus-prometheusRule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c15e8c139ad3bdb8ed8054f0acde5270236181d0 --- /dev/null +++ b/manifests/prometheus-prometheusRule.yaml @@ -0,0 +1,249 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + prometheus: k8s + role: alert-rules + name: prometheus-k8s-prometheus-rules + namespace: monitoring +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: | + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. + expr: | + increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: | + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. + expr: | + increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 + for: 5m + labels: + severity: critical + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/manifests/prometheus-roleBindingConfig.yaml b/manifests/prometheus-roleBindingConfig.yaml index ec0129db5bf7f474b8aa124b63be72d50e97d667..ca17d7eb791f99100e91c4946aa1d9c743933c85 100644 --- a/manifests/prometheus-roleBindingConfig.yaml +++ b/manifests/prometheus-roleBindingConfig.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s-config namespace: monitoring roleRef: diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml index c7527f6aa1330f894adb83136444c8160ea92e90..33309a20210c7ac06b1a1ff1316c090ec09cc621 100644 --- a/manifests/prometheus-roleBindingSpecificNamespaces.yaml +++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -3,6 +3,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: default roleRef: @@ -16,6 +21,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: kube-system roleRef: @@ -29,6 +39,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring roleRef: diff --git a/manifests/prometheus-roleConfig.yaml b/manifests/prometheus-roleConfig.yaml index 5f1cd043919a18141c2331c05836d5ef45d099eb..0f7129cb4540e80128ad830bcbd1395279f2a6de 100644 --- a/manifests/prometheus-roleConfig.yaml +++ b/manifests/prometheus-roleConfig.yaml @@ -1,6 +1,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s-config namespace: monitoring rules: diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml index 689baa932f888172044e553da52c2398f1282efa..ffe15628b8cb42f2d5065ee5d728fe02689a0349 100644 --- a/manifests/prometheus-roleSpecificNamespaces.yaml +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -3,6 +3,11 @@ items: - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: default rules: @@ -24,9 +29,22 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: kube-system rules: @@ -48,9 +66,22 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring rules: @@ -72,4 +103,12 @@ items: - get - list - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch kind: RoleList diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml deleted file mode 100644 index a46fe442fb21627b1c7358502f39e77e6096c351..0000000000000000000000000000000000000000 --- a/manifests/prometheus-rules.yaml +++ /dev/null @@ -1,2011 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: prometheus-k8s-rules - namespace: monitoring -spec: - groups: - - name: node-exporter.rules - rules: - - expr: | - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job="node-exporter"} - ) - ) - record: instance:node_num_cpu:sum - - expr: | - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) - ) - record: instance:node_cpu_utilisation:rate1m - - expr: | - ( - node_load1{job="node-exporter"} - / - instance:node_num_cpu:sum{job="node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: | - 1 - ( - node_memory_MemAvailable_bytes{job="node-exporter"} - / - node_memory_MemTotal_bytes{job="node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: | - rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m - - name: kube-apiserver.rules - rules: - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) - labels: - verb: read - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) - labels: - verb: read - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) - labels: - verb: read - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) - labels: - verb: read - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) - labels: - verb: read - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - labels: - verb: write - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - labels: - verb: write - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) - / - sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) - record: cluster:apiserver_request_duration_seconds:mean5m - - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: | - 1 - ( - ( - # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) + - ( - # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - - - ( - ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - ) + - # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d) - labels: - verb: all - record: apiserver_request:availability30d - - expr: | - 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - - - ( - # too slow - ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: | - 1 - ( - ( - # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="write"}) - labels: - verb: write - record: apiserver_request:availability30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) - labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - labels: - verb: write - record: code:apiserver_request_total:increase30d - - name: k8s.rules - rules: - - expr: | - sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace) - record: namespace:container_cpu_usage_seconds_total:sum_rate - - expr: | - sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - - expr: | - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - - expr: | - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - - expr: | - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - - expr: | - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - - expr: | - sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace) - record: namespace:container_memory_usage_bytes:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: statefulset - record: namespace_workload_pod:kube_pod_owner:relabel - - name: kube-scheduler.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - name: node.rules - rules: - - expr: | - sum(min(kube_pod_info{node!=""}) by (cluster, node)) - record: ':kube_pod_info_node_count:' - - expr: | - topk by(namespace, pod) (1, - max by (node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: | - count by (cluster, node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: | - sum( - node_memory_MemAvailable_bytes{job="node-exporter"} or - ( - node_memory_Buffers_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Slab_bytes{job="node-exporter"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum - - name: kubelet.rules - rules: - - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.99" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.9" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) - labels: - quantile: "0.5" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors - summary: kube-state-metrics is experiencing errors in list operations. - expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors - summary: kube-state-metrics is experiencing errors in watch operations. - expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: | - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: | - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: | - node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure - summary: Failed device in RAID array - expr: | - node_md_disks{state="fail"} > 0 - labels: - severity: warning - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors - summary: Errors while performing list operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors - summary: Errors while performing watch operations in controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorSyncFailed - annotations: - description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed - summary: Last controller reconciliation failed - expr: | - min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors - summary: Errors while reconciling controller. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors - summary: Errors while reconciling Prometheus. - expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNotReady - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready - summary: Prometheus operator not ready - expr: | - min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) - for: 5m - labels: - severity: warning - - alert: PrometheusOperatorRejectedResources - annotations: - description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources - summary: Resources rejected by Prometheus operator - expr: | - min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 - for: 5m - labels: - severity: warning - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping - summary: Pod is crash looping. - expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. - expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: | - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: | - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobCompletion - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion - summary: Job did not complete in time - expr: | - kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 - for: 12h - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed - summary: Job failed to complete. - expr: | - kube_job_failed{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. - expr: | - (kube_hpa_status_desired_replicas{job="kube-state-metrics"} - != - kube_hpa_status_current_replicas{job="kube-state-metrics"}) - and - changes(kube_hpa_status_current_replicas[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout - summary: HPA is running at max replicas - expr: | - kube_hpa_status_current_replicas{job="kube-state-metrics"} - == - kube_hpa_spec_max_replicas{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) - / - sum(kube_node_status_allocatable_cpu_cores) - > - (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) - for: 5m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) - / - sum(kube_node_status_allocatable_memory_bytes) - > - (count(kube_node_status_allocatable_memory_bytes)-1) - / - count(kube_node_status_allocatable_memory_bytes) - for: 5m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable_cpu_cores) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused - summary: Namespace quota is fully used. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - < 0.03 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - ( - kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} - ) < 0.15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: | - count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m - labels: - severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors - summary: An aggregated API has reported errors. - expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 - labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable - summary: Node is unreachable. - expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods - summary: Kubelet is running at capacity. - expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) - ) - / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - ) > 0.95 - for: 15m - labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m - labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. - expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-scheduler - rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-controller-manager - rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. - summary: Failed Prometheus configuration reload. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - summary: Prometheus alert notification queue predicted to run full in less than 30m. - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. - expr: | - ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. - summary: Prometheus is not connected to any Alertmanagers. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. - summary: Prometheus has issues reloading blocks from disk. - expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. - summary: Prometheus has issues compacting blocks. - expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. - summary: Prometheus is not ingesting samples. - expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 - for: 10m - labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. - summary: Prometheus is dropping samples with duplicate timestamps. - expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. - summary: Prometheus drops samples with out-of-order timestamps. - expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - summary: Prometheus fails to send samples to remote storage. - expr: | - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - / - ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - summary: Prometheus remote write is behind. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - - on(job, instance) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. - summary: Prometheus is failing rule evaluations. - expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. - summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. - expr: | - increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 - for: 15m - labels: - severity: warning - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - annotations: - message: | - The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. - {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} - Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" - {{ end }} - expr: | - count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})) != 1 - for: 5m - labels: - severity: critical - - alert: AlertmanagerFailedReload - annotations: - message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - expr: | - alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0 - for: 10m - labels: - severity: warning - - alert: AlertmanagerMembersInconsistent - annotations: - message: Alertmanager has not found all other members of the cluster. - expr: | - alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}) - for: 5m - labels: - severity: critical - - name: general.rules - rules: - - alert: TargetDown - annotations: - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - message: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. - expr: vector(1) - labels: - severity: none - - name: node-network - rules: - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{ $labels.device }}" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" - expr: | - changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml index 4f61e88ab777a70b7ff106cceb10a42556949e2b..0b14d9bb514e2edf2a49887bae68afca6f087508 100644 --- a/manifests/prometheus-service.yaml +++ b/manifests/prometheus-service.yaml @@ -2,6 +2,10 @@ apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 prometheus: k8s name: prometheus-k8s namespace: monitoring @@ -12,5 +16,8 @@ spec: targetPort: web selector: app: prometheus + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus prometheus: k8s sessionAffinity: ClientIP diff --git a/manifests/prometheus-serviceAccount.yaml b/manifests/prometheus-serviceAccount.yaml index 3e55fad675b8691f3aab29f152fdaa5416605745..371b8ec9fb81f3016a0dd40e89568e53afb3b3a7 100644 --- a/manifests/prometheus-serviceAccount.yaml +++ b/manifests/prometheus-serviceAccount.yaml @@ -1,5 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 name: prometheus-k8s namespace: monitoring diff --git a/manifests/prometheus-serviceMonitor.yaml b/manifests/prometheus-serviceMonitor.yaml index b7605dbeb6e4a4253db105a0a6faee1882880767..b5282ebf17cf1742dc800e47dc99c5dedaf649f1 100644 --- a/manifests/prometheus-serviceMonitor.yaml +++ b/manifests/prometheus-serviceMonitor.yaml @@ -2,8 +2,11 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: prometheus - name: prometheus + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.29.2 + name: prometheus-k8s namespace: monitoring spec: endpoints: @@ -11,4 +14,7 @@ spec: port: web selector: matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus prometheus: k8s diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml deleted file mode 100644 index 1ff61fe9f56a0fb27bec7be175f99b2f9024b0a3..0000000000000000000000000000000000000000 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: apiserver - name: kube-apiserver - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|server).* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_controller_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_step_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) - sourceLabels: - - __name__ - - le - port: https - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes - jobLabel: component - namespaceSelector: - matchNames: - - default - selector: - matchLabels: - component: apiserver - provider: kubernetes diff --git a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml deleted file mode 100644 index 0f23d84df850d717204ac67428fac0a55db0b3bf..0000000000000000000000000000000000000000 --- a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml +++ /dev/null @@ -1,59 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kube-controller-manager - name: kube-controller-manager - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|request|server).* - sourceLabels: - - __name__ - port: https-metrics - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kube-controller-manager diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml deleted file mode 100644 index 7db47ef0ad0d7a3c7f8074ee32b53160ebe708d1..0000000000000000000000000000000000000000 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ /dev/null @@ -1,90 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kubelet - name: kubelet - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - honorTimestamps: false - interval: 30s - metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ - path: /metrics/cadvisor - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - path: /metrics/probes - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kubelet diff --git a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml index 9c923d54f4b8afc80d152b169f4cd1eff7c6adb6..7f87995914faef4b89f206e70204ed589630ba99 100644 --- a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: alertmanagerconfigs.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: AlertmanagerConfig listKind: AlertmanagerConfigList plural: alertmanagerconfigs @@ -17,7 +19,7 @@ spec: - name: v1alpha1 schema: openAPIV3Schema: - description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to be aggregated across multiple namespaces configuring one Alertmanager. + description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to be aggregated across multiple namespaces configuring one Alertmanager cluster. properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' @@ -28,54 +30,251 @@ spec: metadata: type: object spec: + description: AlertmanagerConfigSpec is a specification of the desired behavior of the Alertmanager configuration. By definition, the Alertmanager configuration only applies to alerts for which the `namespace` label is equal to the namespace of the AlertmanagerConfig resource. properties: inhibitRules: + description: List of inhibition rules. The rules will only apply to alerts matching the resource’s namespace. items: + description: InhibitRule defines an inhibition rule that allows to mute alerts when other alerts are already firing. See https://prometheus.io/docs/alerting/latest/configuration/#inhibit_rule properties: equal: + description: Labels that must have an equal value in the source and target alert for the inhibition to take effect. items: type: string type: array sourceMatch: + description: Matchers for which one or more alerts have to exist for the inhibition to take effect. The operator enforces that the alert matches the resource’s namespace. items: + description: Matcher defines how to match on alert's labels. properties: name: + description: Label to match. + minLength: 1 type: string regex: + description: Whether to match on equality (false) or regular-expression (true). type: boolean value: + description: Label value to match. type: string required: - name - - value type: object type: array targetMatch: + description: Matchers that have to be fulfilled in the alerts to be muted. The operator enforces that the alert matches the resource’s namespace. items: + description: Matcher defines how to match on alert's labels. properties: name: + description: Label to match. + minLength: 1 type: string regex: + description: Whether to match on equality (false) or regular-expression (true). type: boolean value: + description: Label value to match. type: string required: - name - - value type: object type: array type: object type: array receivers: + description: List of receivers. items: + description: Receiver defines one or more notification integrations. properties: + emailConfigs: + description: List of Email configurations. + items: + description: EmailConfig configures notifications via Email. + properties: + authIdentity: + description: The identity to use for authentication. + type: string + authPassword: + description: The secret's key that contains the password to use for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + authSecret: + description: The secret's key that contains the CRAM-MD5 secret. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + authUsername: + description: The username to use for authentication. + type: string + from: + description: The sender address. + type: string + headers: + description: Further headers email header key/value pairs. Overrides any headers previously set by the notification implementation. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + hello: + description: The hostname to identify to the SMTP server. + type: string + html: + description: The HTML body of the email notification. + type: string + requireTLS: + description: The SMTP TLS requirement. Note that Go does not support unencrypted connections to remote SMTP endpoints. + type: boolean + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + smarthost: + description: The SMTP host through which emails are sent. + type: string + text: + description: The text body of the email notification. + type: string + tlsConfig: + description: TLS configuration + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + to: + description: The email address to send notifications to. + type: string + type: object + type: array name: + description: Name of the receiver. Must be unique across all items from the list. + minLength: 1 type: string opsgenieConfigs: + description: List of OpsGenie configurations. items: + description: OpsGenieConfig configures notifications via OpsGenie. See https://prometheus.io/docs/alerting/latest/configuration/#opsgenie_config properties: apiKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the OpsGenie API key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -90,15 +289,22 @@ spec: - key type: object apiURL: + description: The URL to send OpsGenie API requests to. type: string description: + description: Description of the incident. type: string details: + description: A set of arbitrary key/value pairs that provide further detail about the incident. items: + description: KeyValue defines a (key, value) tuple. properties: key: + description: Key of the tuple. + minLength: 1 type: string value: + description: Value of the tuple. type: string required: - key @@ -106,9 +312,32 @@ spec: type: object type: array httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -142,7 +371,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -157,9 +386,10 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -253,51 +483,78 @@ spec: type: object type: object message: + description: Alert text limited to 130 characters. type: string note: + description: Additional alert note. type: string priority: + description: Priority level of alert. Possible values are P1, P2, P3, P4, and P5. type: string responders: + description: List of responders responsible for notifications. items: + description: OpsGenieConfigResponder defines a responder to an incident. One of `id`, `name` or `username` has to be defined. properties: id: + description: ID of the responder. type: string name: + description: Name of the responder. type: string type: + description: Type of responder. + minLength: 1 type: string username: + description: Username of the responder. type: string + required: + - type type: object type: array sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean source: + description: Backlink to the sender of the notification. type: string tags: + description: Comma separated list of tags attached to the notifications. type: string type: object type: array - pagerDutyConfigs: + pagerdutyConfigs: + description: List of PagerDuty configurations. items: + description: PagerDutyConfig configures notifications via PagerDuty. See https://prometheus.io/docs/alerting/latest/configuration/#pagerduty_config properties: class: + description: The class/type of the event. type: string client: + description: Client identification. type: string clientURL: + description: Backlink to the sender of notification. type: string component: + description: The part or component of the affected system that is broken. type: string description: + description: Description of the incident. type: string details: + description: Arbitrary key/value pairs that provide further detail about the incident. items: + description: KeyValue defines a (key, value) tuple. properties: key: + description: Key of the tuple. + minLength: 1 type: string value: + description: Value of the tuple. type: string required: - key @@ -305,11 +562,35 @@ spec: type: object type: array group: + description: A cluster or grouping of sources. type: string httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -343,7 +624,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -358,9 +639,10 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -454,7 +736,7 @@ spec: type: object type: object routingKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the PagerDuty integration key (when using Events API v2). Either this field or `serviceKey` needs to be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -469,9 +751,10 @@ spec: - key type: object sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean serviceKey: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the PagerDuty service key (when using integration type "Prometheus"). Either this field or `routingKey` needs to be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -486,18 +769,51 @@ spec: - key type: object severity: + description: Severity of the incident. type: string url: + description: The URL to send requests to. type: string type: object type: array - webhookConfigs: + pushoverConfigs: + description: List of Pushover configurations. items: + description: PushoverConfig configures notifications via Pushover. See https://prometheus.io/docs/alerting/latest/configuration/#pushover_config properties: + expire: + description: How long your notification will continue to be retried for, unless the user acknowledges the notification. + type: string + html: + description: Whether notification message is HTML or plain text. + type: boolean httpConfig: + description: HTTP client configuration. properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: - description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. properties: password: description: The secret in the service monitor namespace that contains the password for authentication. @@ -531,7 +847,7 @@ spec: type: object type: object bearerTokenSecret: - description: SecretKeySelector selects a key of a Secret. + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -546,9 +862,10 @@ spec: - key type: object proxyURL: + description: Optional proxy URL. type: string tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration parameters. + description: TLS configuration for the client. properties: ca: description: Struct containing the CA cert to use for the targets. @@ -641,15 +958,47 @@ spec: type: string type: object type: object - maxAlerts: - format: int32 - type: integer + message: + description: Notification message. + type: string + priority: + description: Priority, see https://pushover.net/api#priority + type: string + retry: + description: How often the Pushover servers will send the same notification to the user. Must be at least 30 seconds. + type: string sendResolved: + description: Whether or not to notify about resolved alerts. type: boolean + sound: + description: The name of one of the sounds supported by device clients to override the user's default sound choice + type: string + title: + description: Notification title. + type: string + token: + description: The secret's key that contains the registered application’s API token, see https://pushover.net/apps. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object url: + description: A supplementary URL shown alongside the message. type: string - urlSecret: - description: SecretKeySelector selects a key of a Secret. + urlTitle: + description: A title for supplementary URL, otherwise just the URL is shown + type: string + userKey: + description: The secret's key that contains the recipient user’s user key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -665,43 +1014,996 @@ spec: type: object type: object type: array - required: - - name - type: object - type: array - route: - properties: - continue: - type: boolean - groupBy: - items: - type: string - type: array - groupInterval: - type: string - groupWait: - type: string - matchers: - items: - properties: - name: - type: string - regex: - type: boolean - value: - type: string - required: - - name - - value - type: object - type: array - receiver: - type: string - repeatInterval: - type: string - routes: - items: - type: object + slackConfigs: + description: List of Slack configurations. + items: + description: SlackConfig configures notifications via Slack. See https://prometheus.io/docs/alerting/latest/configuration/#slack_config + properties: + actions: + description: A list of Slack actions that are sent with each notification. + items: + description: SlackAction configures a single Slack action that is sent with each notification. See https://api.slack.com/docs/message-attachments#action_fields and https://api.slack.com/docs/message-buttons for more information. + properties: + confirm: + description: SlackConfirmationField protect users from destructive actions or particularly distinguished decisions by asking them to confirm their button click one more time. See https://api.slack.com/docs/interactive-message-field-guide#confirmation_fields for more information. + properties: + dismissText: + type: string + okText: + type: string + text: + minLength: 1 + type: string + title: + type: string + required: + - text + type: object + name: + type: string + style: + type: string + text: + minLength: 1 + type: string + type: + minLength: 1 + type: string + url: + type: string + value: + type: string + required: + - text + - type + type: object + type: array + apiURL: + description: The secret's key that contains the Slack webhook URL. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + callbackId: + type: string + channel: + description: The channel or user to send notifications to. + type: string + color: + type: string + fallback: + type: string + fields: + description: A list of Slack fields that are sent with each notification. + items: + description: SlackField configures a single Slack field that is sent with each notification. Each field must contain a title, value, and optionally, a boolean value to indicate if the field is short enough to be displayed next to other fields designated as short. See https://api.slack.com/docs/message-attachments#fields for more information. + properties: + short: + type: boolean + title: + minLength: 1 + type: string + value: + minLength: 1 + type: string + required: + - title + - value + type: object + type: array + footer: + type: string + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + iconEmoji: + type: string + iconURL: + type: string + imageURL: + type: string + linkNames: + type: boolean + mrkdwnIn: + items: + type: string + type: array + pretext: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + shortFields: + type: boolean + text: + type: string + thumbURL: + type: string + title: + type: string + titleLink: + type: string + username: + type: string + type: object + type: array + victoropsConfigs: + description: List of VictorOps configurations. + items: + description: VictorOpsConfig configures notifications via VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config + properties: + apiKey: + description: The secret's key that contains the API key to use when talking to the VictorOps API. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + apiUrl: + description: The VictorOps API URL. + type: string + customFields: + description: Additional custom fields for notification. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + entityDisplayName: + description: Contains summary of the alerted problem. + type: string + httpConfig: + description: The HTTP client's configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + messageType: + description: Describes the behavior of the alert (CRITICAL, WARNING, INFO). + type: string + monitoringTool: + description: The monitoring tool the state message is from. + type: string + routingKey: + description: A key used to map the alert to a team. + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + stateMessage: + description: Contains long explanation of the alerted problem. + type: string + type: object + type: array + webhookConfigs: + description: List of webhook configurations. + items: + description: WebhookConfig configures notifications via a generic receiver supporting the webhook payload. See https://prometheus.io/docs/alerting/latest/configuration/#webhook_config + properties: + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + maxAlerts: + description: Maximum number of alerts to be sent per webhook message. When 0, all alerts are included. + format: int32 + minimum: 0 + type: integer + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + url: + description: The URL to send HTTP POST requests to. `urlSecret` takes precedence over `url`. One of `urlSecret` and `url` should be defined. + type: string + urlSecret: + description: The secret's key that contains the webhook URL to send HTTP requests to. `urlSecret` takes precedence over `url`. One of `urlSecret` and `url` should be defined. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + type: array + wechatConfigs: + description: List of WeChat configurations. + items: + description: WeChatConfig configures notifications via WeChat. See https://prometheus.io/docs/alerting/latest/configuration/#wechat_config + properties: + agentID: + type: string + apiSecret: + description: The secret's key that contains the WeChat API key. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + apiURL: + description: The WeChat API URL. + type: string + corpID: + description: The corp id for authentication. + type: string + httpConfig: + description: HTTP client configuration. + properties: + authorization: + description: Authorization header configuration for the client. This is mutually exclusive with BasicAuth and is only available starting from Alertmanager v0.22+. + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: BasicAuth for the client. This is mutually exclusive with Authorization. If both are defined, BasicAuth takes precedence. + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer token to be used by the client for authentication. The secret needs to be in the same namespace as the AlertmanagerConfig object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + message: + description: API request data as defined by the WeChat API. + type: string + messageType: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + toParty: + type: string + toTag: + type: string + toUser: + type: string + type: object + type: array + required: + - name + type: object + type: array + route: + description: The Alertmanager route definition for alerts matching the resource’s namespace. If present, it will be added to the generated Alertmanager configuration as a first-level route. + properties: + continue: + description: Boolean indicating whether an alert should continue matching subsequent sibling nodes. It will always be overridden to true for the first-level route by the Prometheus operator. + type: boolean + groupBy: + description: List of labels to group by. + items: + type: string + type: array + groupInterval: + description: How long to wait before sending an updated notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). + type: string + groupWait: + description: How long to wait before sending the initial notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). + type: string + matchers: + description: 'List of matchers that the alert’s labels should match. For the first level route, the operator removes any existing equality and regexp matcher on the `namespace` label and adds a `namespace: <object namespace>` matcher.' + items: + description: Matcher defines how to match on alert's labels. + properties: + name: + description: Label to match. + minLength: 1 + type: string + regex: + description: Whether to match on equality (false) or regular-expression (true). + type: boolean + value: + description: Label value to match. + type: string + required: + - name + type: object + type: array + receiver: + description: Name of the receiver for this route. If not empty, it should be listed in the `receivers` field. + type: string + repeatInterval: + description: How long to wait before repeating the last notification. Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds seconds minutes hours). + type: string + routes: + description: Child routes. + items: + x-kubernetes-preserve-unknown-fields: true type: array type: object type: object diff --git a/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index e567a3506895c633b5c47c91c1e6b36c89d3ea4b..4bbdb953491d9849dd8ccbc2f3dd31c23042d728 100644 --- a/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: alertmanagers.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Alertmanager listKind: AlertmanagerList plural: alertmanagers @@ -454,6 +456,15 @@ spec: clusterAdvertiseAddress: description: 'ClusterAdvertiseAddress is the explicit address to advertise in cluster. Needs to be provided for non RFC1918 [1] (public) addresses. [1] RFC1918: https://tools.ietf.org/html/rfc1918' type: string + clusterGossipInterval: + description: Interval between gossip attempts. + type: string + clusterPeerTimeout: + description: Timeout for cluster peering. + type: string + clusterPushpullInterval: + description: Interval between pushpull attempts. + type: string configMaps: description: ConfigMaps is a list of ConfigMaps in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager Pods. The ConfigMaps are mounted into /etc/alertmanager/configmaps/<configmap-name>. items: @@ -525,8 +536,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -831,12 +846,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -927,12 +947,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1241,8 +1269,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1547,12 +1579,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1643,12 +1680,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1884,13 +1929,17 @@ spec: logLevel: description: Log level for Alertmanager to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string description: Define which Nodes the Pods are scheduled on. type: object paused: - description: If set to true all actions on the underlaying managed objects are not goint to be performed, except for delete actions. + description: If set to true all actions on the underlying managed objects are not goint to be performed, except for delete actions. type: boolean podMetadata: description: PodMetadata configures Labels and Annotations which are propagated to the alertmanager pods. @@ -1924,12 +1973,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2036,8 +2093,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -2094,12 +2155,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2153,7 +2222,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -2515,8 +2588,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2535,8 +2612,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -2843,8 +2924,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3099,7 +3184,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed. + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. type: boolean replicas: description: Total number of non-terminated pods targeted by this Alertmanager cluster (their labels match the selector). diff --git a/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml index 50096e73d1d43c71dc7d2dd4269c36e1acd329e5..927a0ba6df860bfd200a3d77000c564c2ce0d10d 100644 --- a/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: podmonitors.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: PodMonitor listKind: PodMonitorList plural: podmonitors @@ -33,6 +35,18 @@ spec: jobLabel: description: The label to use to retrieve the job name from. type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer namespaceSelector: description: Selector to select which namespaces the Endpoints objects are discovered from. properties: @@ -50,6 +64,28 @@ spec: items: description: PodMetricsEndpoint defines a scrapeable endpoint of a Kubernetes Pod serving Prometheus metrics. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: 'BasicAuth allow an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint' properties: @@ -139,6 +175,77 @@ spec: type: string type: object type: array + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object params: additionalProperties: items: @@ -156,7 +263,7 @@ spec: description: ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint. type: string relabelings: - description: 'RelabelConfigs to apply to samples before ingestion. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + description: 'RelabelConfigs to apply to samples before scraping. Prometheus Operator automatically adds relabelings for a few standard Kubernetes fields and replaces original scrape job name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `<metric_relabel_configs>`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' properties: diff --git a/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml index 691b1e9fb7ca7465cc820a0a86cf50e3f98f9d5b..aa511ffb65b6727fd29cc0cc5ecb915a2fb9b144 100644 --- a/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: probes.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Probe listKind: ProbeList plural: probes @@ -30,21 +32,209 @@ spec: spec: description: Specification of desired Ingress selection for target discovery by Prometheus. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint' + properties: + password: + description: The secret in the service monitor namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: Secret to mount to read bearer token for scraping targets. The secret needs to be in the same namespace as the probe and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object interval: description: Interval at which targets are probed using the configured prober. If not specified Prometheus' global scrape interval is used. type: string jobName: description: The job name assigned to scraped metrics by default. type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + metricRelabelings: + description: MetricRelabelConfigs to apply to samples before ingestion. + items: + description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `<metric_relabel_configs>`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted value is matched. Default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available. + type: string + type: object + type: array module: description: 'The module to use for probing specifying how to probe the target. Example module configuring in the blackbox exporter: https://github.com/prometheus/blackbox_exporter/blob/master/example.yml' type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object prober: description: Specification for the prober to use for probing targets. The prober.URL parameter is required. Targets cannot be probed if left empty. properties: path: description: Path to collect metrics from. Defaults to `/probe`. type: string + proxyUrl: + description: Optional ProxyURL. + type: string scheme: description: HTTP scheme to use for scraping. Defaults to `http`. type: string @@ -54,9 +244,17 @@ spec: required: - url type: object + sampleLimit: + description: SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + format: int64 + type: integer scrapeTimeout: description: Timeout for scraping metrics from the Prometheus exporter. type: string + targetLimit: + description: TargetLimit defines a limit on the number of scraped targets that will be accepted. + format: int64 + type: integer targets: description: Targets defines a set of static and/or dynamically discovered targets to be probed using the prober. properties: @@ -145,6 +343,37 @@ spec: type: string description: Labels assigned to all metrics scraped from the targets. type: object + relabelingConfigs: + description: 'RelabelConfigs to apply to samples before ingestion. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + items: + description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `<metric_relabel_configs>`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted value is matched. Default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace is performed if the regular expression matches. Regex capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing labels. Their content is concatenated using the configured separator and matched against the configured regular expression for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written in a replace action. It is mandatory for replace actions. Regex capture groups are available. + type: string + type: object + type: array static: description: Targets is a list of URLs to probe using the configured prober. items: @@ -152,6 +381,99 @@ spec: type: array type: object type: object + tlsConfig: + description: TLS configuration to use when scraping the endpoint. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object type: object required: - spec diff --git a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml index eb40f0fc156a8e9ea662dc979c287878a4804ab9..1122aec463e260a50f47e74ba7f6edab522fad5a 100644 --- a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheuses.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Prometheus listKind: PrometheusList plural: prometheuses @@ -439,6 +441,28 @@ spec: apiVersion: description: Version of the Alertmanager API that Prometheus uses to send alerts. It can be "v1" or "v2". type: string + authorization: + description: Authorization section for this alertmanager endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object bearerTokenFile: description: BearerTokenFile to read from filesystem to use when authenticating to Alertmanager. type: string @@ -580,6 +604,31 @@ spec: apiserverConfig: description: APIServerConfig allows specifying a host and auth methods to access apiserver. If left empty, Prometheus is assumed to run inside of the cluster and will discover API servers automatically and use the pod's CA certificate and bearer token file at /var/run/secrets/kubernetes.io/serviceaccount/. properties: + authorization: + description: Authorization section for accessing apiserver + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth allow an endpoint to authenticate over basic authentication properties: @@ -805,8 +854,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1111,12 +1164,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1207,12 +1265,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1445,19 +1511,36 @@ spec: enableAdminAPI: description: 'Enable access to prometheus web admin API. Defaults to the value of `false`. WARNING: Enabling the admin APIs enables mutating endpoints, to delete data, shutdown Prometheus, and more. Enabling this should be done with care and the user is advised to add additional authentication authorization via a proxy to ensure only clients authorized to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' type: boolean + enableFeatures: + description: Enable access to Prometheus disabled features. By default, no features are enabled. Enabling disabled features is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice. For more information see https://prometheus.io/docs/prometheus/latest/disabled_features/ + items: + type: string + type: array + enforcedLabelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. If more than this number of labels are present post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + enforcedLabelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. If a label name is longer than this number post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + enforcedLabelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. If a label value is longer than this number post metric-relabeling, the entire scrape will be treated as failed. 0 means no limit. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer enforcedNamespaceLabel: - description: EnforcedNamespaceLabel enforces adding a namespace label of origin for each alert and metric that is user created. The label value will always be the namespace of the object that is being created. + description: "EnforcedNamespaceLabel If set, a label will be added to \n 1. all user-metrics (created by `ServiceMonitor`, `PodMonitor` and `ProbeConfig` object) and 2. in all `PrometheusRule` objects (except the ones excluded in `prometheusRulesExcludedFromEnforce`) to * alerting & recording rules and * the metrics used in their expressions (`expr`). \n Label name is this field's value. Label value is the namespace of the created object (mentioned above)." type: string enforcedSampleLimit: description: EnforcedSampleLimit defines global limit on number of scraped samples that will be accepted. This overrides any SampleLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the SampleLimit to keep overall number of samples/series under the desired limit. Note that if SampleLimit is lower that value will be taken instead. format: int64 type: integer enforcedTargetLimit: - description: EnforcedTargetLimit defines a global limit on the number of scraped targets. This overrides any TargetLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the TargetLimit to keep overall number of targets under the desired limit. Note that if TargetLimit is higher that value will be taken instead. + description: EnforcedTargetLimit defines a global limit on the number of scraped targets. This overrides any TargetLimit set per ServiceMonitor or/and PodMonitor. It is meant to be used by admins to enforce the TargetLimit to keep the overall number of targets under the desired limit. Note that if TargetLimit is lower, that value will be taken instead, except if either value is zero, in which case the non-zero value will be used. If both values are zero, no limit is enforced. format: int64 type: integer evaluationInterval: - description: Interval between consecutive evaluations. + description: 'Interval between consecutive evaluations. Default: `1m`' type: string externalLabels: additionalProperties: @@ -1484,7 +1567,7 @@ spec: type: object type: array initContainers: - description: 'InitContainers allows adding initContainers to the pod definition. Those can be used to e.g. fetch secrets for injection into the Prometheus configuration from external sources. Any errors during the execution of an initContainer will lead to a restart of the Pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ Using initContainers for any use case other then secret fetching is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice.' + description: 'InitContainers allows adding initContainers to the pod definition. Those can be used to e.g. fetch secrets for injection into the Prometheus configuration from external sources. Any errors during the execution of an initContainer will lead to a restart of the Pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ InitContainers described here modify an operator generated init containers if they share the same name and modifications are done via a strategic merge patch. The current init container name is: `init-config-reloader`. Overriding init containers is entirely outside the scope of what the maintainers will support and by doing so, you accept that this behaviour may break at any time without notice.' items: description: A single application container that you want to run within a pod. properties: @@ -1546,8 +1629,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1852,12 +1939,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1948,12 +2040,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2189,6 +2289,10 @@ spec: logLevel: description: Log level for Prometheus to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string @@ -2221,7 +2325,7 @@ spec: type: string type: object podMonitorNamespaceSelector: - description: Namespaces to be selected for PodMonitor discovery. If nil, only check own namespace. + description: Namespace's labels to match for PodMonitor discovery. If nil, only check own namespace. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. @@ -2391,6 +2495,31 @@ spec: items: description: RemoteReadSpec defines the remote_read configuration for prometheus. properties: + authorization: + description: Authorization section for remote read + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth for the URL. properties: @@ -2426,7 +2555,7 @@ spec: type: object type: object bearerToken: - description: bearer token for remote read. + description: Bearer token for remote read. type: string bearerTokenFile: description: File to read bearer token for remote read. @@ -2434,6 +2563,77 @@ spec: name: description: The name of the remote read queue, must be unique if specified. The name is used in metrics and logging in order to differentiate read configurations. Only valid in Prometheus versions 2.15.0 and newer. type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object proxyUrl: description: Optional ProxyURL type: string @@ -2562,6 +2762,31 @@ spec: items: description: RemoteWriteSpec defines the remote_write configuration for prometheus. properties: + authorization: + description: Authorization section for remote write + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + credentialsFile: + description: File to read a secret from, mutually exclusive with Credentials (from SafeAuthorization) + type: string + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: BasicAuth for the URL. properties: @@ -2597,14 +2822,100 @@ spec: type: object type: object bearerToken: - description: File to read bearer token for remote write. + description: Bearer token for remote write. type: string bearerTokenFile: description: File to read bearer token for remote write. type: string + headers: + additionalProperties: + type: string + description: Custom HTTP headers to be sent along with each remote write request. Be aware that headers that are set by Prometheus itself can't be overwritten. Only valid in Prometheus versions 2.25.0 and newer. + type: object + metadataConfig: + description: MetadataConfig configures the sending of series metadata to remote storage. + properties: + send: + description: Whether metric metadata is sent to remote storage or not. + type: boolean + sendInterval: + description: How frequently metric metadata is sent to remote storage. + type: string + type: object name: description: The name of the remote write queue, must be unique if specified. The name is used in metrics and logging in order to differentiate queues. Only valid in Prometheus versions 2.15.0 and newer. type: string + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object proxyUrl: description: Optional ProxyURL type: string @@ -2639,6 +2950,9 @@ spec: remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string + sendExemplars: + description: Enables sending of exemplars over remote write. Note that exemplar-storage itself must be enabled using the enableFeature option for exemplars to be scraped in the first place. Only valid in Prometheus versions 2.27.0 and newer. + type: boolean tlsConfig: description: TLS Config to use for remote write. properties: @@ -2783,7 +3097,7 @@ spec: description: Name of Prometheus external label used to denote replica name. Defaults to the value of `prometheus_replica`. External label will _not_ be added when value is set to empty string (`""`). type: string replicas: - description: Number of instances to deploy for a Prometheus deployment. + description: Number of replicas of each shard to deploy for a Prometheus deployment. Number of replicas multiplied by shards is the total number of Pods created. format: int32 type: integer resources: @@ -2791,12 +3105,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2804,7 +3126,7 @@ spec: description: Time duration Prometheus shall retain data for. Default is '24h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` (milliseconds seconds minutes hours days weeks years). type: string retentionSize: - description: Maximum amount of disk space used by blocks. + description: 'Maximum amount of disk space used by blocks. Supported units: B, KB, MB, GB, TB, PB, EB. Ex: `512MB`.' type: string routePrefix: description: The route prefix Prometheus registers HTTP handlers for. This is useful, if using ExternalURL and a proxy is rewriting HTTP routes of a request, and the actual ExternalURL is still true, but the server serves requests under a different route prefix. For example for use with `kubectl proxy`. @@ -2887,7 +3209,7 @@ spec: type: object type: object scrapeInterval: - description: Interval between consecutive scrapes. + description: 'Interval between consecutive scrapes. Default: `1m`' type: string scrapeTimeout: description: Number of seconds to wait for target to respond before erroring. @@ -2974,7 +3296,7 @@ spec: description: ServiceAccountName is the name of the ServiceAccount to use to run the Prometheus Pods. type: string serviceMonitorNamespaceSelector: - description: Namespaces to be selected for ServiceMonitor discovery. If nil, only check own namespace. + description: Namespace's labels to match for ServiceMonitor discovery. If nil, only check own namespace. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. @@ -3036,6 +3358,10 @@ spec: sha: description: 'SHA of Prometheus container image to be deployed. Defaults to the value of `version`. Similar to a tag, but the SHA explicitly deploys an immutable container image. Version and Tag are ignored if SHA is set. Deprecated: use ''image'' instead. The image digest can be specified as part of the image URL.' type: string + shards: + description: 'EXPERIMENTAL: Number of shards to distribute targets onto. Number of replicas multiplied by shards is the total number of Pods created. Note that scaling down shards will not reshard data onto remaining instances, it must be manually moved. Increasing shards will not reshard data either but it will continue to be available from the same instances. To query globally use Thanos sidecar and Thanos querier or remote write data to a central location. Sharding is done on the content of the `__address__` target meta-label.' + format: int32 + type: integer storage: description: Storage spec to specify how storage shall be used. properties: @@ -3049,8 +3375,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -3107,12 +3437,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -3166,7 +3504,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -3331,7 +3673,7 @@ spec: description: MinTime for Thanos sidecar to be configured with. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. type: string objectStorageConfig: - description: ObjectStorageConfig configures object storage in Thanos. + description: ObjectStorageConfig configures object storage in Thanos. Alternative to ObjectStorageConfigFile, and lower order priority. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -3345,17 +3687,31 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the object storage configuration file. When used alongside with ObjectStorageConfig, ObjectStorageConfigFile takes precedence. + type: string + readyTimeout: + description: ReadyTimeout is the maximum time Thanos sidecar will wait for Prometheus to start. Eg 10m + type: string resources: description: Resources defines the resource requirements for the Thanos sidecar. If not provided, no requests/limits will be set properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -3380,9 +3736,40 @@ spec: required: - key type: object + tracingConfigFile: + description: TracingConfig specifies the path of the tracing configuration file. When used alongside with TracingConfig, TracingConfigFile takes precedence. + type: string version: description: Version describes the version of Thanos to use. type: string + volumeMounts: + description: VolumeMounts allows configuration of additional VolumeMounts on the output StatefulSet definition. VolumeMounts specified will be appended to other VolumeMounts in the thanos-sidecar container. + items: + description: VolumeMount describes a mounting of a Volume within a container. + properties: + mountPath: + description: Path within the container at which the volume should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). + type: string + subPathExpr: + description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array type: object tolerations: description: If specified, the pod's tolerations. @@ -3705,8 +4092,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3725,8 +4116,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -4033,8 +4428,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -4289,6 +4688,118 @@ spec: pageTitle: description: The prometheus web page title type: string + tlsConfig: + description: WebTLSConfig defines the TLS parameters for HTTPS. + properties: + cert: + description: Contains the TLS certificate for the server. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + cipherSuites: + description: 'List of supported cipher suites for TLS versions up to TLS 1.2. If empty, Go default cipher suites are used. Available cipher suites are documented in the go documentation: https://golang.org/pkg/crypto/tls/#pkg-constants' + items: + type: string + type: array + client_ca: + description: Contains the CA certificate for client certificate authentication to the server. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientAuthType: + description: 'Server policy for client authentication. Maps to ClientAuth Policies. For more detail on clientAuth options: https://golang.org/pkg/crypto/tls/#ClientAuthType' + type: string + curvePreferences: + description: 'Elliptic curves that will be used in an ECDHE handshake, in preference order. Available curves are documented in the go documentation: https://golang.org/pkg/crypto/tls/#CurveID' + items: + type: string + type: array + keySecret: + description: Secret containing the TLS key for the server. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + maxVersion: + description: Maximum TLS version that is acceptable. Defaults to TLS13. + type: string + minVersion: + description: Minimum TLS version that is acceptable. Defaults to TLS12. + type: string + preferServerCipherSuites: + description: Controls whether the server selects the client's most preferred cipher suite, or the server's most preferred cipher suite. If true then the server's preference, as expressed in the order of elements in cipherSuites, is used. + type: boolean + required: + - cert + - keySecret + type: object type: object type: object status: @@ -4299,7 +4810,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed objects are being performed. Only delete actions will be performed. + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. type: boolean replicas: description: Total number of non-terminated pods targeted by this Prometheus deployment (their labels match the selector). diff --git a/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml index cf99071592cb8b159b84d677abcd743d92a45f85..a44c7058c0ba6affcf55d62fde82e3b67fead38a 100644 --- a/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheusrules.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: PrometheusRule listKind: PrometheusRuleList plural: prometheusrules @@ -17,7 +19,7 @@ spec: - name: v1 schema: openAPIV3Schema: - description: PrometheusRule defines alerting rules for a Prometheus instance + description: PrometheusRule defines recording and alerting rules for a Prometheus instance properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' @@ -43,7 +45,7 @@ spec: type: string rules: items: - description: Rule describes an alerting or recording rule. + description: 'Rule describes an alerting or recording rule See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule' properties: alert: type: string diff --git a/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index 6d946998237634fd94aae490a198e441dbc79285..e5f847714c851341fdf4cdba3c544c628e4d8e21 100644 --- a/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: servicemonitors.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ServiceMonitor listKind: ServiceMonitorList plural: servicemonitors @@ -35,6 +37,28 @@ spec: items: description: Endpoint defines a scrapeable endpoint serving Prometheus metrics. properties: + authorization: + description: Authorization section for this endpoint + properties: + credentials: + description: The secret's key that contains the credentials of the request + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: + description: Set the authentication type. Defaults to Bearer, Basic will cause an error + type: string + type: object basicAuth: description: 'BasicAuth allow an endpoint to authenticate over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' properties: @@ -127,6 +151,77 @@ spec: type: string type: object type: array + oauth2: + description: OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + properties: + clientId: + description: The secret or configmap containing the OAuth2 client id + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + type: object + clientSecret: + description: The secret containing the OAuth2 client secret + properties: + key: + description: The key of the secret to select from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object + endpointParams: + additionalProperties: + type: string + description: Parameters to append to the token URL + type: object + scopes: + description: OAuth2 scopes used for the token request + items: + type: string + type: array + tokenUrl: + description: The URL to fetch the token from + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object params: additionalProperties: items: @@ -144,7 +239,7 @@ spec: description: ProxyURL eg http://proxyserver:2195 Directs scrapes to proxy through this endpoint. type: string relabelings: - description: 'RelabelConfigs to apply to samples before scraping. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + description: 'RelabelConfigs to apply to samples before scraping. Prometheus Operator automatically adds relabelings for a few standard Kubernetes fields and replaces original scrape job name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It defines `<metric_relabel_configs>`-section of Prometheus configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' properties: @@ -291,10 +386,22 @@ spec: type: object type: array jobLabel: - description: The label to use to retrieve the job name from. + description: "Chooses the label of the Kubernetes `Endpoints`. Its value will be used for the `job`-label's value of the created metrics. \n Default & fallback value: the name of the respective Kubernetes `Endpoint`." type: string + labelLimit: + description: Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelNameLengthLimit: + description: Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer + labelValueLengthLimit: + description: Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + format: int64 + type: integer namespaceSelector: - description: Selector to select which namespaces the Endpoints objects are discovered from. + description: Selector to select which namespaces the Kubernetes Endpoints objects are discovered from. properties: any: description: Boolean describing whether all namespaces are selected in contrast to a list restricting them. @@ -306,7 +413,7 @@ spec: type: array type: object podTargetLabels: - description: PodTargetLabels transfers labels on the Kubernetes Pod onto the target. + description: PodTargetLabels transfers labels on the Kubernetes `Pod` onto the created metrics. items: type: string type: array @@ -345,7 +452,7 @@ spec: type: object type: object targetLabels: - description: TargetLabels transfers labels on the Kubernetes Service onto the target. + description: TargetLabels transfers labels from the Kubernetes `Service` onto the created metrics. All labels set in `selector.matchLabels` are automatically transferred. items: type: string type: array diff --git a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml index a6c61355ff8c5cceab71f662b13aed8b7e8ae912..bd9ede435271facafce0cf62083b5f5fd1089e28 100644 --- a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml @@ -2,12 +2,14 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: thanosrulers.monitoring.coreos.com spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ThanosRuler listKind: ThanosRulerList plural: thanosrulers @@ -462,8 +464,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -768,12 +774,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -864,12 +875,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1283,8 +1302,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1589,12 +1612,17 @@ spec: description: If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' properties: @@ -1685,12 +1713,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -1931,13 +1967,17 @@ spec: logLevel: description: Log level for ThanosRuler to be configured with. type: string + minReadySeconds: + description: Minimum number of seconds for which a newly created pod should be ready without any of its container crashing for it to be considered available. Defaults to 0 (pod will be considered available as soon as it is ready) This is an alpha field and requires enabling StatefulSetMinReadySeconds feature gate. + format: int32 + type: integer nodeSelector: additionalProperties: type: string description: Define which Nodes the Pods are scheduled on. type: object objectStorageConfig: - description: ObjectStorageConfig configures object storage in Thanos. + description: ObjectStorageConfig configures object storage in Thanos. Alternative to ObjectStorageConfigFile, and lower order priority. properties: key: description: The key of the secret to select from. Must be a valid secret key. @@ -1951,6 +1991,9 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the object storage configuration file. When used alongside with ObjectStorageConfig, ObjectStorageConfigFile takes precedence. + type: string paused: description: When a ThanosRuler deployment is paused, no actions except for deletion will be performed on the underlying objects. type: boolean @@ -2022,12 +2065,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2186,8 +2237,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -2244,12 +2299,20 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object type: object @@ -2303,7 +2366,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object conditions: @@ -2646,8 +2713,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2666,8 +2737,12 @@ spec: description: 'What type of storage medium should back this directory. The default is "" which means to use the node''s default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. @@ -2974,8 +3049,12 @@ spec: description: 'Container name: required for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string diff --git a/manifests/setup/prometheus-operator-clusterRole.yaml b/manifests/setup/prometheus-operator-clusterRole.yaml index 8153e807f43953a2f283d09dc2dcbfe6a48e10d8..5eea2738b8e61d06f25323a90c2b5ae5fce7740a 100644 --- a/manifests/setup/prometheus-operator-clusterRole.yaml +++ b/manifests/setup/prometheus-operator-clusterRole.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator rules: - apiGroups: diff --git a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml index 00f305a0793685debbd6314550d0b067a1f4e436..3ce2b874876404ef5c5ae52ca735dffed4e2492c 100644 --- a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml +++ b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/setup/prometheus-operator-deployment.yaml b/manifests/setup/prometheus-operator-deployment.yaml index 119f639057ef1355a0ef99e49642e319ae2d4872..b832acb8dca8387baf26cf898a8b5ebb4bc268ef 100644 --- a/manifests/setup/prometheus-operator-deployment.yaml +++ b/manifests/setup/prometheus-operator-deployment.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -13,19 +14,22 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus template: metadata: + annotations: + kubectl.kubernetes.io/default-container: prometheus-operator labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 spec: containers: - args: - --kubelet-service=kube-system/kubelet - - --logtostderr=true - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.43.2 - image: quay.io/prometheus-operator/prometheus-operator:v0.43.2 + - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.50.0 + image: quay.io/prometheus-operator/prometheus-operator:v0.50.0 name: prometheus-operator ports: - containerPort: 8080 @@ -44,15 +48,24 @@ spec: - --secure-listen-address=:8443 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8080/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.11.0 name: kube-rbac-proxy ports: - containerPort: 8443 name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 nodeSelector: - beta.kubernetes.io/os: linux + kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/setup/prometheus-operator-service.yaml b/manifests/setup/prometheus-operator-service.yaml index 37d8f50422879c1ba6eed47cb2089b3f4a908e25..a8161b086ce80ad8e78af7b5ce396f03dc07da84 100644 --- a/manifests/setup/prometheus-operator-service.yaml +++ b/manifests/setup/prometheus-operator-service.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring spec: @@ -16,3 +17,4 @@ spec: selector: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/setup/prometheus-operator-serviceAccount.yaml b/manifests/setup/prometheus-operator-serviceAccount.yaml index cc4a1f8de44a32020b71e4761159ba49406405a5..9acb906d900d6af091c7dfac308ca9f762c49d5c 100644 --- a/manifests/setup/prometheus-operator-serviceAccount.yaml +++ b/manifests/setup/prometheus-operator-serviceAccount.yaml @@ -4,6 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.43.2 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.50.0 name: prometheus-operator namespace: monitoring diff --git a/scripts/generate-schemas.sh b/scripts/generate-schemas.sh new file mode 100755 index 0000000000000000000000000000000000000000..06b9bbe327381793649e7083a1480c207cc469dd --- /dev/null +++ b/scripts/generate-schemas.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +DIR="crdschemas" + +# Go to git repository root +cd ./$(git rev-parse --show-cdup) + +rm -rf "$DIR" +mkdir "$DIR" + +for crd in vendor/prometheus-operator/*-crd.libsonnet; do + jq '.spec.versions[0].schema.openAPIV3Schema' < "$crd" > "$DIR/$(basename "$crd" | sed 's/-crd.libsonnet/.json/')" +done diff --git a/scripts/generate-versions.sh b/scripts/generate-versions.sh new file mode 100755 index 0000000000000000000000000000000000000000..05c48a6351a0665de6534b408e08687ec0e9938b --- /dev/null +++ b/scripts/generate-versions.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -euo pipefail + +# Get component version from GitHub API +get_latest_version() { + echo >&2 "Checking release version for ${1}" + curl --retry 5 --silent --fail -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/repos/${1}/releases/latest" | jq '.tag_name' | tr -d '"v' +} + +# Get component version from version file +get_current_version() { + echo >&2 "Reading currently used version of ${1}" + v=$(jq -r ".${1}" "$VERSION_FILE") + if [ "${v}" == "" ]; then + echo >&2 "Couldn't read version of ${1} from $VERSION_FILE" + exit 1 + fi + echo "$v" +} + +# Get version from online source and filter out unstable releases. In case of unstable release use what is set in version file +get_version() { + component="${1}" + v="$(get_latest_version "${component}")" + + component="$(convert_to_camel_case "$(echo "${component}" | sed 's/^.*\///')")" + cv="$(get_current_version "${component}")" + + # Advanced AI heurestics to filter out common patterns suggesting new version is not stable /s + if [[ "$v" == "" ]] || [[ "$v" == *"alpha"* ]] || [[ "$v" == *"beta"* ]] || [[ "$v" == *"rc"* ]] || [[ "$v" == *"helm"* ]]; then + echo "$cv" + return + fi + + # Use higher version from new version and current version + v=$(printf '%s\n' "$v" "$cv" | sort -r | head -n1) + + echo "$v" +} + +convert_to_camel_case() { + echo "${1}" | sed -E 's/[ _-]([a-z])/\U\1/gi;s/^([A-Z])/\l\1/' +} + +# File is used to read current versions +VERSION_FILE="$(pwd)/jsonnet/kube-prometheus/versions.json" + +# token can be passed as `GITHUB_TOKEN` variable or passed as first argument +GITHUB_TOKEN=${GITHUB_TOKEN:-${1}} + +if [ -z "$GITHUB_TOKEN" ]; then + echo >&2 "GITHUB_TOKEN not set. Exiting" + exit 1 +fi + +cat <<-EOF +{ + "alertmanager": "$(get_version "prometheus/alertmanager")", + "blackboxExporter": "$(get_version "prometheus/blackbox_exporter")", + "grafana": "$(get_version "grafana/grafana")", + "kubeStateMetrics": "$(get_version "kubernetes/kube-state-metrics")", + "nodeExporter": "$(get_version "prometheus/node_exporter")", + "prometheus": "$(get_version "prometheus/prometheus")", + "prometheusAdapter": "$(get_version "kubernetes-sigs/prometheus-adapter")", + "prometheusOperator": "$(get_version "prometheus-operator/prometheus-operator")", + "kubeRbacProxy": "$(get_version "brancz/kube-rbac-proxy")", + "configmapReload": "$(get_version "jimmidyson/configmap-reload")" +} +EOF diff --git a/scripts/go.mod b/scripts/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..07b37e914dc055b64d63ecc96267fb85fa46d441 --- /dev/null +++ b/scripts/go.mod @@ -0,0 +1,11 @@ +module _ // go.mod created for tooling dependencies + +go 1.15 + +require ( + github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c + github.com/campoy/embedmd v1.0.0 + github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6 // 7 commits after 0.17.0. Needed by jsonnet linter + github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 + github.com/yannh/kubeconform v0.4.7 +) diff --git a/scripts/go.sum b/scripts/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..1f96b0edb69687387285c8efad6e329915aa1b9b --- /dev/null +++ b/scripts/go.sum @@ -0,0 +1,74 @@ +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs= +github.com/beevik/etree v1.1.0/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A= +github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c h1:hb6WqfcKQZlNx/vahy51SaIvKnoXD5609Nm0PC4msEM= +github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c/go.mod h1:+00lOjYXPgMfxHVPvg9GDtc3BX5Xh5aFpB4gMB8gfMo= +github.com/campoy/embedmd v1.0.0 h1:V4kI2qTJJLf4J29RzI/MAt2c3Bl4dQSYPuflzwFH2hY= +github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= +github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= +github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/google/go-jsonnet v0.17.0 h1:/9NIEfhK1NQRKl3sP2536b2+x5HnZMdql7x3yK/l8JY= +github.com/google/go-jsonnet v0.17.0/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= +github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6 h1:91EupyycmO5ctzKuWEZ9nX0Cal1NveMiWcXxmRtLyLQ= +github.com/google/go-jsonnet v0.17.1-0.20210101181740-31d71aaccda6/go.mod h1:sOcuej3UW1vpPTZOr8L7RQimqai1a57bt5j22LzGZCw= +github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 h1:4BKZ6LDqPc2wJDmaKnmYD/vDjUptJtnUpai802MibFc= +github.com/jsonnet-bundler/jsonnet-bundler v0.4.0/go.mod h1:/by7P/OoohkI3q4CgSFqcoFsVY+IaNbzOVDknEsKDeU= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA= +github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.11 h1:FxPOTFNqGkuDUGi3H/qkUbQO4ZiBa2brKq5r0l8TGeM= +github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= +github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw= +github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f h1:J9EGpcZtP0E/raorCMxlFGSTBrsSlaDGf3jU/qvAE2c= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= +github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= +github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= +github.com/yannh/kubeconform v0.4.2 h1:8ve/dz6ns9tT5efR1Qfn8569JkenPFqnWcVWGz3lqPw= +github.com/yannh/kubeconform v0.4.2/go.mod h1:Ysf3RSreh2rX8IJsVt/uT3Um/U3e3ykx6Fcz8nCdskM= +github.com/yannh/kubeconform v0.4.7 h1:ExAjZYd6D0WnG/Eq/IRxvTebPbARh6e6M96Pq8Xy5u0= +github.com/yannh/kubeconform v0.4.7/go.mod h1:lhkEiaDOtSewHGGZ8iR2iiTC0CSnR7xbMEtyL4bm4rE= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= diff --git a/test.sh b/scripts/test.sh similarity index 80% rename from test.sh rename to scripts/test.sh index 58b06fe8cce60d57cdfac33553f9f0fc9e731206..6774c63ad9d253e989cc9d3ee41c37b2479d5408 100755 --- a/test.sh +++ b/scripts/test.sh @@ -5,6 +5,8 @@ set -o pipefail # Make sure to use project tooling PATH="$(pwd)/tmp/bin:${PATH}" +TESTFILE="$(pwd)/tmp/test.jsonnet" +mkdir -p "$(pwd)/tmp" for i in examples/jsonnet-snippets/*.jsonnet; do [ -f "$i" ] || break @@ -14,13 +16,13 @@ for i in examples/jsonnet-snippets/*.jsonnet; do snippet="local kp = $fileContent; $(<examples/jsonnet-build-snippet/build-snippet.jsonnet)" - echo "${snippet}" > "test.jsonnet" + echo "${snippet}" > "${TESTFILE}" echo "\`\`\`" echo "${snippet}" echo "\`\`\`" echo "" - jsonnet -J vendor "test.jsonnet" > /dev/null - rm -rf "test.jsonnet" + jsonnet -J vendor "${TESTFILE}" > /dev/null + rm -rf "${TESTFILE}" done for i in examples/*.jsonnet; do diff --git a/scripts/tools.go b/scripts/tools.go index b6cba4f29006323743b508f4c57349140e4fefba..64813348a240ce1ce110681b229f495559fe9690 100644 --- a/scripts/tools.go +++ b/scripts/tools.go @@ -8,6 +8,8 @@ import ( _ "github.com/brancz/gojsontoyaml" _ "github.com/campoy/embedmd" _ "github.com/google/go-jsonnet/cmd/jsonnet" + _ "github.com/google/go-jsonnet/cmd/jsonnet-lint" _ "github.com/google/go-jsonnet/cmd/jsonnetfmt" + _ "github.com/yannh/kubeconform/cmd/kubeconform" _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" )