diff --git a/README.md b/README.md index bf61d9aeb747c927f066d4f364db727d2a52ff58..67cbd4c15598cdaf6ac10904c064ad4bf236cd78 100644 --- a/README.md +++ b/README.md @@ -369,3 +369,23 @@ The Prometheus `/targets` page will show the kubelet job with the error `403 Una #### Authorization problem The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations. + +### kube-state-metrics resource usage + +In some environments, kube-state-metrics may need additional +resources. One driver of higher resource needs is a high number of +namespaces. There may be others. + +kube-state-metrics resource allocation is managed by +[addon-resizer](https://github.com/kubernetes/autoscaler/tree/master/addon-resizer/nanny). +You can control its parameters by setting variables in the +config. They default to: + +``` jsonnet + kubeStateMetrics+:: { + baseCPU: '100m', + cpuPerNode: '2m', + baseMemory: '150Mi', + memoryPerNode: '30Mi', + } +``` diff --git a/experimental/metrics-server/metrics-server-cluster-role.yaml b/experimental/metrics-server/metrics-server-cluster-role.yaml index 6976f5ce042609e8c7012f7a9ec000802225c480..38844d9a69d82ed6eed956d14467becd10b91714 100644 --- a/experimental/metrics-server/metrics-server-cluster-role.yaml +++ b/experimental/metrics-server/metrics-server-cluster-role.yaml @@ -8,6 +8,7 @@ rules: resources: - pods - nodes + - nodes/stats - namespaces verbs: - get diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet index 46a5e36dd4705ffbd7cc6cb318a341abdaf5a186..5c24f09f3c742824d445b4559e886cf8d5c32afe 100644 --- a/jsonnet/kube-prometheus/alerts/node.libsonnet +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -11,7 +11,7 @@ summary: 'Node disk is running full within 24 hours', }, expr: ||| - 
predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{%(nodeExporterSelector)s} ||| % $._config, 'for': '30m', labels: { @@ -25,7 +25,7 @@ summary: 'Node disk is running full within 2 hours', }, expr: ||| - predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + predict_linear(node_filesystem_free{%(nodeExporterSelector)s,mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{%(nodeExporterSelector)s} ||| % $._config, 'for': '10m', labels: { diff --git a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index c36f293b33e367478f11adcfa696b67b319c7f33..2805fc9da993862cb0c8a135a1a220bb2f3e309c 100644 --- a/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -4,6 +4,17 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', + kubeStateMetrics+:: { + collectors: '', // empty string gets a default set + scrapeInterval: '30s', + scrapeTimeout: '30s', + + baseCPU: '100m', + baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', + }, + versions+:: { kubeStateMetrics: 'v1.3.1', kubeRbacProxy: 'v0.3.1', @@ -137,19 +148,19 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082', - ]) + - container.mixin.resources.withRequests({ cpu: '102m', memory: '180Mi' }) + - container.mixin.resources.withLimits({ cpu: '102m', memory: '180Mi' }); + ] + if $._config.kubeStateMetrics.collectors != '' then ['--collectors=' + 
$._config.kubeStateMetrics.collectors] else []) + + container.mixin.resources.withRequests({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }) + + container.mixin.resources.withLimits({ cpu: $._config.kubeStateMetrics.baseCPU, memory: $._config.kubeStateMetrics.baseMemory }); local addonResizer = container.new('addon-resizer', $._config.imageRepos.addonResizer + ':' + $._config.versions.addonResizer) + container.withCommand([ '/pod_nanny', '--container=kube-state-metrics', - '--cpu=100m', - '--extra-cpu=2m', - '--memory=150Mi', - '--extra-memory=30Mi', + '--cpu=' + $._config.kubeStateMetrics.baseCPU, + '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, + '--memory=' + $._config.kubeStateMetrics.baseMemory, + '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, '--threshold=5', '--deployment=kube-state-metrics', ]) + @@ -258,7 +269,8 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; { port: 'https-main', scheme: 'https', - interval: '30s', + interval: $._config.kubeStateMetrics.scrapeInterval, + scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, honorLabels: true, bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', tlsConfig: { diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index 6f30397aa749223fcdb71fb6995f5934b5d908c4..9d782f51938ac305d3c43268123ac0deb3b033d0 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Alertmanager - cluster. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerSpec is a specification of the desired behavior + of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: affinity: description: Affinity is a group of affinity scheduling rules. @@ -2372,9 +2372,9 @@ spec: description: Version the cluster should be on. type: string status: - description: 'Most recent observed status of the Alertmanager cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'AlertmanagerStatus is the most recent observed status of the + Alertmanager cluster. Read-only. Not included when requesting from the + apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 140deffa2d11881a2b979ff7da3d3c1ca5f9d293..df1274eb35705e9b222323fccfcbaf6bf68006fe 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -23,8 +23,8 @@ spec: submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' type: string spec: - description: 'Specification of the desired behavior of the Prometheus cluster. 
- More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusSpec is a specification of the desired behavior + of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: additionalAlertManagerConfigs: description: SecretKeySelector selects a key of a Secret. @@ -2862,7 +2862,7 @@ spec: description: Peers is a DNS name for Thanos to discover peers through. type: string s3: - description: ThanosSpec defines parameters for of AWS Simple Storage + description: ThanosS3Spec defines parameters for of AWS Simple Storage Service (S3) with Thanos. (S3 compatible services apply as well) properties: accessKey: @@ -2961,9 +2961,9 @@ spec: description: Version of Prometheus to be deployed. type: string status: - description: 'Most recent observed status of the Prometheus cluster. Read-only. - Not included when requesting from the apiserver, only from the Prometheus - Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + description: 'PrometheusStatus is the most recent observed status of the + Prometheus cluster. Read-only. Not included when requesting from the apiserver, + only from the Prometheus Operator API itself. 
More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' properties: availableReplicas: description: Total number of available pods (ready for at least minReadySeconds) diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml index f3068cf8ce71885c2e0896626a2f767b3e8479d5..9d96bfebf1c8918634bf181c590afa5ca5f069cf 100644 --- a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -169,7 +169,7 @@ spec: description: The label to use to retrieve the job name from. type: string namespaceSelector: - description: A selector for selecting namespaces either selecting all + description: NamespaceSelector is a selector for selecting either all namespaces or a list of namespaces. properties: any: diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 358fb6e228a108e4ebb3b60fc13418351aec1f10..a08269803127c74055588613b96ee1ee85c445ff 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -18,6 +18,7 @@ spec: containers: - args: - --kubelet-service=kube-system/kubelet + - -logtostderr=true - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1 - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.22.2 image: quay.io/coreos/prometheus-operator:v0.22.2 @@ -28,10 +29,13 @@ spec: resources: limits: cpu: 200m - memory: 100Mi + memory: 200Mi requests: cpu: 100m - memory: 50Mi + memory: 100Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true nodeSelector: beta.kubernetes.io/os: linux securityContext: diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 
c7bb25c66f6025bce9edcf290623c0e95a305c62..065c87a9821298a18eb150d2f19d89b4e26495ab 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -55,11 +55,11 @@ spec: name: kube-state-metrics resources: limits: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi requests: - cpu: 102m - memory: 180Mi + cpu: 100m + memory: 150Mi - command: - /pod_nanny - --container=kube-state-metrics diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index 3d1073ade8737ed1773f1750dfefc54600540e18..2100449da24436a7361342a2186a6d0cb96736d2 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -12,6 +12,7 @@ spec: interval: 30s port: https-main scheme: https + scrapeTimeout: 30s tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 49c4a9954b7f092215f3fe90cf7cb3bded4a17f8..5af7d2fa93939357fe332d4bfc4786fc3b2d6680 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -744,7 +744,7 @@ spec: full within the next 24 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 24 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 + predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{job="node-exporter"} for: 30m labels: severity: warning @@ -754,7 +754,7 @@ spec: full within the next 2 hours (mounted at {{$labels.mountpoint}}) summary: Node disk is running full within 2 hours expr: | - predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 + 
predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{job="node-exporter"} for: 10m labels: severity: critical