From d4b581fa49b4bac7f01b9fc37a409ddecf0a2199 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk <fbranczyk@gmail.com> Date: Wed, 14 Dec 2016 17:48:54 -0800 Subject: [PATCH] use ServiceMonitors to generate Prometheus Kubernetes config --- assets/prometheus/prometheus.yaml | 85 ----------------- assets/prometheus/rules/kubernetes.rules | 8 +- hack/cluster-monitoring/deploy | 2 +- hack/scripts/generate-configmaps.sh | 3 - .../alertmanager/alertmanager-service.yaml | 2 +- manifests/alertmanager/alertmanager.yaml | 2 +- .../example-app/prometheus-frontend.yaml | 2 +- .../exporters/kube-state-metrics-depl.yaml | 2 +- .../exporters/kube-state-metrics-svc.yaml | 5 +- manifests/exporters/node-exporter-svc.yaml | 3 +- manifests/k8s/minikube/kube-apiserver.yaml | 27 ------ manifests/k8s/self-hosted/kube-apiserver.yaml | 16 ---- manifests/prometheus-operator.yaml | 5 +- manifests/prometheus/prometheus-k8s-cm.yaml | 92 ------------------- .../prometheus/prometheus-k8s-rules.yaml | 8 +- .../prometheus-k8s-servicemonitor.yaml | 69 ++++++++++++++ manifests/prometheus/prometheus-k8s-svc.yaml | 2 +- manifests/prometheus/prometheus-k8s.yaml | 12 ++- 18 files changed, 103 insertions(+), 242 deletions(-) delete mode 100644 assets/prometheus/prometheus.yaml delete mode 100644 manifests/k8s/minikube/kube-apiserver.yaml delete mode 100644 manifests/k8s/self-hosted/kube-apiserver.yaml delete mode 100644 manifests/prometheus/prometheus-k8s-cm.yaml create mode 100644 manifests/prometheus/prometheus-k8s-servicemonitor.yaml diff --git a/assets/prometheus/prometheus.yaml b/assets/prometheus/prometheus.yaml deleted file mode 100644 index 14decf23..00000000 --- a/assets/prometheus/prometheus.yaml +++ /dev/null @@ -1,85 +0,0 @@ -alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: alertmanager-main - source_labels: - - __meta_kubernetes_service_name - - action: keep - regex: monitoring - source_labels: - - __meta_kubernetes_namespace 
- - action: keep - regex: web - source_labels: - - __meta_kubernetes_endpoint_port_name - scheme: http - -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: -- /etc/prometheus/rules/*.rules - -scrape_configs: -- job_name: kubelets - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # Skip verification until we have resolved why the certificate validation - # for the kubelet on API server nodes fail. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - -# Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, -# and node-exporter, which we all consider part of a default setup. -- job_name: standard-endpoints - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # As for kubelets, certificate validation fails for the API server (node) - # and we circumvent it for now. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: prometheus|node-exporter|kube-state-metrics - - action: replace - source_labels: [__meta_kubernetes_service_name] - target_label: job - -# Scrapes the endpoint lists for the kube-dns server. Which we consider -# part of a default setup. 
-- job_name: kube-components - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: replace - source_labels: [__meta_kubernetes_service_label_k8s_app] - target_label: job - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: ".*-prometheus-discovery" - - action: keep - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "http-metrics.*|https-metrics.*" - - action: replace - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "https-metrics.*" - target_label: __scheme__ - replacement: https diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index c0dddb92..157eb3fa 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -171,7 +171,7 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 ALERT K8SNodeDown - IF up{job="kubelets"} == 0 + IF up{job="kubelet"} == 0 FOR 1h LABELS { service = "k8s", @@ -226,7 +226,7 @@ ALERT K8SKubeletNodeExporterDown } ALERT K8SKubeletDown - IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { service = "k8s", @@ -323,7 +323,7 @@ ALERT K8SConntrackTuningMissing } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m LABELS { service = "k8s", @@ -335,7 +335,7 @@ ALERT K8STooManyOpenFiles } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / 
process_max_fds > 80 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m LABELS { service = "k8s", diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index cbf0270e..c3e2667f 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -26,7 +26,6 @@ echo "done!" kctl apply -f manifests/exporters kctl apply -f manifests/grafana -kctl apply -f manifests/prometheus/prometheus-k8s-cm.yaml kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml kctl apply -f manifests/prometheus/prometheus-k8s-svc.yaml @@ -36,6 +35,7 @@ kctl apply -f manifests/alertmanager/alertmanager-service.yaml # `kubectl apply` is currently not working for third party resources so we are # using `kubectl create` here for the time being. # (https://github.com/kubernetes/kubernetes/issues/29542) +kctl create -f manifests/prometheus/prometheus-k8s-servicemonitor.yaml kctl create -f manifests/prometheus/prometheus-k8s.yaml kctl create -f manifests/alertmanager/alertmanager.yaml diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh index 6fb7b6e3..aa38878b 100755 --- a/hack/scripts/generate-configmaps.sh +++ b/hack/scripts/generate-configmaps.sh @@ -1,8 +1,5 @@ #!/bin/bash -# Generate Prometheus configuration ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s --from-file=assets/prometheus/prometheus.yaml -oyaml > manifests/prometheus/prometheus-k8s-cm.yaml - # Generate Alert Rules ConfigMap kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml diff --git a/manifests/alertmanager/alertmanager-service.yaml b/manifests/alertmanager/alertmanager-service.yaml index 86599c35..1608d14d 100644 --- a/manifests/alertmanager/alertmanager-service.yaml +++ b/manifests/alertmanager/alertmanager-service.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - alertmanager: 
alertmanager-main + alertmanager: main diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index ce67f3bb..fbd2d452 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -1,7 +1,7 @@ apiVersion: "monitoring.coreos.com/v1alpha1" kind: "Alertmanager" metadata: - name: "alertmanager-main" + name: "main" labels: alertmanager: "main" spec: diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 59decb14..915ded6a 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.4.1 + version: v1.5.2 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/exporters/kube-state-metrics-depl.yaml b/manifests/exporters/kube-state-metrics-depl.yaml index b044ba2e..6ef971ce 100644 --- a/manifests/exporters/kube-state-metrics-depl.yaml +++ b/manifests/exporters/kube-state-metrics-depl.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: kube-state-metrics - image: gcr.io/google_containers/kube-state-metrics:v0.3.0 + image: gcr.io/google_containers/kube-state-metrics:v0.4.1 ports: - name: metrics containerPort: 8080 diff --git a/manifests/exporters/kube-state-metrics-svc.yaml b/manifests/exporters/kube-state-metrics-svc.yaml index 8b68484b..607869e1 100644 --- a/manifests/exporters/kube-state-metrics-svc.yaml +++ b/manifests/exporters/kube-state-metrics-svc.yaml @@ -3,10 +3,13 @@ kind: Service metadata: labels: app: kube-state-metrics + k8s-app: kube-state-metrics + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" name: kube-state-metrics spec: ports: - - name: metrics + - name: http-metrics port: 8080 targetPort: metrics protocol: TCP diff --git a/manifests/exporters/node-exporter-svc.yaml b/manifests/exporters/node-exporter-svc.yaml 
index f2d24a42..46b1a3fd 100644 --- a/manifests/exporters/node-exporter-svc.yaml +++ b/manifests/exporters/node-exporter-svc.yaml @@ -3,12 +3,13 @@ kind: Service metadata: labels: app: node-exporter + k8s-app: node-exporter name: node-exporter spec: type: ClusterIP clusterIP: None ports: - - name: metrics + - name: http-metrics port: 9100 protocol: TCP selector: diff --git a/manifests/k8s/minikube/kube-apiserver.yaml b/manifests/k8s/minikube/kube-apiserver.yaml deleted file mode 100644 index 2b35a4ec..00000000 --- a/manifests/k8s/minikube/kube-apiserver.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -spec: - type: ClusterIP - clusterIP: None - ports: - - name: https-metrics - port: 8443 - protocol: TCP ---- -apiVersion: v1 -kind: Endpoints -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -subsets: -- addresses: - - ip: 192.168.99.100 - ports: - - name: https-metrics - port: 8443 - protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-apiserver.yaml b/manifests/k8s/self-hosted/kube-apiserver.yaml deleted file mode 100644 index 72b1c08f..00000000 --- a/manifests/k8s/self-hosted/kube-apiserver.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -spec: - selector: - k8s-app: kube-apiserver - type: ClusterIP - clusterIP: None - ports: - - name: https-metrics - port: 443 - targetPort: 443 - protocol: TCP diff --git a/manifests/prometheus-operator.yaml b/manifests/prometheus-operator.yaml index bb1dab9a..78bc11d7 100644 --- a/manifests/prometheus-operator.yaml +++ b/manifests/prometheus-operator.yaml @@ -13,7 +13,10 @@ spec: spec: containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.2.1 + image: quay.io/coreos/prometheus-operator:v0.6.0 + args: + - "--kubelet-object=kube-system/kubelet" + - 
"--config-reloader-image=quay.io/coreos/configmap-reload:latest" resources: requests: cpu: 100m diff --git a/manifests/prometheus/prometheus-k8s-cm.yaml b/manifests/prometheus/prometheus-k8s-cm.yaml deleted file mode 100644 index 5e4a9cd3..00000000 --- a/manifests/prometheus/prometheus-k8s-cm.yaml +++ /dev/null @@ -1,92 +0,0 @@ -apiVersion: v1 -data: - prometheus.yaml: | - alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: alertmanager-main - source_labels: - - __meta_kubernetes_service_name - - action: keep - regex: monitoring - source_labels: - - __meta_kubernetes_namespace - - action: keep - regex: web - source_labels: - - __meta_kubernetes_endpoint_port_name - scheme: http - - global: - scrape_interval: 15s - evaluation_interval: 15s - - rule_files: - - /etc/prometheus/rules/*.rules - - scrape_configs: - - job_name: kubelets - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # Skip verification until we have resolved why the certificate validation - # for the kubelet on API server nodes fail. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - # Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, - # and node-exporter, which we all consider part of a default setup. - - job_name: standard-endpoints - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # As for kubelets, certificate validation fails for the API server (node) - # and we circumvent it for now. 
- insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: prometheus|node-exporter|kube-state-metrics - - action: replace - source_labels: [__meta_kubernetes_service_name] - target_label: job - - # Scrapes the endpoint lists for the kube-dns server. Which we consider - # part of a default setup. - - job_name: kube-components - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: replace - source_labels: [__meta_kubernetes_service_label_k8s_app] - target_label: job - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: ".*-prometheus-discovery" - - action: keep - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "http-metrics.*|https-metrics.*" - - action: replace - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "https-metrics.*" - target_label: __scheme__ - replacement: https -kind: ConfigMap -metadata: - creationTimestamp: null - name: prometheus-k8s diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 6e83500e..08f6dddc 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -226,7 +226,7 @@ data: histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 ALERT K8SNodeDown - IF up{job="kubelets"} == 0 + IF up{job="kubelet"} == 0 FOR 1h LABELS { service = "k8s", @@ -281,7 +281,7 @@ data: } ALERT K8SKubeletDown - IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + IF absent(up{job="kubelet"}) or count by (cluster) 
(up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { service = "k8s", @@ -378,7 +378,7 @@ data: } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m LABELS { service = "k8s", @@ -390,7 +390,7 @@ data: } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m LABELS { service = "k8s", diff --git a/manifests/prometheus/prometheus-k8s-servicemonitor.yaml b/manifests/prometheus/prometheus-k8s-servicemonitor.yaml new file mode 100644 index 00000000..dbad7e5f --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-servicemonitor.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-apps: https +spec: + jobLabel: provider + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-https + labels: + k8s-apps: https +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: https-metrics + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-http + 
labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + - monitoring + endpoints: + - port: http-metrics + interval: 15s + - port: http-metrics-dnsmasq + interval: 15s + - port: http-metrics-skydns + interval: 15s diff --git a/manifests/prometheus/prometheus-k8s-svc.yaml b/manifests/prometheus/prometheus-k8s-svc.yaml index d3d25d2b..a558f30f 100644 --- a/manifests/prometheus/prometheus-k8s-svc.yaml +++ b/manifests/prometheus/prometheus-k8s-svc.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - prometheus: prometheus-k8s + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 6ed1fe90..a593f041 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -1,11 +1,14 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: Prometheus metadata: - name: prometheus-k8s + name: k8s labels: prometheus: k8s spec: - version: v1.4.1 + version: v1.5.2 + serviceMonitorSelector: + matchExpressions: + - {key: k8s-apps, operator: Exists} resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi @@ -13,3 +16,8 @@ spec: # production use. This value is mainly meant for demonstration/testing # purposes. memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web -- GitLab