diff --git a/assets/prometheus/prometheus.yaml b/assets/prometheus/prometheus.yaml deleted file mode 100644 index 14decf23708b82ec1aa9fb9dfd53cdd98654966a..0000000000000000000000000000000000000000 --- a/assets/prometheus/prometheus.yaml +++ /dev/null @@ -1,85 +0,0 @@ -alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: alertmanager-main - source_labels: - - __meta_kubernetes_service_name - - action: keep - regex: monitoring - source_labels: - - __meta_kubernetes_namespace - - action: keep - regex: web - source_labels: - - __meta_kubernetes_endpoint_port_name - scheme: http - -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: -- /etc/prometheus/rules/*.rules - -scrape_configs: -- job_name: kubelets - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # Skip verification until we have resolved why the certificate validation - # for the kubelet on API server nodes fail. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - -# Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, -# and node-exporter, which we all consider part of a default setup. -- job_name: standard-endpoints - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # As for kubelets, certificate validation fails for the API server (node) - # and we circumvent it for now. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: prometheus|node-exporter|kube-state-metrics - - action: replace - source_labels: [__meta_kubernetes_service_name] - target_label: job - -# Scrapes the endpoint lists for the kube-dns server. 
Which we consider -# part of a default setup. -- job_name: kube-components - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: replace - source_labels: [__meta_kubernetes_service_label_k8s_app] - target_label: job - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: ".*-prometheus-discovery" - - action: keep - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "http-metrics.*|https-metrics.*" - - action: replace - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "https-metrics.*" - target_label: __scheme__ - replacement: https diff --git a/assets/prometheus/rules/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules index c0dddb92f652159f742b800945721001f05a9b61..157eb3fa8cb61b79ac7340a12b5d89542f3ff28b 100644 --- a/assets/prometheus/rules/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -171,7 +171,7 @@ cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} = histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 ALERT K8SNodeDown - IF up{job="kubelets"} == 0 + IF up{job="kubelet"} == 0 FOR 1h LABELS { service = "k8s", @@ -226,7 +226,7 @@ ALERT K8SKubeletNodeExporterDown } ALERT K8SKubeletDown - IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { service = "k8s", @@ -323,7 +323,7 @@ ALERT K8SConntrackTuningMissing } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m LABELS { service = "k8s", @@ -335,7 +335,7 @@ ALERT 
K8STooManyOpenFiles } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m LABELS { service = "k8s", diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index cbf0270e8a57114ef2cf0eb5a7982f1e5136d492..c3e2667f4f9a5ea1192c6fbf711dab0dbbe7e20d 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -26,7 +26,6 @@ echo "done!" kctl apply -f manifests/exporters kctl apply -f manifests/grafana -kctl apply -f manifests/prometheus/prometheus-k8s-cm.yaml kctl apply -f manifests/prometheus/prometheus-k8s-rules.yaml kctl apply -f manifests/prometheus/prometheus-k8s-svc.yaml @@ -36,6 +35,7 @@ kctl apply -f manifests/alertmanager/alertmanager-service.yaml # `kubectl apply` is currently not working for third party resources so we are # using `kubectl create` here for the time being. # (https://github.com/kubernetes/kubernetes/issues/29542) +kctl create -f manifests/prometheus/prometheus-k8s-servicemonitor.yaml kctl create -f manifests/prometheus/prometheus-k8s.yaml kctl create -f manifests/alertmanager/alertmanager.yaml diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh index 6fb7b6e3801a7ea7169693fb0e952e825de1156c..aa38878b7791da5871de3ae134ce26f0d09c176a 100755 --- a/hack/scripts/generate-configmaps.sh +++ b/hack/scripts/generate-configmaps.sh @@ -1,8 +1,5 @@ #!/bin/bash -# Generate Prometheus configuration ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s --from-file=assets/prometheus/prometheus.yaml -oyaml > manifests/prometheus/prometheus-k8s-cm.yaml - # Generate Alert Rules ConfigMap kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml diff --git a/manifests/alertmanager/alertmanager-service.yaml 
b/manifests/alertmanager/alertmanager-service.yaml index 86599c3544bda5a2cb6732786a81e8930e6477ed..1608d14dce32a8d1e6074dddf701bedb05fec865 100644 --- a/manifests/alertmanager/alertmanager-service.yaml +++ b/manifests/alertmanager/alertmanager-service.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - alertmanager: alertmanager-main + alertmanager: main diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml index ce67f3bb30870f0b220547edf5827bae96eaad22..fbd2d4529a52490d690a9b7e2b4e95fc86fa7996 100644 --- a/manifests/alertmanager/alertmanager.yaml +++ b/manifests/alertmanager/alertmanager.yaml @@ -1,7 +1,7 @@ apiVersion: "monitoring.coreos.com/v1alpha1" kind: "Alertmanager" metadata: - name: "alertmanager-main" + name: "main" labels: alertmanager: "main" spec: diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 59decb14cd6312f953e7c42747ef7c46727b34ec..915ded6a3ff3c3e472d41cb6183cb2567d18937f 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -6,7 +6,7 @@ metadata: labels: prometheus: frontend spec: - version: v1.4.1 + version: v1.5.2 serviceMonitorSelector: matchLabels: tier: frontend diff --git a/manifests/exporters/kube-state-metrics-depl.yaml b/manifests/exporters/kube-state-metrics-depl.yaml index b044ba2eb293a65c50071dab0379ce0a67a502e4..6ef971cefa096af3bf0f5c2c30ba93c41f8a4566 100644 --- a/manifests/exporters/kube-state-metrics-depl.yaml +++ b/manifests/exporters/kube-state-metrics-depl.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: kube-state-metrics - image: gcr.io/google_containers/kube-state-metrics:v0.3.0 + image: gcr.io/google_containers/kube-state-metrics:v0.4.1 ports: - name: metrics containerPort: 8080 diff --git a/manifests/exporters/kube-state-metrics-svc.yaml b/manifests/exporters/kube-state-metrics-svc.yaml index 
8b68484b072e09d05f15b5b6b7520a0860a69415..607869e1a55cd6918a9daeffae3626b1e42e9284 100644 --- a/manifests/exporters/kube-state-metrics-svc.yaml +++ b/manifests/exporters/kube-state-metrics-svc.yaml @@ -3,10 +3,13 @@ kind: Service metadata: labels: app: kube-state-metrics + k8s-app: kube-state-metrics + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" name: kube-state-metrics spec: ports: - - name: metrics + - name: http-metrics port: 8080 targetPort: metrics protocol: TCP diff --git a/manifests/exporters/node-exporter-svc.yaml b/manifests/exporters/node-exporter-svc.yaml index f2d24a422375075f7d5ee3f5f6650ceb33f7d7b7..46b1a3fd4bb31b0e1b0167df047a657db6ec6d76 100644 --- a/manifests/exporters/node-exporter-svc.yaml +++ b/manifests/exporters/node-exporter-svc.yaml @@ -3,12 +3,13 @@ kind: Service metadata: labels: app: node-exporter + k8s-app: node-exporter name: node-exporter spec: type: ClusterIP clusterIP: None ports: - - name: metrics + - name: http-metrics port: 9100 protocol: TCP selector: diff --git a/manifests/k8s/minikube/kube-apiserver.yaml b/manifests/k8s/minikube/kube-apiserver.yaml deleted file mode 100644 index 2b35a4ec6c73b362d6240d115f392926f79c65bd..0000000000000000000000000000000000000000 --- a/manifests/k8s/minikube/kube-apiserver.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -spec: - type: ClusterIP - clusterIP: None - ports: - - name: https-metrics - port: 8443 - protocol: TCP ---- -apiVersion: v1 -kind: Endpoints -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -subsets: -- addresses: - - ip: 192.168.99.100 - ports: - - name: https-metrics - port: 8443 - protocol: TCP diff --git a/manifests/k8s/self-hosted/kube-apiserver.yaml b/manifests/k8s/self-hosted/kube-apiserver.yaml deleted file mode 100644 index 72b1c08fc3f475082a8149eb0c50c40a620636c7..0000000000000000000000000000000000000000 
--- a/manifests/k8s/self-hosted/kube-apiserver.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: kube-apiserver-prometheus-discovery - labels: - k8s-app: kubernetes -spec: - selector: - k8s-app: kube-apiserver - type: ClusterIP - clusterIP: None - ports: - - name: https-metrics - port: 443 - targetPort: 443 - protocol: TCP diff --git a/manifests/prometheus-operator.yaml b/manifests/prometheus-operator.yaml index bb1dab9ae2b5d4a519da3e468e1e9a51dd040b58..78bc11d7884b3b1cff9fb0e8e8ffacaee307f6f8 100644 --- a/manifests/prometheus-operator.yaml +++ b/manifests/prometheus-operator.yaml @@ -13,7 +13,10 @@ spec: spec: containers: - name: prometheus-operator - image: quay.io/coreos/prometheus-operator:v0.2.1 + image: quay.io/coreos/prometheus-operator:v0.6.0 + args: + - "--kubelet-object=kube-system/kubelet" + - "--config-reloader-image=quay.io/coreos/configmap-reload:latest" resources: requests: cpu: 100m diff --git a/manifests/prometheus/prometheus-k8s-cm.yaml b/manifests/prometheus/prometheus-k8s-cm.yaml deleted file mode 100644 index 5e4a9cd3f9de9d148fbc452f306e765e6ac72cb8..0000000000000000000000000000000000000000 --- a/manifests/prometheus/prometheus-k8s-cm.yaml +++ /dev/null @@ -1,92 +0,0 @@ -apiVersion: v1 -data: - prometheus.yaml: | - alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: alertmanager-main - source_labels: - - __meta_kubernetes_service_name - - action: keep - regex: monitoring - source_labels: - - __meta_kubernetes_namespace - - action: keep - regex: web - source_labels: - - __meta_kubernetes_endpoint_port_name - scheme: http - - global: - scrape_interval: 15s - evaluation_interval: 15s - - rule_files: - - /etc/prometheus/rules/*.rules - - scrape_configs: - - job_name: kubelets - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # Skip verification until we have resolved why the certificate validation - # 
for the kubelet on API server nodes fail. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - # Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, - # and node-exporter, which we all consider part of a default setup. - - job_name: standard-endpoints - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # As for kubelets, certificate validation fails for the API server (node) - # and we circumvent it for now. - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: prometheus|node-exporter|kube-state-metrics - - action: replace - source_labels: [__meta_kubernetes_service_name] - target_label: job - - # Scrapes the endpoint lists for the kube-dns server. Which we consider - # part of a default setup. 
- - job_name: kube-components - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - action: replace - source_labels: [__meta_kubernetes_service_label_k8s_app] - target_label: job - - action: keep - source_labels: [__meta_kubernetes_service_name] - regex: ".*-prometheus-discovery" - - action: keep - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "http-metrics.*|https-metrics.*" - - action: replace - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "https-metrics.*" - target_label: __scheme__ - replacement: https -kind: ConfigMap -metadata: - creationTimestamp: null - name: prometheus-k8s diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index 6e83500ea122092eaea6d8b5f2b276109b910083..08f6dddceaeb8e450c766a12f072e537397acf47 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -226,7 +226,7 @@ data: histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6 ALERT K8SNodeDown - IF up{job="kubelets"} == 0 + IF up{job="kubelet"} == 0 FOR 1h LABELS { service = "k8s", @@ -281,7 +281,7 @@ data: } ALERT K8SKubeletDown - IF absent(up{job="kubelets"}) or count by (cluster) (up{job="kubelets"} == 0) / count by (cluster) (up{job="kubelets"}) > 0.1 + IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1 FOR 1h LABELS { service = "k8s", @@ -378,7 +378,7 @@ data: } ALERT K8STooManyOpenFiles - IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50 FOR 10m LABELS { service = "k8s", @@ -390,7 +390,7 @@ data: } ALERT K8STooManyOpenFiles - IF 
100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80 + IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80 FOR 10m LABELS { service = "k8s", diff --git a/manifests/prometheus/prometheus-k8s-servicemonitor.yaml b/manifests/prometheus/prometheus-k8s-servicemonitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbad7e5fffad052299d4ac569c81cdb53008cdc4 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-servicemonitor.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-apps: https +spec: + jobLabel: provider + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-https + labels: + k8s-apps: https +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + endpoints: + - port: https-metrics + interval: 15s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ServiceMonitor +metadata: + name: k8s-apps-http + labels: + k8s-apps: http +spec: + jobLabel: k8s-app + selector: + matchExpressions: + - {key: k8s-app, operator: Exists} + namespaceSelector: + matchNames: + - kube-system + - monitoring + endpoints: + - port: http-metrics + interval: 15s + - port: http-metrics-dnsmasq + interval: 15s + - port: http-metrics-skydns + interval: 15s diff --git 
a/manifests/prometheus/prometheus-k8s-svc.yaml b/manifests/prometheus/prometheus-k8s-svc.yaml index d3d25d2b780612ce756a66825c92a7cdd82d5ca3..a558f30fa9a1c43bab80822b6cc0d8c3ee6cf014 100644 --- a/manifests/prometheus/prometheus-k8s-svc.yaml +++ b/manifests/prometheus/prometheus-k8s-svc.yaml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: web selector: - prometheus: prometheus-k8s + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 6ed1fe90a7e5639fa2dbe686009b7e21c81b0e43..a593f04100e5f762fb237bc4f1a7e1f9f58012cf 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -1,11 +1,14 @@ apiVersion: monitoring.coreos.com/v1alpha1 kind: Prometheus metadata: - name: prometheus-k8s + name: k8s labels: prometheus: k8s spec: - version: v1.4.1 + version: v1.5.2 + serviceMonitorSelector: + matchExpressions: + - {key: k8s-apps, operator: Exists} resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi @@ -13,3 +13,8 @@ spec: # production use. This value is mainly meant for demonstration/testing # purposes. memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web