diff --git a/assets/prometheus/prometheus.yaml b/assets/prometheus/prometheus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea10a6081d9693a0b39b97dc6b985ef0e26ef583 --- /dev/null +++ b/assets/prometheus/prometheus.yaml @@ -0,0 +1,68 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: +- /etc/prometheus/rules/*.rules + +scrape_configs: +- job_name: kubelets + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Skip verification until we have resolved why the certificate validation + # for the kubelet on API server nodes fail. + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + +# Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, +# and node-exporter, which we all consider part of a default setup. +- job_name: standard-endpoints + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # As for kubelets, certificate validation fails for the API server (node) + # and we circumvent it for now. + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - action: keep + source_labels: [__meta_kubernetes_service_name] + regex: prometheus|kubernetes|node-exporter|kube-state-metrics|etcd-k8s + - action: replace + source_labels: [__meta_kubernetes_service_name] + target_label: job + - action: replace + source_labels: [__meta_kubernetes_service_name] + regex: kubernetes + target_label: __scheme__ + replacement: https + +# Scrapes the endpoint lists for the kube-dns server. Which we consider +# part of a default setup. +- job_name: kube-components + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - action: replace + source_labels: [__meta_kubernetes_service_name] + target_label: job + regex: "kube-(.*)-prometheus-discovery" + replacement: "kube-${1}" + - action: keep + source_labels: [__meta_kubernetes_service_name] + regex: "kube-(.*)-prometheus-discovery" + - action: keep + source_labels: [__meta_kubernetes_endpoint_port_name] + regex: "prometheus" diff --git a/assets/alerts/etcd2.rules b/assets/prometheus/rules/etcd2.rules similarity index 100% rename from assets/alerts/etcd2.rules rename to assets/prometheus/rules/etcd2.rules diff --git a/assets/alerts/kubernetes.rules b/assets/prometheus/rules/kubernetes.rules similarity index 97% rename from assets/alerts/kubernetes.rules rename to assets/prometheus/rules/kubernetes.rules index d6fcaf8aaaf8a0a148681b9f7e4d1cc9b3f0e74b..216c0ccde6f71b762b2488dd22fe563136bb87ce 100644 --- a/assets/alerts/kubernetes.rules +++ b/assets/prometheus/rules/kubernetes.rules @@ -1,3 +1,5 @@ +# NOTE: These rules were kindly contributed by the SoundCloud engineering team. + ### Container resources ### cluster_namespace_controller_pod_container:spec_memory_limit_bytes = @@ -249,7 +251,7 @@ ALERT K8SApiserverDown # Disable for non HA kubernetes setups. ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) FOR 5m LABELS { service = "k8s", @@ -361,7 +363,7 @@ ALERT K8STooManyOpenFiles ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) ) / 1e6 > 1.0 FOR 10m LABELS { diff --git a/hack/scripts/generate-configmaps.sh b/hack/scripts/generate-configmaps.sh index a178878cfd14545c9eb3f95e0353908251b3b4ff..50f3c9033ec05921b8475cc9517fd1926b76e8bd 100755 --- a/hack/scripts/generate-configmaps.sh +++ b/hack/scripts/generate-configmaps.sh @@ -1,7 +1,10 @@ #!/bin/bash +# Generate Prometheus configuration ConfigMap +kubectl create configmap --dry-run=true prometheus-k8s --from-file=assets/prometheus/prometheus.yaml -oyaml > manifests/prometheus/prometheus-k8s-cm.yaml + # Generate Alert Rules ConfigMap -kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/alerts/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml +kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml # Generate Dashboard ConfigMap kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-cm.yaml diff --git a/manifests/prometheus/prometheus-k8s-cm.yaml b/manifests/prometheus/prometheus-k8s-cm.yaml index 73389f517a46e2596ae168bb7752874a2dac4268..59c6389be89ecdc02e628aac0134f13e0500d2cf 100644 --- a/manifests/prometheus/prometheus-k8s-cm.yaml +++ b/manifests/prometheus/prometheus-k8s-cm.yaml @@ -1,18 +1,15 @@ apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-k8s data: prometheus.yaml: | global: - evaluation_interval: 30s + scrape_interval: 15s + evaluation_interval: 15s rule_files: - - /etc/prometheus/rules/*.rules + - /etc/prometheus/rules/*.rules scrape_configs: - job_name: kubelets - scrape_interval: 20s scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt @@ -27,7 +24,6 @@ data: # Scrapes the endpoint lists for the Kubernetes API server, kube-state-metrics, # and node-exporter, which we all consider part of a default setup. - job_name: standard-endpoints - scrape_interval: 20s tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # As for kubelets, certificate validation fails for the API server (node) @@ -41,7 +37,7 @@ data: relabel_configs: - action: keep source_labels: [__meta_kubernetes_service_name] - regex: kubernetes|node-exporter|kube-state-metrics|etcd-k8s + regex: prometheus|kubernetes|node-exporter|kube-state-metrics|etcd-k8s - action: replace source_labels: [__meta_kubernetes_service_name] target_label: job @@ -54,7 +50,6 @@ data: # Scrapes the endpoint lists for the kube-dns server. Which we consider # part of a default setup. - job_name: kube-components - scrape_interval: 20s tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token @@ -74,3 +69,7 @@ data: - action: keep source_labels: [__meta_kubernetes_endpoint_port_name] regex: "prometheus" +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-k8s diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml index db867d731fc424d5c1014088f163bcca605c5417..c01a7f3e4856ae4fcbf44d2f2fc7dd8589418b33 100644 --- a/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -53,6 +53,8 @@ data: \ summary = \"high fsync durations\",\n description = \"ectd instance {{ $labels.instance }} fync durations are high\",\n }\n" kubernetes.rules: |+ + # NOTE: These rules were kindly contributed by the SoundCloud engineering team. + ### Container resources ### cluster_namespace_controller_pod_container:spec_memory_limit_bytes = @@ -304,7 +306,7 @@ data: # Disable for non HA kubernetes setups. ALERT K8SApiserverDown - IF absent({job="kubernetes"}) or count by(cluster) (up{job="kubernetes"} == 1) < 2 + IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"})) FOR 5m LABELS { service = "k8s", @@ -416,7 +418,7 @@ data: ALERT K8SApiServerLatency IF histogram_quantile( 0.99, - sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST"}) + sum without (instance,node,resource) (apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH"}) ) / 1e6 > 1.0 FOR 10m LABELS {