diff --git a/README.md b/README.md index 74df0c205aeb0a34b4b98eaa058bd7dccb1aa6b4..a751a4a94aac271f6eb9038b3205d6e6eb6229f2 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ which manages Prometheus servers and their configuration in a cluster. With a si * A Prometheus configuration covering monitoring of all Kubernetes core components and exporters * A default set of alerting rules on the cluster component's health * A Grafana instance serving dashboards on cluster metrics +* A three-node highly available Alertmanager cluster Simply run: @@ -35,6 +36,7 @@ hack/cluster-monitoring/deploy After all pods are ready, you can reach: * Prometheus UI on node port `30900` +* Alertmanager UI on node port `30903` * Grafana on node port `30902` To tear it all down again, run: @@ -57,7 +59,9 @@ hack/example-service-monitoring/deploy ``` After all pods are ready you can reach the Prometheus server on node port `30100` and observe -how it monitors the service as specified. +how it monitors the service as specified. Same as before, this Prometheus server automatically +discovers the Alertmanager cluster deployed in the [Monitoring Kubernetes](#monitoring-kubernetes) +section.
Teardown: diff --git a/hack/cluster-monitoring/deploy b/hack/cluster-monitoring/deploy index 6da8cd62713455f71b5f7e33e0a00f5a64a118ca..a096747e64768a75d356762bb9b83f1a143fd629 100755 --- a/hack/cluster-monitoring/deploy +++ b/hack/cluster-monitoring/deploy @@ -19,3 +19,4 @@ until kctl get prometheus; do sleep 1; done kctl apply -f manifests/exporters kctl apply -f manifests/grafana kctl apply -f manifests/prometheus +kctl apply -f manifests/alertmanager diff --git a/hack/cluster-monitoring/teardown b/hack/cluster-monitoring/teardown index a6edb0ae20f327d3557e66ced48fca370217b12a..afa4ce1486e0153c95eac0ced8b41e71a60e2df2 100755 --- a/hack/cluster-monitoring/teardown +++ b/hack/cluster-monitoring/teardown @@ -11,6 +11,7 @@ kctl() { kctl delete -f manifests/exporters kctl delete -f manifests/grafana kctl delete -f manifests/prometheus +kctl delete -f manifests/alertmanager # Hack: wait a bit to let the controller delete the deployed Prometheus server. sleep 5 diff --git a/manifests/examples/example-app/prometheus-frontend.yaml b/manifests/examples/example-app/prometheus-frontend.yaml index 80fd9e0474dfab05e95e7ec51b637c91e5b24705..59decb14cd6312f953e7c42747ef7c46727b34ec 100644 --- a/manifests/examples/example-app/prometheus-frontend.yaml +++ b/manifests/examples/example-app/prometheus-frontend.yaml @@ -7,10 +7,9 @@ metadata: prometheus: frontend spec: version: v1.4.1 - serviceMonitors: - - selector: - matchLabels: - tier: frontend + serviceMonitorSelector: + matchLabels: + tier: frontend resources: requests: # 2Gi is default, but won't schedule if you don't have a node with >2Gi diff --git a/manifests/prometheus/prometheus-k8s-cm.yaml b/manifests/prometheus/prometheus-k8s-cm.yaml index a8846b92b800d00021634bc5d82fbcd11fac8af3..f6d61cdd0b7c9159aeef39caa89a333dc194390f 100644 --- a/manifests/prometheus/prometheus-k8s-cm.yaml +++ b/manifests/prometheus/prometheus-k8s-cm.yaml @@ -1,6 +1,25 @@ apiVersion: v1 data: prometheus.yaml: | + alerting: + alertmanagers: + - 
kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: alertmanager-main + source_labels: + - __meta_kubernetes_service_name + - action: keep + regex: monitoring + source_labels: + - __meta_kubernetes_namespace + - action: keep + regex: web + source_labels: + - __meta_kubernetes_endpoint_port_name + scheme: http + global: scrape_interval: 15s evaluation_interval: 15s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml index 602ff146552b255a209a943df30154658a4b1291..6ed1fe90a7e5639fa2dbe686009b7e21c81b0e43 100644 --- a/manifests/prometheus/prometheus-k8s.yaml +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -13,8 +13,3 @@ spec: # production use. This value is mainly meant for demonstration/testing # purposes. memory: 400Mi - alerting: - alertmanagers: - - namespace: monitoring - name: alertmanager-main - port: web