From c3ea4675da5894b4f4859762a9f72db15809ec01 Mon Sep 17 00:00:00 2001 From: Simon Pasquier <spasquie@redhat.com> Date: Fri, 19 Jun 2020 10:40:30 +0200 Subject: [PATCH] Fix AlertmanagerConfigInconsistent alert Previously the alert would fire when the number of Alertmanager pods didn't match the number of replicas defined in the Alertmanager spec even though all the running pods had the same configuration hash. This type of issue is already covered by KubeStatefulSetUpdateNotRolledOut (and possibly KubePodNotReady), having AlertmanagerConfigInconsistent also active in this situation creates unnecessary noise. With this change, the alert expression only returns when Alertmanager pods have different configuration hash values irrespective of the number of pod replicas. The message annotation has also been enhanced to report the configuration hash for each pod. Signed-off-by: Simon Pasquier <spasquie@redhat.com> --- jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet index bda69d00..bcabf4d9 100644 --- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -7,10 +7,15 @@ { alert: 'AlertmanagerConfigInconsistent', annotations: { - message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', + message: ||| + The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. + {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} + Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" + {{ end }} + |||, }, expr: ||| - count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1 + count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1 ||| % $._config, 'for': '5m', labels: { -- GitLab