From c3ea4675da5894b4f4859762a9f72db15809ec01 Mon Sep 17 00:00:00 2001
From: Simon Pasquier <spasquie@redhat.com>
Date: Fri, 19 Jun 2020 10:40:30 +0200
Subject: [PATCH] Fix AlertmanagerConfigInconsistent alert

Previously the alert would fire when the number of Alertmanager pods
didn't match the number of replicas defined in the Alertmanager spec,
even though all the running pods had the same configuration hash. This
type of issue is already covered by KubeStatefulSetUpdateNotRolledOut
(and possibly KubePodNotReady), so having AlertmanagerConfigInconsistent
also active in this situation creates unnecessary noise.

With this change, the alert expression only returns a result when
Alertmanager pods have different configuration hash values, irrespective
of the number of pod replicas. The message annotation has also been
enhanced to report the configuration hash for each pod.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
---
 jsonnet/kube-prometheus/alerts/alertmanager.libsonnet | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet
index bda69d00..bcabf4d9 100644
--- a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet
+++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet
@@ -7,10 +7,15 @@
           {
             alert: 'AlertmanagerConfigInconsistent',
             annotations: {
-              message: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
+              message: |||
+                The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
+                {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
+                Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
+                {{ end }}
+              |||,
             },
             expr: |||
-              count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{%(prometheusOperatorSelector)s,controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
+              count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
             ||| % $._config,
             'for': '5m',
             labels: {
-- 
GitLab