Skip to content
Snippets Groups Projects
Commit a8fe1720 authored by Simon Pasquier
Browse files

tests/e2e: add test to detect many-to-many errors

On several occasions, some queries have failed while Prometheus was
scraping metrics from 2 kube-state-metrics instances. This situation can
occur briefly while the kube-state-metrics instance is being rolled out.
It might also be more apparent when automatic sharding of
kube-state-metrics is enabled.

https://github.com/kubernetes-monitoring/kubernetes-mixin/pull/306
https://github.com/kubernetes-monitoring/kubernetes-mixin/pull/361



Signed-off-by: Simon Pasquier <spasquie@redhat.com>
parent 08b577c8
Branches
Tags
No related merge requests found
...@@ -3,29 +3,13 @@ module github.com/prometheus-operator/kube-prometheus ...@@ -3,29 +3,13 @@ module github.com/prometheus-operator/kube-prometheus
go 1.13 go 1.13
require ( require (
github.com/Jeffail/gabs v1.2.0 github.com/Jeffail/gabs v1.4.0
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d // indirect
github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c
github.com/campoy/embedmd v1.0.0 github.com/campoy/embedmd v1.0.0
github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f github.com/google/go-jsonnet v0.16.1-0.20200703153429-aaf50f5b655f
github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d // indirect
github.com/imdario/mergo v0.3.7 // indirect
github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 github.com/jsonnet-bundler/jsonnet-bundler v0.4.0
github.com/kr/pretty v0.2.0 // indirect github.com/pkg/errors v0.9.1
github.com/mattn/go-colorable v0.1.7 // indirect github.com/prometheus/client_golang v1.8.0
github.com/pkg/errors v0.8.1 k8s.io/apimachinery v0.19.3
github.com/prometheus/client_golang v1.5.1 k8s.io/client-go v0.19.3
github.com/spf13/pflag v1.0.3 // indirect
golang.org/x/crypto v0.0.0-20190411191339-88737f569e3a // indirect
golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a // indirect
golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae // indirect
golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db // indirect
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
k8s.io/api v0.0.0-20190313235455-40a48860b5ab // indirect
k8s.io/apimachinery v0.0.0-20190313205120-d7deff9243b1
k8s.io/client-go v11.0.0+incompatible
k8s.io/klog v0.0.0-20190306015804-8e90cee79f82 // indirect
k8s.io/utils v0.0.0-20190308190857-21c4ce38f2a7 // indirect
sigs.k8s.io/yaml v1.1.0 // indirect
) )
This diff is collapsed.
...@@ -15,13 +15,16 @@ ...@@ -15,13 +15,16 @@
package e2e package e2e
import ( import (
"context"
"log" "log"
"os" "os"
"strings" "strings"
"testing" "testing"
"time" "time"
"github.com/Jeffail/gabs"
"github.com/pkg/errors" "github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/clientcmd"
...@@ -121,7 +124,7 @@ func TestDroppedMetrics(t *testing.T) { ...@@ -121,7 +124,7 @@ func TestDroppedMetrics(t *testing.T) {
// query metadata for all metrics and their metadata // query metadata for all metrics and their metadata
md, err := promClient.metadata("{job=~\".+\"}") md, err := promClient.metadata("{job=~\".+\"}")
if err != nil { if err != nil {
log.Fatal(err) t.Fatal(err)
} }
for _, k := range md { for _, k := range md {
// check if the metric' help text contains Deprecated // check if the metric' help text contains Deprecated
...@@ -129,13 +132,12 @@ func TestDroppedMetrics(t *testing.T) { ...@@ -129,13 +132,12 @@ func TestDroppedMetrics(t *testing.T) {
// query prometheus for the Deprecated metric // query prometheus for the Deprecated metric
n, err := promClient.query(k.Metric) n, err := promClient.query(k.Metric)
if err != nil { if err != nil {
log.Fatal(err) t.Fatal(err)
} }
if n > 0 { if n > 0 {
t.Fatalf("deprecated metric with name: %s and help text: %s exists.", k.Metric, k.Help) t.Fatalf("deprecated metric with name: %s and help text: %s exists.", k.Metric, k.Help)
} }
} }
} }
} }
...@@ -143,7 +145,7 @@ func TestTargetsScheme(t *testing.T) { ...@@ -143,7 +145,7 @@ func TestTargetsScheme(t *testing.T) {
// query targets for all endpoints // query targets for all endpoints
tgs, err := promClient.targets() tgs, err := promClient.targets()
if err != nil { if err != nil {
log.Fatal(err) t.Fatal(err)
} }
// exclude jobs from checking for http endpoints // exclude jobs from checking for http endpoints
...@@ -158,7 +160,120 @@ func TestTargetsScheme(t *testing.T) { ...@@ -158,7 +160,120 @@ func TestTargetsScheme(t *testing.T) {
for _, k := range tgs.Active { for _, k := range tgs.Active {
job := k.Labels["job"] job := k.Labels["job"]
if k.DiscoveredLabels["__scheme__"] == "http" && !exclude[string(job)] { if k.DiscoveredLabels["__scheme__"] == "http" && !exclude[string(job)] {
log.Fatalf("target exposing metrics over HTTP instead of HTTPS: %+v", k) t.Fatalf("target exposing metrics over HTTP instead of HTTPS: %+v", k)
}
}
}
// TestFailedRuleEvaluations detects recording and alerting rules that may
// trigger "many-to-many" evaluation errors when multiple kube-state-metrics
// instances are running.
//
// The test scales the "kube-state-metrics" deployment in the "monitoring"
// namespace to 2 replicas, waits until the query
// `up{job="kube-state-metrics"} == 1` yields 2, and then polls the
// /api/v1/rules endpoint until every rule group reports a lastEvaluation that
// is not before the start of the polling and every rule reports a health of
// "ok". The deployment is scaled back to 1 replica when the test returns.
func TestFailedRuleEvaluations(t *testing.T) {
// Scale kube-state-metrics to 2 replicas.
kClient := promClient.kubeClient
scale, err := kClient.AppsV1().Deployments("monitoring").GetScale(context.Background(), "kube-state-metrics", metav1.GetOptions{})
if err != nil {
t.Fatal(err)
}
scale.Spec.Replicas = 2
scale, err = kClient.AppsV1().Deployments("monitoring").UpdateScale(context.Background(), "kube-state-metrics", scale, metav1.UpdateOptions{})
if err != nil {
t.Fatal(err)
}
// Rollback to 1 replica at the end of the test.
// The scale object is fetched again rather than reusing the one from above,
// and any failure during the rollback fails the test.
defer func() {
scale, err := kClient.AppsV1().Deployments("monitoring").GetScale(context.Background(), "kube-state-metrics", metav1.GetOptions{})
if err != nil {
t.Fatal(err)
}
scale.Spec.Replicas = 1
_, err = kClient.AppsV1().Deployments("monitoring").UpdateScale(context.Background(), "kube-state-metrics", scale, metav1.UpdateOptions{})
if err != nil {
t.Fatal(err)
}
}()
// Wait for the 2 replicas of kube-state-metrics to be successfully scraped.
// Polls every 5 seconds for up to 1 minute; a query error aborts the wait.
err = wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
// NOTE(review): n is presumably the number of series returned by the
// query; confirm against promClient.query.
n, err := promClient.query(`up{job="kube-state-metrics"} == 1`)
if err != nil {
return false, err
}
if n != 2 {
t.Logf("expecting 2 kube-state-metrics targets, got %d", n)
return false, nil
}
return true, nil
})
if err != nil {
t.Fatal(err)
}
// Wait for all rule groups to be evaluated at least once without error.
// now is the reference instant: a group whose lastEvaluation is before it is
// treated as not yet evaluated.
// NOTE(review): now is taken from the test process's clock, while
// lastEvaluation comes from the API response (presumably the Prometheus
// server's clock); confirm that clock skew is not a concern.
now := time.Now()
// Polls every 30 seconds for up to 5 minutes. Returning a non-nil error from
// the condition stops the polling and fails the test below.
err = wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
// The "type" parameter is sent with an empty value.
// NOTE(review): presumably this returns both alerting and recording rules;
// confirm against the Prometheus HTTP API.
rsp, err := promClient.apiRequest("/api/v1/rules", "type", "")
if err != nil {
return false, err
}
res, err := gabs.ParseJSON(rsp.Data)
if err != nil {
return false, err
}
groups, err := res.Path("groups").Children()
if err != nil {
return false, err
}
// An empty list of groups is treated as a hard failure, not a retry.
if len(groups) == 0 {
return false, errors.New("got 0 rule groups")
}
for _, group := range groups {
// NOTE(review): unchecked type assertion; this panics if "name" is
// missing or is not a string.
groupName := group.Path("name").Data().(string)
// NOTE(review): err was already checked after Children() above and is
// not reassigned before this point, so this branch looks unreachable.
if err != nil {
return false, err
}
// NOTE(review): unchecked type assertion; this panics if "lastEvaluation"
// is missing or is not a string.
lastEvalString := group.Path("lastEvaluation").Data().(string)
lastEval, err := time.Parse(time.RFC3339Nano, lastEvalString)
if err != nil {
return false, err
}
// Keep polling until this group has been evaluated after the reference
// instant; groups later in the list are not inspected on this attempt.
if lastEval.Before(now) {
t.Logf("%s not yet evaluated", groupName)
return false, nil
}
rules, err := group.Path("rules").Children()
if err != nil {
return false, err
}
// A group without rules is treated as a hard failure, not a retry.
if len(rules) == 0 {
return false, errors.Errorf("got 0 rules in group %s", groupName)
}
for _, rule := range rules {
// NOTE(review): unchecked type assertion; this panics if "health" is
// missing or is not a string.
health := rule.Path("health").Data().(string)
// Any health other than "ok" aborts the polling with an error.
if health != "ok" {
return false, errors.Errorf("error evaluating rule: %v", rule)
}
}
} }
return true, nil
})
if err != nil {
t.Fatal(err)
} }
} }
...@@ -16,6 +16,7 @@ package e2e ...@@ -16,6 +16,7 @@ package e2e
import ( import (
"bytes" "bytes"
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
...@@ -50,7 +51,7 @@ func (c *prometheusClient) apiRequest(endpoint string, selector string, query st ...@@ -50,7 +51,7 @@ func (c *prometheusClient) apiRequest(endpoint string, selector string, query st
Suffix(endpoint).Param(selector, query) Suffix(endpoint).Param(selector, query)
var data Response var data Response
b, err := req.DoRaw() b, err := req.DoRaw(context.Background())
if err != nil { if err != nil {
return data, err return data, err
} }
...@@ -78,7 +79,7 @@ func (c *prometheusClient) query(query string) (int, error) { ...@@ -78,7 +79,7 @@ func (c *prometheusClient) query(query string) (int, error) {
Name("prometheus-k8s-0:9090"). Name("prometheus-k8s-0:9090").
Suffix("/api/v1/query").Param("query", query) Suffix("/api/v1/query").Param("query", query)
b, err := req.DoRaw() b, err := req.DoRaw(context.Background())
if err != nil { if err != nil {
return 0, err return 0, err
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment