diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index e5e89252e85605180ede0401260ad0a0159a11e4..ca9ea0033078325dbf160ef17a7d322587188594 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -283,7 +283,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "apiserver_request:availability7d{verb=\"read\"}", + "expr": "apiserver_request:availability30d{verb=\"read\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -654,7 +654,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "apiserver_request:availability7d{verb=\"write\"}", + "expr": "apiserver_request:availability30d{verb=\"write\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -27084,10 +27084,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", + "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27175,10 +27175,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", + "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27279,10 +27279,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", + "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27387,7 +27387,7 @@ items: "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27478,7 +27478,7 @@ items: "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27569,7 +27569,7 @@ items: "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27660,7 +27660,7 @@ items: "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27764,7 +27764,7 @@ items: "expr": "prometheus_remote_storage_shard_capacity{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -27855,7 +27855,7 @@ items: "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -28050,7 +28050,7 @@ items: "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{consumer}}", "refId": "A" } ], @@ -28154,7 +28154,7 @@ items: "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -28245,7 +28245,7 @@ items: "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -28336,7 +28336,7 @@ items: "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -28427,7 +28427,7 @@ items: "expr": "rate(prometheus_remote_storage_enqueue_retries_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{cluster}}:{{instance}}-{{queue}}", + "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", "refId": "A" } ], @@ -28580,11 +28580,11 @@ items: "includeAll": true, "label": null, "multi": false, - "name": "queue", + "name": "url", "options": [ ], - "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, queue)", + "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, url)", "refresh": 2, "regex": "", "sort": 0, diff --git a/manifests/prometheus-operator-serviceMonitor.yaml b/manifests/prometheus-operator-serviceMonitor.yaml index 571f5e25564a1041d492b6d180166f77093d6034..21400f02a99abac304973600a03dbe69f96a64d7 100644 --- a/manifests/prometheus-operator-serviceMonitor.yaml +++ b/manifests/prometheus-operator-serviceMonitor.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator namespace: monitoring spec: @@ -19,4 +19,4 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 79a8e55790ffe63905c1afaa6b0d5d5ebe55c238..34b445e801944d9ed9055ec0d5ee272fadb9a857 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -389,12 +389,15 @@ spec: verb: write record: apiserver_request:availability30d - expr: | - sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30d])) + sum by (code, verb) (increase(apiserver_request_total{job="apiserver"}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - expr: | - sum by (code) (increase(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d @@ -1311,29 +1314,6 @@ spec: for: 5m labels: severity: warning - - alert: KubeAPILatencyHigh - annotations: - message: The API server has a 99th percentile latency of {{ $value }} seconds - for {{ $labels.verb }} {{ $labels.resource }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh - expr: | - cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4 - for: 10m - labels: - severity: critical - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{ $value | humanizePercentage - }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource - }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh - expr: | - sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb) - / - sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10 - for: 10m - labels: - severity: critical - alert: KubeAPIErrorsHigh annotations: message: API server is returning errors for {{ $value | humanizePercentage @@ -1451,7 +1431,7 @@ spec: on node {{ $labels.node }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60 + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: severity: warning @@ -1616,8 +1596,8 @@ spec: - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send - {{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue - }}{{ else }}{{ $labels.url }}{{ end }}. + {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ + $labels.url }} summary: Prometheus fails to send samples to remote storage. expr: | ( @@ -1637,8 +1617,8 @@ spec: - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write - is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue - }}{{ else }}{{ $labels.url }}{{ end }}. + is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url + }}. summary: Prometheus remote write is behind. expr: | # Without max_over_time, failed scrapes could create false negatives, see @@ -1655,8 +1635,9 @@ spec: - alert: PrometheusRemoteWriteDesiredShards annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write - desired shards calculation wants to run {{ $value }} shards, which is more - than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` + desired shards calculation wants to run {{ $value }} shards for queue {{ + $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ + printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. diff --git a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml index 8121bbc2643131e6c16305142f8cf70335b44292..5d630512a886e474ef5293e74f70cfb7f188f2d9 100644 --- a/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml @@ -612,6 +612,11 @@ spec: items: type: string type: array + alertQueryUrl: + description: The external Query URL the Thanos Ruler will set in the + 'Source' field of all alerts. Maps to the '--alert.query-url' CLI + arg. + type: string alertmanagersConfig: description: Define configuration for connecting to alertmanager. Only available with thanos v0.10.0 and higher. Maps to the `alertmanagers.config` diff --git a/manifests/setup/prometheus-operator-clusterRole.yaml b/manifests/setup/prometheus-operator-clusterRole.yaml index 054414f22befde4f88d514699d88dcb676395f52..f42e2fe3ef827b50cfef7aa8f62cdcad3303d090 100644 --- a/manifests/setup/prometheus-operator-clusterRole.yaml +++ b/manifests/setup/prometheus-operator-clusterRole.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator rules: - apiGroups: diff --git a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml index 6b634f2807591c387f544bcf31f558a22306bd8a..765d3b6f09939e5499f1c55b0ebd55a2d68c2bb0 100644 --- a/manifests/setup/prometheus-operator-clusterRoleBinding.yaml +++ b/manifests/setup/prometheus-operator-clusterRoleBinding.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/manifests/setup/prometheus-operator-deployment.yaml b/manifests/setup/prometheus-operator-deployment.yaml index 2aeec68203434374111d7060c5c7feefd2b8031d..e234a1d73cbd5f4c995c185deb7407c40919ddb0 100644 --- a/manifests/setup/prometheus-operator-deployment.yaml +++ b/manifests/setup/prometheus-operator-deployment.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator namespace: monitoring spec: @@ -18,15 +18,15 @@ spec: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 spec: containers: - args: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=jimmidyson/configmap-reload:v0.3.0 - - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.38.0 - image: quay.io/coreos/prometheus-operator:v0.38.0 + - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.38.1 + image: quay.io/coreos/prometheus-operator:v0.38.1 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/setup/prometheus-operator-service.yaml b/manifests/setup/prometheus-operator-service.yaml index fbb448d99f92d6dfb972bd449fa53d590aa9df96..1e9fd769fe32d7c6aff7b8280ba91674f6071f46 100644 --- a/manifests/setup/prometheus-operator-service.yaml +++ b/manifests/setup/prometheus-operator-service.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator namespace: monitoring spec: diff --git a/manifests/setup/prometheus-operator-serviceAccount.yaml b/manifests/setup/prometheus-operator-serviceAccount.yaml index d4e56d45e0732456019917516ba198713107eceb..bf622bf56189f7b951be18afa215fe0ceb8327bc 100644 --- a/manifests/setup/prometheus-operator-serviceAccount.yaml +++ b/manifests/setup/prometheus-operator-serviceAccount.yaml @@ -4,6 +4,6 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.38.0 + app.kubernetes.io/version: v0.38.1 name: prometheus-operator namespace: monitoring