diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b15f6ecfcc1420793f8c8d9978c3a6c746765e09..9c45355572fd35ffaf47f766c34a20e79aaf8a78 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -18,7 +18,7 @@ "subdir": "contrib/mixin" } }, - "version": "8f17652c6096757feaf68973161508730ba2fa57", + "version": "7572a61a39d4eaad596ab8d9364f7df9a84ff4a3", "sum": "cdKL5kPYfpWSpTCu4qctmh+gWQqL+4YWom6rw9qLYJU=" }, { @@ -38,7 +38,7 @@ "subdir": "grafana-builder" } }, - "version": "6639a82ac03f9f8e3f6672c5de48fd9e19578e82", + "version": "c4975f7c4a7ab4c21020c4afbf247aa49142174d", "sum": "y8uA/daOROErelzoo2p1rtqABhUPArg2alsfcb0PQBk=" }, { @@ -48,8 +48,8 @@ "subdir": "" } }, - "version": "0c1cd2882281028c47fb06310e919dc71565bf09", - "sum": "2mH6p/tB7MYvdT+wZP3t0oIp/wh58YxhiXUOQ2VmrGk=" + "version": "3978f13fe6b4aff8e2875a744a2327911c6d048d", + "sum": "YRRFew+XcoPGZYuINIrDWdTV7tKj8iaWs2eSnL+Exs0=" }, { "source": { @@ -58,7 +58,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "0c1cd2882281028c47fb06310e919dc71565bf09", + "version": "3978f13fe6b4aff8e2875a744a2327911c6d048d", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -68,7 +68,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "0c33f919a18ba6e8fd5d18ea9ebe9ae1a8435dce", + "version": "0c9580d09c6ac8b35630813afbcc0fe217ce1874", "sum": "U1wzIpTAtOvC1yj43Y8PfvT0JfvnAcMfNH12Wi+ab0Y=" }, { @@ -78,7 +78,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "0c33f919a18ba6e8fd5d18ea9ebe9ae1a8435dce", + "version": "0c9580d09c6ac8b35630813afbcc0fe217ce1874", "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" }, { @@ -88,7 +88,7 @@ "subdir": "jsonnet/mixin" } }, - "version": "987d70d3f1332cdf5b7c62bbffacf680b01c29ef", + "version": "17caf39e45a5bff3b3c11aa7d11a03a2a964176e", "sum": "qZ4WgiweaE6eeKtFK60QUjLO8sf2L9Q8fgafWvDcyfY=", "name": "prometheus-operator-mixin" }, @@ -99,8 +99,8 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "987d70d3f1332cdf5b7c62bbffacf680b01c29ef", - "sum": "Rfp48pJ3uRMK4/8keqzjZjlAa/Ztb1q/do44YYlHdSQ=" + "version": "17caf39e45a5bff3b3c11aa7d11a03a2a964176e", + "sum": "AYICIHiqYYszo3HJQWBfoCotUh9BLyqrVuBkWP7+rlc=" }, { "source": { @@ -130,7 +130,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "0eac720468cf1b4bfee5cf3a4587cb5a409c1607", + "version": "2a3d62ac845689f1cd949cd50b99f43ed090f442", "sum": "ZjQoYhvgKwJNkg+h+m9lW3SYjnjv5Yx5btEipLhru88=", "name": "prometheus" }, @@ -141,7 +141,7 @@ "subdir": "mixin" } }, - "version": "243526d6a7108af41ce76586832358c1a467362d", + "version": "19dcc7902d2431265154cefff82426fbc91448a3", "sum": "2mA8HiOBtNle+J81PGsXmxkVXGuVSRExZv9+xhnCeUs=", "name": "thanos-mixin" }, diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 7bb185fd3cf95a9dd86d6bef2844549ea462b6e2..ace2bab0e46bdf84b04413212599de9015902a61 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -18527,7 +18527,7 @@ items: "options": [ ], - "query": "label_values(kube_pod_info{job=\"kube-state-metrics\"}, cluster=\"$cluster\"}, namespace)", + "query": "label_values(kube_pod_info{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", "refresh": 2, "regex": "", "sort": 1, diff --git a/manifests/kubernetes-prometheusRule.yaml b/manifests/kubernetes-prometheusRule.yaml index 2e5461d5fd157cb1915323496947a3c9cc4f8e31..98f957e9f092139642eb615929ad474115e3028f 100644 --- a/manifests/kubernetes-prometheusRule.yaml +++ b/manifests/kubernetes-prometheusRule.yaml @@ -294,9 +294,9 @@ spec: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) / - sum(kube_node_status_allocatable{resource="cpu"}) + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5 for: 5m labels: @@ -307,9 +307,9 @@ spec: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) / - sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5 for: 5m labels: @@ -390,6 +390,8 @@ spec: kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m labels: severity: critical @@ -413,6 +415,8 @@ spec: predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1h labels: severity: warning @@ -516,8 +520,8 @@ spec: rules: - alert: KubeClientCertificateExpiration annotations: - description: A client certificate used to authenticate to the apiserver is - expiring in less than 7.0 days. + description: A client certificate used to authenticate to kubernetes apiserver + is expiring in less than 7.0 days. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | @@ -526,31 +530,31 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - description: A client certificate used to authenticate to the apiserver is - expiring in less than 24.0 hours. + description: A client certificate used to authenticate to kubernetes apiserver + is expiring in less than 24.0 hours. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical - - alert: AggregatedAPIErrors + - alert: KubeAggregatedAPIErrors annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} - has reported errors. It has appeared unavailable {{ $value | humanize }} - times averaged over the past 10m. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapierrors - summary: An aggregated API has reported errors. + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has reported errors. It has appeared unavailable {{ $value | humanize + }} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors + summary: Kubernetes aggregated API has reported errors. expr: | sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 labels: severity: warning - - alert: AggregatedAPIDown + - alert: KubeAggregatedAPIDown annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} - has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/aggregatedapidown - summary: An aggregated API is down. + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown + summary: Kubernetes aggregated API is down. expr: | (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 for: 5m @@ -568,11 +572,11 @@ spec: severity: critical - alert: KubeAPITerminatedRequests annotations: - description: The apiserver has terminated {{ $value | humanizePercentage }} - of its incoming requests. + description: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests - summary: The apiserver has terminated {{ $value | humanizePercentage }} of - its incoming requests. + summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. expr: | sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m diff --git a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml index 1e751385411a589633dc98f9eb737ba96266c42d..3ce52e501f4c643a32c0df162c92687bc3ae1034 100644 --- a/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml @@ -121,6 +121,88 @@ spec: type: array type: object type: array + muteTimeIntervals: + description: List of MuteTimeInterval specifying when the routes should + be muted. + items: + description: MuteTimeInterval specifies the periods in time when + notifications will be muted + properties: + name: + description: Name of the time interval + type: string + timeIntervals: + description: TimeIntervals is a list of TimeInterval + items: + description: TimeInterval describes intervals of time + properties: + daysOfMonth: + description: DaysOfMonth is a list of DayOfMonthRange + items: + description: DayOfMonthRange is an inclusive range of + days of the month beginning at 1 + properties: + end: + description: End of the inclusive range + maximum: 31 + minimum: -31 + type: integer + start: + description: Start of the inclusive range + maximum: 31 + minimum: -31 + type: integer + type: object + type: array + months: + description: Months is a list of MonthRange + items: + description: MonthRange is an inclusive range of months + of the year beginning in January Months can be specified + by name (e.g 'January') by numerical month (e.g '1') + or as an inclusive range (e.g 'January:March', '1:3', + '1:March') + pattern: ^((?i)january|febuary|march|april|may|june|july|august|september|october|november|december|[1-12])(?:((:((?i)january|febuary|march|april|may|june|july|august|september|october|november|december|[1-12]))$)|$) + type: string + type: array + times: + description: Times is a list of TimeRange + items: + description: TimeRange defines a start and end time + in 24hr format + properties: + endTime: + description: EndTime is the end time in 24hr format. + pattern: ^((([01][0-9])|(2[0-3])):[0-5][0-9])$|(^24:00$) + type: string + startTime: + description: StartTime is the start time in 24hr + format. + pattern: ^((([01][0-9])|(2[0-3])):[0-5][0-9])$|(^24:00$) + type: string + type: object + type: array + weekdays: + description: Weekdays is a list of WeekdayRange + items: + description: WeekdayRange is an inclusive range of days + of the week beginning on Sunday Days can be specified + by name (e.g 'Sunday') or as an inclusive range (e.g + 'Monday:Friday') + pattern: ^((?i)sun|mon|tues|wednes|thurs|fri|satur)day(?:((:(sun|mon|tues|wednes|thurs|fri|satur)day)$)|$) + type: string + type: array + years: + description: Years is a list of YearRange + items: + description: YearRange is an inclusive range of years + pattern: ^2\d{3}(?::2\d{3}|$) + type: string + type: array + type: object + type: array + type: object + type: array receivers: description: List of receivers. items: @@ -2683,6 +2765,19 @@ spec: - name type: object type: array + muteTimeIntervals: + description: 'Note: this comment applies to the field definition + above but appears below otherwise it gets included in the generated + manifest. CRD schema doesn''t support self referential types + for now (see https://github.com/kubernetes/kubernetes/issues/62872). + We have to use an alternative type to circumvent the limitation. + The downside is that the Kube API can''t validate the data beyond + the fact that it is a valid JSON representation. MuteTimeIntervals + is a list of MuteTimeInterval names that will mute this route + when matched,' + items: + type: string + type: array receiver: description: Name of the receiver for this route. If not empty, it should be listed in the `receivers` field. diff --git a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml index d71d09157f4bec39df4a5add1c66eb4ff1a50d41..f1dca49c697167cfaf93ee664289479140cdc46a 100644 --- a/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -4740,6 +4740,11 @@ spec: description: MinShards is the minimum number of shards, i.e. amount of concurrency. type: integer + retryOnRateLimit: + description: Retry upon receiving a 429 status code from + the remote-write storage. This is experimental feature + and might change in the future. + type: boolean type: object remoteTimeout: description: Timeout for requests to the remote write endpoint.