# Sentinel meaning "the caller did not pass this argument".  A private object
# is used instead of None so that an explicit None is still forwarded to
# grafanalib exactly as before.
_UNSET = object()


def _merge_targets(targets, defaults):
    """Return new target dicts with *defaults* filled in.

    Keys present in a target take precedence over the same key in
    *defaults*.  Neither the targets nor *defaults* are modified.
    """
    return [{**defaults, **target} for target in targets]


def Dashboard(
    title, version, time, rows, graphTooltip=0, templating=None,
):
    """Build a ``core.Dashboard`` with fixed project-wide settings.

    ``refresh=None``, ``schemaVersion=14``, ``timezone='browser'``,
    ``editable=False`` and a single ``prometheus`` datasource input are
    always applied.  ``templating`` is forwarded only when it is not None,
    so grafanalib's own default is used otherwise.
    """
    optional_args = {}
    if templating is not None:
        optional_args['templating'] = templating
    return core.Dashboard(
        title=title, refresh=None, schemaVersion=14,
        version=version, time=time, timezone='browser', inputs=[
            {
                'name': 'prometheus',
                'label': 'prometheus',
                'description': '',
                'type': 'datasource',
                'pluginId': 'prometheus',
                'pluginName': 'Prometheus'
            },
        ], rows=rows, graphTooltip=graphTooltip, editable=False,
        **optional_args,
    )


def Row(
    panels, height=None, title='Dashboard Row', showTitle=False
):
    """Build a non-editable ``core.Row`` with ``titleSize='h6'``.

    Raises:
        TypeError: if *height* is neither None nor an int.  This used to be
            an ``assert``, which is skipped when Python runs with ``-O``.
    """
    if not isinstance(height, (type(None), int)):
        raise TypeError(
            'height must be an int or None, got %s' % type(height).__name__
        )
    return core.Row(
        panels=panels, height=height, title=title, showTitle=showTitle,
        titleSize='h6', editable=False,
    )


def SingleStat(
    title, id, targets, colorValue=False, gauge=_UNSET,
    valueFontSize='80%', thresholds=None, valueName='avg', valueMaps=None,
    rangeMaps=None, mappingTypes=None, mappingType=None, postfix=None,
    sparkline=_UNSET, prefixFontSize='50%', colors=_UNSET, span=None,
    format='none', transparent=None,
):
    """Build a non-editable ``core.SingleStat`` on the prometheus datasource.

    Every target dict is completed with ``intervalFactor=2``, ``refId='A'``
    and ``step=600`` unless the target sets those keys itself.

    When omitted, ``gauge`` defaults to ``Gauge(show=True)``, ``sparkline``
    to ``SparkLine()`` and ``colors`` to the green/orange/red triple below.
    These defaults are created per call: the previous signature built them
    once at definition time, so all panels shared the same objects.
    """
    if gauge is _UNSET:
        gauge = Gauge(show=True)
    if sparkline is _UNSET:
        sparkline = SparkLine()
    if colors is _UNSET:
        colors = [
            (50, 172, 45, 0.97),
            (237, 129, 40, 0.89),
            (245, 54, 54, 0.9),
        ]

    targets = _merge_targets(targets, {
        'intervalFactor': 2,
        'refId': 'A',
        'step': 600,
    })

    return core.SingleStat(
        title=title, id=id, colorValue=colorValue,
        dataSource='prometheus', gauge=gauge,
        valueFontSize=valueFontSize, thresholds=thresholds,
        valueName=valueName, valueMaps=valueMaps, rangeMaps=rangeMaps,
        mappingTypes=mappingTypes, targets=targets,
        mappingType=mappingType, format=format, colors=colors, span=span,
        postfix=postfix, sparkline=sparkline, prefixFontSize=prefixFontSize,
        hideTimeOverride=None, transparent=transparent, editable=False,
    )


def Graph(
    id, title, targets, dashLength=None, dashes=False, spaceLength=None,
    xAxis=None, yAxes=None, nullPointMode='connected',
):
    """Build a non-editable ``core.Graph`` on the prometheus datasource.

    Every target dict is completed with ``intervalFactor=2``,
    ``legendFormat=''``, ``refId='A'`` and ``step=600`` unless the target
    sets those keys itself.

    Raises:
        TypeError: if *yAxes* is not a ``YAxes`` instance.  The default of
            None is rejected too, so callers must always pass ``yAxes``.
            This used to be an ``assert``, which is skipped under ``-O``.
    """
    targets = _merge_targets(targets, {
        'intervalFactor': 2,
        'legendFormat': '',
        'refId': 'A',
        'step': 600,
    })
    if not isinstance(yAxes, YAxes):
        raise TypeError(
            'yAxes must be a YAxes instance, got %s' % type(yAxes).__name__
        )
    return core.Graph(
        id=id, title=title, dashLength=dashLength, dashes=dashes,
        spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes,
        dataSource='prometheus', nullPointMode=nullPointMode, editable=False,
    )


def YAxis(format='none', label='', min=0, show=True):
    """Build a ``core.YAxis``, defaulting to format 'none' and min 0."""
    return core.YAxis(
        format=format, label=label, min=min, show=show
    )
# PromQL label selectors reused by the panels of the "Deployment" dashboard.
_POD_SELECTOR = (
    '{namespace="$deployment_namespace",pod_name=~"$deployment_name.*"}'
)
_DEPLOYMENT_SELECTOR = (
    '{deployment="$deployment_name",namespace="$deployment_namespace"}'
)

# (aggregation, metric, legend, refId) for each series of the Replicas graph.
_REPLICA_SERIES = (
    ('max', 'kube_deployment_status_replicas', 'current replicas', 'A'),
    ('min', 'kube_deployment_status_replicas_available', 'available', 'B'),
    ('max', 'kube_deployment_status_replicas_unavailable', 'unavailable',
     'C'),
    ('min', 'kube_deployment_status_replicas_updated', 'updated', 'D'),
    ('max', 'kube_deployment_spec_replicas', 'desired', 'E'),
)


def _deployment_expr(aggregation, metric):
    """Return the per-deployment query for *metric* under *aggregation*."""
    return (
        aggregation + '(' + metric + _DEPLOYMENT_SELECTOR
        + ') without (instance, pod)'
    )


def _query_variable(label, name, query, allValue, tagValuesQuery, tagsQuery):
    """Return one templating variable dict of type 'query'."""
    return {
        'allValue': allValue,
        'current': {},
        'datasource': 'prometheus',
        'hide': 0,
        'includeAll': False,
        'label': label,
        'multi': False,
        'name': name,
        'options': [],
        'query': query,
        'refresh': 1,
        'regex': '',
        'sort': 0,
        'tagValuesQuery': tagValuesQuery,
        'tags': [],
        'tagsQuery': tagsQuery,
        'type': 'query',
        'useTags': False,
    }


def _blue_sparkline():
    """Return the visible sparkline used by the resource-usage panels."""
    return SparkLine(
        fillColor=(31, 118, 189, 0.18),
        lineColor=(31, 120, 193),
        show=True,
    )


def _stat(**overrides):
    """Build a SingleStat with the settings every panel here shares.

    Fresh lists are created on each call so no two panels share one object.
    """
    return SingleStat(
        mappingType=1,
        mappingTypes=[
            {'name': 'value to text', 'value': 1},
            {'name': 'range to text', 'value': 2},
        ],
        valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
        rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
        colors=[
            (245, 54, 54, 0.9),
            (237, 129, 40, 0.89),
            (50, 172, 45, 0.97),
        ],
        **overrides
    )


# Row 1: CPU / memory / network usage of the selected deployment's pods.
_usage_row = Row(
    height=200,
    panels=[
        _stat(
            title='CPU',
            id=8,
            gauge=Gauge(show=False),
            postfix='cores',
            span=4,
            valueFontSize='110%',
            sparkline=_blue_sparkline(),
            targets=[{
                'expr': 'sum(rate(container_cpu_usage_seconds_total'
                        + _POD_SELECTOR + '[3m]))',
            }],
        ),
        _stat(
            title='Memory',
            id=9,
            gauge=Gauge(show=False),
            postfix='GB',
            prefixFontSize='80%',
            span=4,
            valueFontSize='110%',
            sparkline=_blue_sparkline(),
            targets=[{
                'expr': 'sum(container_memory_usage_bytes'
                        + _POD_SELECTOR + ') / 1024^3',
                'intervalFactor': 2,
                'refId': 'A',
                'step': 600,
            }],
        ),
        _stat(
            title='Network',
            id=7,
            format='Bps',
            gauge=Gauge(thresholdMarkers=False),
            postfix='',
            span=4,
            sparkline=_blue_sparkline(),
            targets=[{
                'expr': 'sum(rate(container_network_transmit_bytes_total'
                        + _POD_SELECTOR + '[3m])) + '
                        'sum(rate(container_network_receive_bytes_total'
                        + _POD_SELECTOR + '[3m]))',
            }],
        ),
    ],
)

# Row 2: replica and generation counters of the selected deployment.
_counter_row = Row(
    height=100,
    panels=[
        _stat(
            title='Desired Replicas',
            id=5,
            span=3,
            gauge=Gauge(thresholdMarkers=False, show=False),
            targets=[{
                'metric': 'kube_deployment_spec_replicas',
                'expr': _deployment_expr(
                    'max', 'kube_deployment_spec_replicas'),
            }],
        ),
        _stat(
            title='Available Replicas',
            id=6,
            span=3,
            gauge=Gauge(show=False),
            sparkline=SparkLine(),
            targets=[{
                'expr': _deployment_expr(
                    'min', 'kube_deployment_status_replicas_available'),
            }],
        ),
        _stat(
            title='Observed Generation',
            id=3,
            span=3,
            gauge=Gauge(),
            sparkline=SparkLine(),
            targets=[{
                'expr': _deployment_expr(
                    'max', 'kube_deployment_status_observed_generation'),
            }],
        ),
        _stat(
            title='Metadata Generation',
            id=2,
            span=3,
            gauge=Gauge(show=False),
            sparkline=SparkLine(),
            targets=[{
                'expr': _deployment_expr(
                    'max', 'kube_deployment_metadata_generation'),
            }],
        ),
    ],
)

# Row 3: replica counts over time.
_replica_graph_row = Row(
    height=350,
    panels=[
        Graph(
            id=1,
            title='Replicas',
            dashLength=10,
            dashes=False,
            spaceLength=10,
            targets=[
                {
                    'expr': _deployment_expr(aggregation, metric),
                    'legendFormat': legend,
                    'refId': ref_id,
                    'step': 30,
                }
                for aggregation, metric, legend, ref_id in _REPLICA_SERIES
            ],
            xAxis=XAxis(mode='time'),
            yAxes=YAxes(
                YAxis(min=None),
                YAxis(format='short', min=None, show=False),
            ),
        ),
    ],
)

dashboard = Dashboard(
    title='Deployment',
    version=1,
    graphTooltip=1,
    time=Time(start='now-6h'),
    templating=Templating(list=[
        _query_variable(
            label='Namespace',
            name='deployment_namespace',
            query='label_values(kube_deployment_metadata_generation, '
                  'namespace)',
            allValue='.*',
            tagValuesQuery=None,
            tagsQuery='',
        ),
        _query_variable(
            label='Deployment',
            name='deployment_name',
            query='label_values(kube_deployment_metadata_generation'
                  '{namespace="$deployment_namespace"}, deployment)',
            allValue=None,
            tagValuesQuery='',
            tagsQuery='deployment',
        ),
    ]),
    rows=[_usage_row, _counter_row, _replica_graph_row],
)
def _row(height, panels):
    """Build one untitled, non-editable row of the given pixel height."""
    return Row(
        height=height, title='New Row', showTitle=False, editable=False,
        titleSize='h6', panels=panels,
    )


def _graph(**overrides):
    """Build a Graph with the line settings shared by most graphs here."""
    return Graph(
        dataSource='prometheus',
        dashLength=10,
        dashes=False,
        isNew=False,
        editable=False,
        spaceLength=10,
        **overrides
    )


def _stat(**overrides):
    """Build a span-3 SingleStat that shows 'N/A' for null values."""
    return SingleStat(
        dataSource='prometheus',
        editable=False,
        span=3,
        valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
        rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
        **overrides
    )


# Row 1: CPU idle time and load averages.
_cpu_row = _row(250, [
    _graph(
        title='Idle CPU',
        id=3,
        span=6,
        tooltip=Tooltip(msResolution=False),
        yAxes=YAxes(
            YAxis(format='percent', label='cpu usage'),
            YAxis(format='short', min=None),
        ),
        targets=[{
            'expr': 'sum(rate(node_cpu{mode="idle"}[2m])) * 100',
            'hide': False,
            'intervalFactor': 10,
            'legendFormat': '',
            'refId': 'A',
            'step': 50,
        }],
    ),
    _graph(
        title='System Load',
        id=9,
        span=6,
        tooltip=Tooltip(msResolution=False),
        yAxes=YAxes(
            YAxis(format='percentunit', min=None),
            YAxis(format='short', min=None),
        ),
        targets=[
            {
                'expr': 'sum(node_load' + window + ')',
                'intervalFactor': 4,
                'legendFormat': 'load ' + window + 'm',
                'refId': ref_id,
                'step': 20,
                'target': '',
            }
            for window, ref_id in (('1', 'A'), ('5', 'B'), ('15', 'C'))
        ],
    ),
])

# Row 2: memory breakdown graph and overall memory usage gauge.
_memory_targets = [{
    'expr': 'sum(node_memory_MemTotal) - sum(node_memory_MemFree) - '
            'sum(node_memory_Buffers) - sum(node_memory_Cached)',
    'intervalFactor': 2,
    'legendFormat': 'memory usage',
    'metric': 'memo',
    'refId': 'A',
    'step': 10,
    'target': '',
}]
_memory_targets.extend(
    {
        'expr': 'sum(node_memory_' + field + ')',
        'interval': '',
        'intervalFactor': 2,
        'legendFormat': legend,
        'metric': 'memo',
        'refId': ref_id,
        'step': 10,
        'target': '',
    }
    for field, legend, ref_id in (
        ('Buffers', 'memory buffers', 'B'),
        ('Cached', 'memory cached', 'C'),
        ('MemFree', 'memory free', 'D'),
    )
)

_memory_row = _row(250, [
    _graph(
        title='Memory Usage',
        id=4,
        span=9,
        stack=True,
        seriesOverrides=[{
            'alias': 'node_memory_SwapFree{instance='
                     '"172.17.0.1:9100",job="prometheus"}',
            'yaxis': 2,
        }],
        tooltip=Tooltip(msResolution=False, valueType='individual'),
        yAxes=YAxes(
            YAxis(format='bytes', min='0'),
            YAxis(format='short', min=None),
        ),
        targets=_memory_targets,
    ),
    _stat(
        title='Memory Usage',
        id=5,
        format='percent',
        gauge=Gauge(show=True),
        thresholds='80, 90',
        targets=[{
            'expr': '((sum(node_memory_MemTotal) - sum(node_memory_MemFree)'
                    ' - sum(node_memory_Buffers) - sum(node_memory_Cached))'
                    ' / sum(node_memory_MemTotal)) * 100',
            'intervalFactor': 2,
            'metric': '',
            'refId': 'A',
            'step': 60,
            'target': '',
        }],
    ),
])

# Row 3: disk throughput graph and filesystem usage gauge.
_disk_row = _row(246, [
    _graph(
        title='Disk I/O',
        id=6,
        span=9,
        tooltip=Tooltip(msResolution=False),
        seriesOverrides=[
            {'alias': 'read', 'yaxis': 1},
            {'alias': '{instance="172.17.0.1:9100"}', 'yaxis': 2},
            {'alias': 'io time', 'yaxis': 2},
        ],
        yAxes=YAxes(
            YAxis(format='bytes', min=None),
            YAxis(format='ms', min=None),
        ),
        targets=[
            {
                'expr': 'sum(rate(node_disk_bytes_read[5m]))',
                'hide': False,
                'intervalFactor': 4,
                'legendFormat': 'read',
                'refId': 'A',
                'step': 20,
                'target': '',
            },
            {
                'expr': 'sum(rate(node_disk_bytes_written[5m]))',
                'intervalFactor': 4,
                'legendFormat': 'written',
                'refId': 'B',
                'step': 20,
            },
            {
                'expr': 'sum(rate(node_disk_io_time_ms[5m]))',
                'intervalFactor': 4,
                'legendFormat': 'io time',
                'refId': 'C',
                'step': 20,
            },
        ],
    ),
    _stat(
        title='Disk Space Usage',
        id=12,
        format='percentunit',
        valueName='current',
        gauge=Gauge(maxValue=1, show=True),
        thresholds='0.75, 0.9',
        targets=[{
            'expr': '(sum(node_filesystem_size{device!="rootfs"}) - '
                    'sum(node_filesystem_free{device!="rootfs"})) / '
                    'sum(node_filesystem_size{device!="rootfs"})',
            'intervalFactor': 2,
            'refId': 'A',
            'step': 60,
            'target': '',
        }],
    ),
])


def _network_graph(title, panel_id, metric, ref_id):
    """Build one of the two network throughput graphs."""
    return _graph(
        title=title,
        id=panel_id,
        span=6,
        tooltip=Tooltip(msResolution=False),
        seriesOverrides=[{'alias': 'transmitted', 'yaxis': 2}],
        yAxes=YAxes(
            YAxis(format='bytes', min=None),
            YAxis(format='bytes', min=None),
        ),
        targets=[{
            'expr': 'sum(rate(' + metric + '{device!~"lo"}[5m]))',
            'hide': False,
            'intervalFactor': 2,
            'legendFormat': '',
            'refId': ref_id,
            'step': 10,
            'target': '',
        }],
    )


# Row 4: network bytes received and transmitted.
_network_row = _row(250, [
    _network_graph(
        'Network Received', 8, 'node_network_receive_bytes', 'A'),
    _network_graph(
        'Network Transmitted', 10, 'node_network_transmit_bytes', 'B'),
])

# Row 5: pod count against capacity.  This graph sets fewer options than the
# others (no dashLength / isNew, spaceLength 11), so it bypasses _graph().
_pod_row = _row(276, [
    Graph(
        title='Cluster Pod Utilization',
        dataSource='prometheus',
        id=11,
        span=9,
        dashes=False,
        editable=False,
        spaceLength=11,
        tooltip=Tooltip(msResolution=False, valueType='individual'),
        yAxes=YAxes(
            YAxis(format='short', min=None),
            YAxis(format='short', min=None),
        ),
        targets=[
            {
                'expr': 'sum(kube_pod_info)',
                'format': 'time_series',
                'intervalFactor': 2,
                'legendFormat': 'Current number of Pods',
                'refId': 'A',
                'step': 10,
            },
            {
                'expr': 'sum(kube_node_status_capacity_pods)',
                'format': 'time_series',
                'intervalFactor': 2,
                'legendFormat': 'Maximum capacity of pods',
                'refId': 'B',
                'step': 10,
            },
        ],
    ),
    _stat(
        title='Pod Utilization',
        id=7,
        format='percent',
        gauge=Gauge(show=True),
        thresholds='80, 90',
        valueName='current',
        targets=[{
            'expr': '100 - (sum(kube_node_status_capacity_pods) - '
                    'sum(kube_pod_info)) / '
                    'sum(kube_node_status_capacity_pods) * 100',
            'format': 'time_series',
            'intervalFactor': 2,
            'legendFormat': '',
            'refId': 'A',
            'step': 60,
            'target': '',
        }],
    ),
])

dashboard = Dashboard(
    title='Kubernetes Capacity Planning',
    version=4,
    gnetId=22,
    graphTooltip=0,
    refresh=False,
    editable=False,
    schemaVersion=14,
    time=Time(start='now-1h'),
    timezone='browser',
    inputs=[{
        'name': 'prometheus',
        'label': 'prometheus',
        'description': '',
        'type': 'datasource',
        'pluginId': 'prometheus',
        'pluginName': 'Prometheus',
    }],
    rows=[_cpu_row, _memory_row, _disk_row, _network_row, _pod_row],
)
def _null_shown_as(text):
    """Return a value map list that renders the 'null' value as *text*."""
    return [{'op': '=', 'text': text, 'value': 'null'}]


def _health_stat(title, panel_id, expr, thresholds, value_maps, **extra):
    """Build one span-3, value-coloured SingleStat for this dashboard.

    All panels share the datasource, a default Gauge, the 'N/A' range map,
    the two mapping types and a single time-series target; only the
    arguments of this function differ from panel to panel.
    """
    return SingleStat(
        title=title,
        id=panel_id,
        dataSource='prometheus',
        gauge=Gauge(),
        colorValue=True,
        editable=False,
        span=3,
        thresholds=thresholds,
        rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
        valueMaps=value_maps,
        mappingTypes=[
            {'name': 'value to text', 'value': 1},
            {'name': 'range to text', 'value': 2},
        ],
        targets=[{
            'expr': expr,
            'format': 'time_series',
            'intervalFactor': 2,
            'legendFormat': '',
            'refId': 'A',
            'step': 600,
        }],
        **extra
    )


def _health_row(height, panels):
    """Build one untitled, non-editable row of the given pixel height."""
    return Row(
        height=height, title='Row', showTitle=False, editable=False,
        titleSize='h6', panels=panels,
    )


# Row 1: control plane, alert and crash-loop counters.  The first panel is
# the only one that keeps the default valueName and has two value maps.
_control_row = _health_row(254, [
    _health_stat(
        'Control Plane Components Down', 1,
        'sum(up{job=~"apiserver|kube-scheduler|'
        'kube-controller-manager"} == 0)',
        '1, 3',
        _null_shown_as('Everything UP and healthy')
        + [{'op': '=', 'text': '', 'value': ''}],
    ),
    _health_stat(
        'Alerts Firing', 2,
        'sum(ALERTS{alertstate="firing",alertname!="DeadMansSwitch"})',
        '1, 3',
        _null_shown_as('0'),
        valueName='current',
    ),
    _health_stat(
        'Alerts Pending', 3,
        'sum(ALERTS{alertstate="pending",alertname!="DeadMansSwitch"})',
        '3, 5',
        _null_shown_as('0'),
        valueName='current',
    ),
    _health_stat(
        'Crashlooping Pods', 4,
        'count(increase(kube_pod_container_status_restarts[1h]) > 5)',
        '1, 3',
        _null_shown_as('0'),
        valueName='current',
    ),
])

# Row 2: node condition counters.
_node_row = _health_row(250, [
    _health_stat(
        'Node Not Ready', 5,
        'sum(kube_node_status_condition{condition="Ready",status!="true"})',
        '1, 3',
        _null_shown_as('N/A'),
        valueName='current',
    ),
    _health_stat(
        'Node Disk Pressure', 6,
        'sum(kube_node_status_condition'
        '{condition="DiskPressure",status="true"})',
        '1, 3',
        _null_shown_as('N/A'),
        valueName='current',
    ),
    _health_stat(
        'Node Memory Pressure', 7,
        'sum(kube_node_status_condition'
        '{condition="MemoryPressure",status="true"})',
        '1, 3',
        _null_shown_as('N/A'),
        valueName='current',
    ),
    _health_stat(
        'Nodes Unschedulable', 8,
        'sum(kube_node_spec_unschedulable)',
        '1, 3',
        _null_shown_as('N/A'),
        valueName='current',
    ),
])

dashboard = Dashboard(
    title='Kubernetes Cluster Health',
    version=9,
    graphTooltip=0,
    schemaVersion=14,
    editable=False,
    time=Time(start='now-6h'),
    timezone='browser',
    inputs=[{
        'name': 'prometheus',
        'label': 'prometheus',
        'description': '',
        'type': 'datasource',
        'pluginId': 'prometheus',
        'pluginName': 'Prometheus',
    }],
    rows=[_control_row, _node_row],
)
def _null_shown_as(text):
    """Return a value map list that renders the 'null' value as *text*."""
    return [{'op': '=', 'text': text, 'value': 'null'}]


def _red_to_green():
    """Return a fresh red/orange/green colour list."""
    return [
        (245, 54, 54, 0.9),
        (237, 129, 40, 0.89),
        (50, 172, 45, 0.97),
    ]


def _stat(title, panel_id, expr, null_text, **extra):
    """Build a SingleStat with the mappings shared by every panel here.

    Each panel has mappingType 1, both mapping types, the 'N/A' range map
    and one time-series target; *null_text* is what a null value shows as.
    """
    return SingleStat(
        title=title,
        id=panel_id,
        mappingType=1,
        mappingTypes=[
            {'name': 'value to text', 'value': 1},
            {'name': 'range to text', 'value': 2},
        ],
        rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
        valueMaps=_null_shown_as(null_text),
        targets=[{'expr': expr, 'format': 'time_series'}],
        **extra
    )


def _up_percentage(title, panel_id, job):
    """Build the panel showing what percentage of *job* targets are up."""
    selector = 'up{job="' + job + '"}'
    return _stat(
        title, panel_id,
        '(sum(' + selector + ' == 1) / count(' + selector + ')) * 100',
        'N/A',
        format='percent',
        colors=_red_to_green(),
        thresholds='50, 80',
        span=3,
        valueName='current',
    )


def _utilization(title, panel_id, expr, **extra):
    """Build one span-3 percent panel of the Capacity Planning row."""
    return _stat(
        title, panel_id, expr, 'N/A',
        format='percent',
        span=3,
        thresholds='80, 90',
        **extra
    )


_health_row = Row(
    height=129, title='Cluster Health', showTitle=True,
    panels=[
        _stat(
            'Control Plane UP', 5,
            'sum(up{job=~"apiserver|kube-scheduler|'
            'kube-controller-manager"} == 0)',
            'UP',
            gauge=Gauge(show=False),
            colorValue=True,
            thresholds='1, 3',
            valueName='total',
        ),
        _stat(
            'Alerts Firing', 6,
            'sum(ALERTS{alertstate="firing",alertname!="DeadMansSwitch"})',
            '0',
            gauge=Gauge(show=False),
            colorValue=True,
            thresholds='3, 5',
            valueName='current',
        ),
    ],
)

_control_plane_row = Row(
    height=168, title='Control Plane Status', showTitle=True,
    panels=[
        _up_percentage('API Servers UP', 1, 'apiserver'),
        _up_percentage('Controller Managers UP', 2,
                       'kube-controller-manager'),
        _up_percentage('Schedulers UP', 3, 'kube-scheduler'),
        _stat(
            'Crashlooping Control Plane Pods', 4,
            'count(increase(kube_pod_container_status_restarts'
            '{namespace=~"kube-system|tectonic-system"}[1h]) > 5)',
            '0',
            colorValue=True,
            gauge=Gauge(show=False),
            span=3,
            thresholds='1, 3',
            valueName='current',
        ),
    ],
)

_capacity_row = Row(
    height=158, title='Capacity Planning', showTitle=True,
    panels=[
        _utilization(
            'CPU Utilization', 8,
            'sum(100 - (avg by (instance) (rate('
            'node_cpu{job="node-exporter",mode="idle"}[5m])) * 100)) / '
            'count(node_cpu{job="node-exporter",mode="idle"})',
        ),
        _utilization(
            'Memory Utilization', 7,
            '((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - '
            'sum(node_memory_Buffers) - sum(node_memory_Cached)) / '
            'sum(node_memory_MemTotal)) * 100',
        ),
        _utilization(
            'Filesystem Utilization', 9,
            '(sum(node_filesystem_size{device!="rootfs"}) - '
            'sum(node_filesystem_free{device!="rootfs"})) / '
            'sum(node_filesystem_size{device!="rootfs"})',
        ),
        _utilization(
            'Pod Utilization', 10,
            '100 - (sum(kube_node_status_capacity_pods) - '
            'sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) '
            '* 100',
            gauge=Gauge(show=True),
        ),
    ],
)

dashboard = Dashboard(
    title='Kubernetes Cluster Status',
    version=3,
    time=Time(start='now-6h'),
    rows=[_health_row, _control_plane_row, _capacity_row],
)
Managers UP', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=2, - span=3, - thresholds='50, 80', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': '(sum(up{job=\"kube-controller-manager\"}' - ' == 1) / sum(up{job=\"kube-controller-manager\"' - '})) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - } - ] - ), - SingleStat( - title='Schedulers UP', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=3, - span=3, - thresholds='50, 80', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': '(sum(up{job=\"kube-scheduler\"} == 1) ' - '/ sum(up{job=\"kube-scheduler\"})) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - } - ] - ), - SingleStat( - title='API Server Request Error Rate', - dataSource='prometheus', - format='percent', - editable=False, - gauge=Gauge( - show=True, - ), - id=4, - span=3, - thresholds='5, 10', - valueMaps=[ - { - 'op': '=', - 'text': '0', - 'value': 'null', - } - ], - rangeMaps=([ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ]), - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - } - ], - targets=[ - { - 'expr': 'max(sum 
by(instance) (rate(' - 'apiserver_request_count{code=~"5.."}[5m])) / ' - 'sum by(instance) (rate(apiserver_request_count' - '[5m]))) * 100', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 600, - }, - ] - ), - ], - ), - Row( - title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, - panels=[ - Graph( - title='API Server Request Latency', - id=7, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - lineWidth=1, - nullPointMode='null', - tooltip=Tooltip( - msResolution=False, valueType='individual', - ), - spaceLength=10, - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by(verb) (rate(apiserver_latency_' - 'seconds:quantile[5m]) >= 0)', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 30, - } - ], - ), - ], - ), - Row( - title='Dashboard Row', showTitle=False, titleSize='h6', editable=False, - panels=[ - Graph( - title='End to End Scheduling Latency', - id=5, - dataSource='prometheus', - isNew=False, - editable=False, - dashLength=10, - lineWidth=1, - nullPointMode="null", - spaceLength=10, - span=6, - dashes=False, - tooltip=Tooltip( - msResolution=False, - valueType='individual', - ), - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='dtdurations', min=None), - ), - targets=[ - { - 'expr': 'cluster:scheduler_e2e_scheduling_' - 'latency_seconds:quantile', - 'format': 'time_series', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - } - ], - ), - Graph( - title='API Server Request Rates', - id=6, - dataSource='prometheus', - isNew=False, - editable=False, - dashLength=10, - lineWidth=1, - nullPointMode="null", - spaceLength=10, - span=6, - dashes=False, - tooltip=Tooltip( - msResolution=False, - valueType='individual', - ), - yAxes=YAxes( - YAxis(format='short', min=None), - YAxis(format='short', min=None), - ), - 
targets=[ - { - 'expr': 'sum by(instance) (rate(apiserver_' - 'request_count{code!~\"2..\"}[5m]))', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Error Rate', - 'refId': 'A', - 'step': 60, - }, - { - 'expr': 'sum by(instance) (rate(apiserver_' - 'request_count[5m]))', - 'format': 'time_series', - 'intervalFactor': 2, - 'legendFormat': 'Request Rate', - 'refId': 'B', - 'step': 60, - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/kubernetes-resource-requests.dashboard.py b/assets/grafana/kubernetes-resource-requests.dashboard.py deleted file mode 100644 index 5d5b3bd8c7be1a9d715b9b63aee143f8a202d1f1..0000000000000000000000000000000000000000 --- a/assets/grafana/kubernetes-resource-requests.dashboard.py +++ /dev/null @@ -1,205 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Kubernetes Resource Requests', - version=2, - graphTooltip=0, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-3h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - rows=[ - Row( - height=300, title='CPU Cores', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='CPU Cores', - description='This represents the total [CPU resource ' - 'requests](https://kubernetes.io/docs/concepts/configu' - 'ration/manage-compute-resources-container/#meaning-of-' - 'cpu) in the cluster.\nFor comparison the total ' - '[allocatable CPU cores](https://github.com/kubernetes/' - 'community/blob/master/contributors/design-proposals/' - 'node-allocatable.md) is also shown.', - id=1, - dataSource='prometheus', - dashLength=10, - dashes=False, - isNew=False, - editable=False, - lineWidth=1, - spaceLength=10, - nullPointMode='null', - span=9, - tooltip=Tooltip( - msResolution=False, valueType='individual' - ), - yAxes=YAxes( - YAxis(format='short', label='CPU 
Cores', min=None,), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'min(sum(kube_node_status_allocatable_' - 'cpu_cores) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Allocatable CPU Cores', - 'refId': 'A', - 'step': 20, - }, - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_cpu_cores) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Requested CPU Cores', - 'refId': 'B', - 'step': 20, - }, - ], - ), - SingleStat( - title='CPU Cores', - dataSource='prometheus', - id=2, - format='percent', - editable=False, - span=3, - gauge=Gauge(show=True), - sparkline=SparkLine(show=True), - valueFontSize='110%', - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_cpu_cores) by (instance)) / min(sum' - '(kube_node_status_allocatable_cpu_cores) by ' - '(instance)) * 100', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 240, - }, - ], - ), - ], - ), - Row( - height=300, title='Memory', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory', - id=3, - dataSource='prometheus', - description='This represents the total [memory resource ' - 'requests](https://kubernetes.io/docs/concepts/' - 'configuration/manage-compute-resources-container/' - '#meaning-of-memory) in the cluster.\nFor comparison ' - 'the total [allocatable memory](https://github.com/' - 'kubernetes/community/blob/master/contributors/' - 'design-proposals/node-allocatable.md) is also shown.', - dashLength=10, - dashes=False, - lineWidth=1, - isNew=False, - editable=False, - spaceLength=10, - span=9, - nullPointMode='null', - tooltip=Tooltip( - msResolution=False, valueType='individual' - ), - yAxes=YAxes( - YAxis(format='bytes', label='Memory', min=None), - YAxis(format='short', 
min=None), - ), - targets=[ - { - 'expr': 'min(sum(kube_node_status_allocatable_' - 'memory_bytes) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Allocatable Memory', - 'refId': 'A', - 'step': 20, - }, - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_memory_bytes) by (instance))', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': 'Requested Memory', - 'refId': 'B', - 'step': 20, - }, - ], - ), - SingleStat( - title='Memory', - dataSource='prometheus', - id=4, - format='percent', - span=3, - gauge=Gauge(show=True), - sparkline=SparkLine(show=True), - editable=False, - valueFontSize='110%', - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - targets=[ - { - 'expr': 'max(sum(kube_pod_container_resource_' - 'requests_memory_bytes) by (instance)) / ' - 'min(sum(kube_node_status_allocatable_memory_' - 'bytes) by (instance)) * 100', - 'intervalFactor': 2, - 'legendFormat': '', - 'refId': 'A', - 'step': 240, - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/nodes.dashboard.py b/assets/grafana/nodes.dashboard.py deleted file mode 100644 index da7b7d247834316c06e0218757da3964f70b60c0..0000000000000000000000000000000000000000 --- a/assets/grafana/nodes.dashboard.py +++ /dev/null @@ -1,423 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Nodes', - version=2, - description='Dashboard to get an overview of one server', - gnetId=22, - graphTooltip=0, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-1h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - templating=Templating(list=[ - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 
'label': None, - 'multi': False, - 'name': 'server', - 'options': [], - 'query': 'label_values(node_boot_time, instance)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Idle CPU', - dataSource='prometheus', - id=3, - isNew=False, - editable=False, - spaceLength=10, - span=6, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis( - format='percent', - label='cpu usage', - max=100, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': '100 - (avg by (cpu) (irate(node_cpu' - '{mode=\"idle\", instance=\"$server\"}[5m])) ' - '* 100)', - 'hide': False, - 'intervalFactor': 10, - 'legendFormat': '{{cpu}}', - 'refId': 'A', - 'step': 50, - } - ], - ), - Graph( - title='System Load', - dataSource='prometheus', - id=9, - isNew=False, - editable=False, - spaceLength=10, - span=6, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='percentunit', min=None,), - YAxis(format='short', min=None,), - ), - targets=[ - { - 'expr': 'node_load1{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 1m', - 'refId': 'A', - 'step': 20, - 'target': '', - }, - { - 'expr': 'node_load5{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 5m', - 'refId': 'B', - 'step': 20, - 'target': '', - }, - { - 'expr': 'node_load15{instance=\"$server\"}', - 'intervalFactor': 4, - 'legendFormat': 'load 15m', - 'refId': 'C', - 'step': 20, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory Usage', - dataSource='prometheus', - id=4, - isNew=False, - editable=False, - spaceLength=10, - span=9, - stack=True, - dashLength=10, - 
dashes=False, - tooltip=Tooltip( - msResolution=False, valueType='individual', - ), - seriesOverrides=[ - { - 'alias': 'node_memory_SwapFree{instance=' - '\"172.17.0.1:9100\",job=\"prometheus\"}', - 'yaxis': 2, - }, - ], - yAxes=YAxes( - YAxis(format='bytes', min='0',), - YAxis(format='short', min=None,), - ), - targets=[ - { - 'expr': 'node_memory_MemTotal{instance=' - '\"$server\"} - node_memory_MemFree{instance=' - '\"$server\"} - node_memory_Buffers{instance=' - '\"$server\"} - node_memory_Cached{instance=' - '\"$server\"}', - 'hide': False, - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory used', - 'metric': '', - 'refId': 'C', - 'step': 10, - }, - { - 'expr': 'node_memory_Buffers{instance=' - '\"$server\"}', - 'interval': '', - 'intervalFactor': 2, - 'legendFormat': 'memory buffers', - 'metric': '', - 'refId': 'E', - 'step': 10, - }, - { - 'expr': 'node_memory_Cached{instance=\"$server\"}', - 'intervalFactor': 2, - 'legendFormat': 'memory cached', - 'metric': '', - 'refId': 'F', - 'step': 10, - }, - { - 'expr': 'node_memory_MemFree{instance=' - '\"$server\"}', - 'intervalFactor': 2, - 'legendFormat': 'memory free', - 'metric': '', - 'refId': 'D', - 'step': 10, - }, - ], - ), - SingleStat( - title='Memory Usage', - dataSource='prometheus', - id=5, - format='percent', - gauge=Gauge(show=True), - editable=False, - span=3, - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - } - ], - thresholds='80, 90', - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - targets=[ - { - 'expr': '((node_memory_MemTotal{instance=' - '\"$server\"} - node_memory_MemFree{instance=' - '\"$server\"} - node_memory_Buffers{instance=' - '\"$server\"} - node_memory_Cached{instance=' - '\"$server\"}) / node_memory_MemTotal{instance=' - '\"$server\"}) * 100', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, titleSize='h6', title='New Row', editable=False, - 
showTitle=False, panels=[ - Graph( - title='Disk I/O', - dataSource='prometheus', - id=6, - dashLength=10, - dashes=False, - editable=False, - spaceLength=10, - span=9, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis( - format='bytes', - min=None, - ), - YAxis( - format='ms', - min=None, - ), - ), - seriesOverrides=[ - { - 'alias': 'read', - 'yaxis': 1, - }, - { - 'alias': '{instance=\"172.17.0.1:9100\"}', - 'yaxis': 2, - }, - { - 'alias': 'io time', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'sum by (instance) (rate(node_disk_' - 'bytes_read{instance=\"$server\"}[2m]))', - 'hide': False, - 'intervalFactor': 4, - 'legendFormat': 'read', - 'refId': 'A', - 'step': 20, - 'target': '', - }, - { - 'expr': 'sum by (instance) (rate(node_disk_' - 'bytes_written{instance=\"$server\"}[2m]))', - 'intervalFactor': 4, - 'legendFormat': 'written', - 'refId': 'B', - 'step': 20 - }, - { - 'expr': 'sum by (instance) (rate(node_disk_io_' - 'time_ms{instance=\"$server\"}[2m]))', - 'intervalFactor': 4, - 'legendFormat': 'io time', - 'refId': 'C', - 'step': 20, - }, - ], - ), - SingleStat( - title='Disk Space Usage', - dataSource='prometheus', - id=7, - thresholds='0.75, 0.9', - editable=False, - valueName='current', - format='percentunit', - span=3, - gauge=Gauge( - maxValue=1, - show=True, - ), - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - targets=[ - { - 'expr': '(sum(node_filesystem_size{device!=' - '\"rootfs\",instance=\"$server\"}) - ' - 'sum(node_filesystem_free{device!=\"rootfs\",' - 'instance=\"$server\"})) / sum(node_filesystem_' - 'size{device!=\"rootfs\",instance=\"$server\"})', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 60, - 'target': '', - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', titleSize='h6', - showTitle=False, editable=False, - panels=[ - Graph( - title='Network Received', - dataSource='prometheus', - id=8, - 
dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - seriesOverrides=[ - { - 'alias': 'transmitted', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'rate(node_network_receive_bytes{' - 'instance=\"$server\",device!~\"lo\"}[5m])', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '{{device}}', - 'refId': 'A', - 'step': 10, - 'target': '' - } - ], - ), - Graph( - title='Network Transmitted', - dataSource='prometheus', - id=10, - dashLength=10, - dashes=False, - isNew=False, - editable=False, - spaceLength=10, - span=6, - tooltip=Tooltip(msResolution=False), - yAxes=YAxes( - YAxis(format='bytes', min=None), - YAxis(format='bytes', min=None), - ), - seriesOverrides=[ - { - 'alias': 'transmitted', - 'yaxis': 2, - }, - ], - targets=[ - { - 'expr': 'rate(node_network_transmit_bytes' - '{instance=\"$server\",device!~\"lo\"}[5m])', - 'hide': False, - 'intervalFactor': 2, - 'legendFormat': '{{device}}', - 'refId': 'B', - 'step': 10, - 'target': '', - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/pods.dashboard.py b/assets/grafana/pods.dashboard.py deleted file mode 100644 index 84b3fdefcb416ff8b5beb89c2c3e535a5c0425ad..0000000000000000000000000000000000000000 --- a/assets/grafana/pods.dashboard.py +++ /dev/null @@ -1,255 +0,0 @@ -from grafanalib.core import * - - -dashboard = Dashboard( - title='Pods', - version=1, - graphTooltip=1, - refresh=False, - editable=False, - schemaVersion=14, - time=Time(start='now-6h'), - timezone='browser', - inputs=[ - { - 'name': 'prometheus', - 'label': 'prometheus', - 'description': '', - 'type': 'datasource', - 'pluginId': 'prometheus', - 'pluginName': 'Prometheus' - }, - ], - templating=Templating(list=[ - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': True, - 'label': 'Namespace', - 'multi': False, 
- 'name': 'namespace', - 'options': [], - 'query': 'label_values(kube_pod_info, namespace)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Pod', - 'multi': False, - 'name': 'pod', - 'options': [], - 'query': 'label_values(kube_pod_info{namespace=~"$namespace"}, ' - 'pod)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': True, - 'label': 'Container', - 'multi': False, - 'name': 'container', - 'options': [], - 'query': 'label_values(kube_pod_container_info{namespace=' - '"$namespace", pod="$pod"}, container)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row( - height=250, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Memory Usage', - dataSource='prometheus', - id=1, - isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - yAxes=YAxes( - YAxis( - format='bytes', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by(container_name) (container_' - 'memory_usage_bytes{pod_name="$pod", ' - 'container_name=~"$container", ' - 'container_name!="POD"})', - 'interval': '10s', - 'intervalFactor': 1, - 'legendFormat': 'Current: {{ container_name }}', - 'metric': 'container_memory_usage_bytes', - 'refId': 'A', - 'step': 15, - }, - { - 'expr': 
'kube_pod_container_resource_requests_' - 'memory_bytes{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Requested: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'requests_memory_bytes', - 'refId': 'B', - 'step': 20, - }, - { - 'expr': 'kube_pod_container_resource_limits_' - 'memory_bytes{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Limit: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'limits_memory_bytes', - 'refId': 'C', - 'step': 20, - }, - ], - ), - ], - ), - Row( - height=250, title='Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='CPU Usage', - dataSource='prometheus', - id=2, - isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - yAxes=YAxes( - YAxis( - format='short', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sum by (container_name)(' - 'rate(container_cpu_usage_seconds_total' - '{image!="",container_name!="POD",pod_name=' - '"$pod"}[1m]))', - 'intervalFactor': 2, - 'legendFormat': '{{ container_name }}', - 'refId': 'A', - 'step': 30 - }, - { - 'expr': 'kube_pod_container_resource_requests_' - 'cpu_cores{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Requested: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'requests_cpu_cores', - 'refId': 'B', - 'step': 20, - }, - { - 'expr': 'kube_pod_container_resource_limits_' - 'cpu_cores{pod="$pod", container=~' - '"$container"}', - 'interval': '10s', - 'intervalFactor': 2, - 'legendFormat': 'Limit: {{ container }}', - 'metric': 'kube_pod_container_resource_' - 'limits_memory_bytes', - 'refId': 'C', - 
'step': 20, - }, - ], - ), - ], - ), - Row( - height=250, title='New Row', showTitle=False, editable=False, - titleSize='h6', panels=[ - Graph( - title='Network I/O', - dataSource='prometheus', - id=3, - isNew=False, - editable=False, - spaceLength=10, - span=12, - dashLength=10, - dashes=False, - legend=Legend( - alignAsTable=True, avg=True, current=True, - rightSide=True, total=False, values=True, - ), - tooltip=Tooltip(msResolution=True, valueType='cumulative'), - yAxes=YAxes( - YAxis( - format='bytes', min=None, - ), - YAxis(format='short', min=None), - ), - targets=[ - { - 'expr': 'sort_desc(sum by (pod_name) (rate' - '(container_network_receive_bytes_total{' - 'pod_name="$pod"}[1m])))', - 'intervalFactor': 2, - 'legendFormat': '{{ pod_name }}', - 'refId': 'A', - 'step': 30 - }, - ], - ), - ], - ), - ], -) diff --git a/assets/grafana/prometheus-datasource.json b/assets/grafana/prometheus-datasource.json deleted file mode 100644 index 47b8f1b2f6f4d705c6044f05b805fecb9806e0a8..0000000000000000000000000000000000000000 --- a/assets/grafana/prometheus-datasource.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "access": "proxy", - "basicAuth": false, - "name": "prometheus", - "type": "prometheus", - "url": "http://prometheus-k8s.monitoring.svc:9090" -} diff --git a/assets/grafana/raw-json-dashboards/etcd-dashboard.json b/assets/grafana/raw-json-dashboards/etcd-dashboard.json deleted file mode 100644 index f2a03cec5fc7576649dc37fa0d46866ec1060f5c..0000000000000000000000000000000000000000 --- a/assets/grafana/raw-json-dashboards/etcd-dashboard.json +++ /dev/null @@ -1,1158 +0,0 @@ -{ - "__inputs": [ - { - "name": "prometheus", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.5.2" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - 
"id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "description": "etcd sample Grafana dashboard with Prometheus", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": false, - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "prometheus", - "editable": false, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 28, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(etcd_server_has_leader)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "etcd_server_has_leader", - "refId": "A", - "step": 20 - } - ], - "thresholds": "", - "title": "Up", - "type": "singlestat", - "valueFontSize": "200%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": 
false, - "error": false, - "fill": 0, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Rate", - "metric": "grpc_server_started_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Failed Rate", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 41, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - 
"seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Watch Streams", - "metric": "grpc_server_handled_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Lease Streams", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Active Streams", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - 
"nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB Size", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "DB Size", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 1, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} WAL fsync", - "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.99, 
sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} DB fsync", - "metric": "etcd_disk_backend_commit_duration_seconds_bucket", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Sync Duration", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 29, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Resident Memory", - "metric": "process_resident_memory_bytes", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - 
"logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic In", - "metric": "etcd_network_client_grpc_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 5, - "id": 21, - "legend": { - 
"avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Client Traffic Out", - "metric": "etcd_network_client_grpc_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer 
Traffic In", - "metric": "etcd_network_peer_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "editable": false, - "error": false, - "fill": 0, - "grid": {}, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} Peer Traffic Out", - "metric": "etcd_network_peer_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "max": null, - "min": null, - 
"show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "editable": false, - "error": false, - "fill": 0, - "id": 40, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Failure Rate", - "metric": "etcd_server_proposals_failed_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(etcd_server_proposals_pending)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Pending Total", - "metric": "etcd_server_proposals_pending", - "refId": "B", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Commit Rate", - "metric": "etcd_server_proposals_committed_total", - "refId": "C", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Proposal Apply Rate", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Raft Proposals", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - 
"value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": 0, - "editable": false, - "error": false, - "fill": 0, - "id": 19, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "changes(etcd_server_leader_changes_seen_total[1d])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} Total Leader Elections Per Day", - "metric": "etcd_server_leader_changes_seen_total", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total Leader Elections Per Day", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": 
"New row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "etcd", - "version": 4 -} \ No newline at end of file diff --git a/assets/grafana/statefulset.dashboard.py b/assets/grafana/statefulset.dashboard.py deleted file mode 100644 index 780630a28741ab752e3907ec81df88c04d931399..0000000000000000000000000000000000000000 --- a/assets/grafana/statefulset.dashboard.py +++ /dev/null @@ -1,440 +0,0 @@ -import sys -import os.path -sys.path.insert(0, os.path.dirname(__file__)) -from _grafanalib import * - - -dashboard = Dashboard( - title='StatefulSet', - version=1, - graphTooltip=1, - time=Time(start='now-6h'), - templating=Templating(list=[ - { - 'allValue': '.*', - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'Namespace', - 'multi': False, - 'name': 'statefulset_namespace', - 'options': [], - 'query': 'label_values(kube_statefulset_metadata_generation, ' - 'namespace)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': None, - 'tags': [], - 'tagsQuery': '', - 'type': 'query', - 'useTags': False, - }, - { - 'allValue': None, - 'current': {}, - 'datasource': 'prometheus', - 'hide': 0, - 'includeAll': False, - 'label': 'StatefulSet', - 'multi': False, - 'name': 'statefulset_name', - 'options': [], - 'query': 'label_values(kube_statefulset_metadata_generation' - '{namespace="$statefulset_namespace"}, statefulset)', - 'refresh': 1, - 'regex': '', - 'sort': 0, - 'tagValuesQuery': '', - 'tags': [], - 'tagsQuery': 'statefulset', - 'type': 'query', - 'useTags': False, - }, - ]), - rows=[ - Row(panels=[ - 
SingleStat( - title='CPU', - id=8, - gauge=Gauge(show=False), - postfix='cores', - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - targets=[ - { - 'expr': 'sum(rate(container_cpu_usage_seconds_total' - '{namespace=\"$statefulset_namespace\",pod_name=~\"' - '$statefulset_name.*\"}[3m]))', - }, - ], - ), - SingleStat( - title='Memory', - id=9, - postfix='GB', - prefixFontSize='80%', - gauge=Gauge(show=False), - span=4, - valueFontSize='110%', - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(container_memory_usage_bytes{namespace=' - '\"$statefulset_namespace\",pod_name=~\"$' - 'statefulset_name.*\"}) / 1024^3', - 'intervalFactor': 2, - 'refId': 'A', - 'step': 600, - }, - ], - ), - SingleStat( - title='Network', - format='Bps', - gauge=Gauge(thresholdMarkers=False), - id=7, - postfix='', - span=4, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - sparkline=SparkLine( - fillColor=(31, 118, 189, 0.18), - lineColor=(31, 120, 193), - show=True, - ), - valueMaps=[ - { 
- 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'expr': 'sum(rate(container_network_transmit_' - 'bytes_total' - '{namespace=\"$statefulset_namespace\",pod_name=~\"' - '$statefulset_name.*\"}[3m])) + ' - 'sum(rate(container_network_receive_bytes_total' - '{namespace=\"$statefulset_namespace\",pod_name=~' - '\"$statefulset_name.*\"}[3m]))', - }, - ], - ), - ], - height=200, - ), - Row( - height=100, panels=[ - SingleStat( - title='Desired Replicas', - id=5, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - span=3, - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - targets=[ - { - 'metric': 'kube_statefulset_replicas', - 'expr': 'max(kube_statefulset_replicas' - '{statefulset="$statefulset_name",namespace=' - '"$statefulset_namespace"}) without ' - '(instance, pod)', - }, - ], - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - gauge=Gauge(thresholdMarkers=False, show=False), - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - ), - SingleStat( - title='Available Replicas', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=6, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'min(kube_statefulset_status_replicas' - '{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], 
- ), - SingleStat( - title='Observed Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(), - id=3, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'max(kube_statefulset_status_observed_' - 'generation{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - }, - ], - rangeMaps=[ - { - 'from': "null", - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - } - ], - ), - SingleStat( - title='Metadata Generation', - colors=[ - (245, 54, 54, 0.9), - (237, 129, 40, 0.89), - (50, 172, 45, 0.97), - ], - gauge=Gauge(show=False), - id=2, - mappingType=1, - mappingTypes=[ - { - 'name': 'value to text', - 'value': 1, - }, - { - 'name': 'range to text', - 'value': 2, - }, - ], - targets=[ - { - 'expr': 'max(kube_statefulset_metadata_generation' - '{statefulset=\"$statefulset_name\",namespace=\"' - '$statefulset_namespace\"}) without (instance, ' - 'pod)', - }, - ], - rangeMaps=[ - { - 'from': 'null', - 'text': 'N/A', - 'to': 'null', - }, - ], - span=3, - sparkline=SparkLine(), - valueMaps=[ - { - 'op': '=', - 'text': 'N/A', - 'value': 'null', - }, - ], - ), - ], - ), - Row( - height=350, panels=[ - Graph( - title='Replicas', - dashLength=10, - dashes=False, - id=1, - spaceLength=10, - targets=[ - { - 'expr': 'min(kube_statefulset_status_replicas' - '{statefulset=\"$statefulset_name\",' - 'namespace=\"$statefulset_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'available', - 'refId': 'B', - 'step': 30, - }, - { - 'expr': 'max(kube_statefulset_replicas' - '{statefulset=\"$statefulset_name\",namespace=\"' - '$statefulset_namespace\"}) without ' - '(instance, pod)', - 'legendFormat': 'desired', - 'refId': 'E', - 'step': 30, - } - ], - 
xAxis=XAxis(mode='time'), - yAxes=YAxes( - YAxis(min=None), - YAxis(format='short', min=None, show=False), - ), - ), - ] - ), - ], -) diff --git a/assets/prometheus/rules/alertmanager.rules.yaml b/assets/prometheus/rules/alertmanager.rules.yaml deleted file mode 100644 index 5e51f75b09c0e030c2360528c9b3c83175b02763..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/alertmanager.rules.yaml +++ /dev/null @@ -1,33 +0,0 @@ -groups: -- name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) - GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", - "alertmanager-$1", "alertmanager", "(.*)") != 1 - for: 5m - labels: - severity: critical - annotations: - description: The configuration of the instances of the Alertmanager cluster - `{{$labels.service}}` are out of sync. - summary: Configuration out of sync - - alert: AlertmanagerDownOrMissing - expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", - "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 - for: 5m - labels: - severity: warning - annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers - disappeared from discovery. - summary: Alertmanager down or missing - - alert: AlertmanagerFailedReload - expr: alertmanager_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. 
- summary: Alertmanager's configuration reload failed diff --git a/assets/prometheus/rules/etcd3.rules.yaml b/assets/prometheus/rules/etcd3.rules.yaml deleted file mode 100644 index a16bf016c5e16ea5eee9b798c45919a08ce35c12..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/etcd3.rules.yaml +++ /dev/null @@ -1,123 +0,0 @@ -groups: -- name: ./etcd3.rules - rules: - - alert: InsufficientMembers - expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - for: 3m - labels: - severity: critical - annotations: - description: If one more etcd member goes down the cluster will be unavailable - summary: etcd cluster insufficient members - - alert: NoLeader - expr: etcd_server_has_leader{job="etcd"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: etcd member {{ $labels.instance }} has no leader - summary: etcd member has no leader - - alert: HighNumberOfLeaderChanges - expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader - changes within the last hour - summary: a high number of leader changes within the etcd cluster are happening - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: HighNumberOfFailedGRPCRequests - expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) - / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 - for: 5m - labels: - severity: 
critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed - on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: GRPCRequestsSlow - expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical - annotations: - description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method - }} are slow - summary: slow gRPC requests - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) - BY (method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: HTTPRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method - }} are slow - summary: slow HTTP requests - - alert: EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} member communication 
with - {{ $labels.To }} is slow - summary: etcd member communication is slow - - alert: HighNumberOfFailedProposals - expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal - failures within the last hour - summary: a high number of proposals within the etcd cluster are failing - - alert: HighFsyncDurations - expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) - > 0.5 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} fync durations are high - summary: high fsync durations - - alert: HighCommitDurations - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) - > 0.25 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} commit durations are high - summary: high commit durations diff --git a/assets/prometheus/rules/general.rules.yaml b/assets/prometheus/rules/general.rules.yaml deleted file mode 100644 index 84ce6b47fc9013df48a2eb23e958205e83f42bba..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/general.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groups: -- name: general.rules - rules: - - alert: TargetDown - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of {{ $labels.job }} targets are down.' - summary: Targets are down - - alert: DeadMansSwitch - expr: vector(1) - labels: - severity: none - annotations: - description: This is a DeadMansSwitch meant to ensure that the entire Alerting - pipeline is functional. 
- summary: Alerting DeadMansSwitch - - record: fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: predict_linear(fd_utilization[10m], 3600) > 1 - for: 10m - labels: - severity: critical - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted diff --git a/assets/prometheus/rules/node.rules.yaml b/assets/prometheus/rules/node.rules.yaml deleted file mode 100644 index e678ca8410f1f21bf5ac78aad558c659e25f5bef..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/node.rules.yaml +++ /dev/null @@ -1,47 +0,0 @@ -groups: -- name: node.rules - rules: - - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) - BY (instance) - - record: instance:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) - BY (instance) - - record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) - - record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) - - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) - GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) - - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / 
count(sum(node_cpu) BY (instance, cpu)) - - alert: NodeExporterDown - expr: absent(up{job="node-exporter"} == 1) - for: 10m - labels: - severity: warning - annotations: - description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery - summary: Prometheus could not scrape a node-exporter - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 - for: 30m - labels: - severity: warning - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 24 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 24 hours - - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 - for: 10m - labels: - severity: critical - annotations: - description: device {{$labels.device}} on node {{$labels.instance}} is running - full within the next 2 hours (mounted at {{$labels.mountpoint}}) - summary: Node disk is running full within 2 hours diff --git a/assets/prometheus/rules/prometheus.rules.yaml b/assets/prometheus/rules/prometheus.rules.yaml deleted file mode 100644 index da699c3212ec8415b22212a71c28fa588d0e9c8d..0000000000000000000000000000000000000000 --- a/assets/prometheus/rules/prometheus.rules.yaml +++ /dev/null @@ -1,101 +0,0 @@ -groups: -- name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - expr: prometheus_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} - summary: Reloading Promehteus' configuration failed - - - alert: PrometheusNotificationQueueRunningFull - expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity - for: 10m - labels: - severity: warning - annotations: - description: Prometheus' alert notification queue is 
running full for {{$labels.namespace}}/{{ - $labels.pod}} - summary: Prometheus' alert notification queue is running full - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alert from Prometheus - - - alert: PrometheusErrorSendingAlerts - expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) - > 0.03 - for: 10m - labels: - severity: critical - annotations: - description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ - $labels.pod}} to Alertmanager {{$labels.Alertmanager}} - summary: Errors while sending alerts from Prometheus - - - alert: PrometheusNotConnectedToAlertmanagers - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 10m - labels: - severity: warning - annotations: - description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected - to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - - - alert: PrometheusTSDBReloadsFailing - expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - - - alert: PrometheusTSDBCompactionsFailing - expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 - for: 12h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} - compaction failures over the last four hours.' 
- summary: Prometheus has issues compacting sample blocks - - - alert: PrometheusTSDBWALCorruptions - expr: tsdb_wal_corruptions_total > 0 - for: 4h - labels: - severity: warning - annotations: - description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead - log (WAL).' - summary: Prometheus write-ahead log is corrupted - - - alert: PrometheusNotIngestingSamples - expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0 - for: 10m - labels: - severity: warning - annotations: - description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples." - summary: "Prometheus isn't ingesting samples" - - - alert: PrometheusTargetScapesDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 10m - labels: - severity: warning - annotations: - description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values" - summary: Prometheus has many samples rejected diff --git a/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..d283cc188aa3e9208d9dbf139b3d9fbc423d891e --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alertmanager.libsonnet @@ -0,0 +1,53 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'alertmanager.rules', + rules: [ + { + alert: 'AlertmanagerConfigInconsistent', + annotations: { + description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.', + summary: 'Configuration out of sync', + }, + expr: ||| + count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 
'critical', + }, + }, + { + alert: 'AlertmanagerDownOrMissing', + annotations: { + description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.', + summary: 'Alertmanager down or missing', + }, + expr: ||| + label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'AlertmanagerFailedReload', + annotations: { + description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", + summary: "Alertmanager's configuration reload failed", + }, + expr: ||| + alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/jsonnet/kube-prometheus/alerts/alerts.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..19568a24cb680b3da24463207d008fe019a226c6 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -0,0 +1,4 @@ +(import 'alertmanager.libsonnet') + +(import 'general.libsonnet') + +(import 'node.libsonnet') + +(import 'prometheus.libsonnet') diff --git a/jsonnet/kube-prometheus/alerts/general.libsonnet b/jsonnet/kube-prometheus/alerts/general.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..6f3e4534366f801489e65d83a310a6de320cc652 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/general.libsonnet @@ -0,0 +1,34 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'general.rules', + rules: [ + { + alert: 'TargetDown', + annotations: { + description: '{{ $value }}% of {{ $labels.job }} targets are down.', + summary: 'Targets are down', + }, + expr: '100 * (count(up == 0) BY (job) / 
count(up) BY (job)) > 10', + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'DeadMansSwitch', + annotations: { + description: 'This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.', + summary: 'Alerting DeadMansSwitch', + }, + expr: 'vector(1)', + labels: { + severity: 'none', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/node.libsonnet b/jsonnet/kube-prometheus/alerts/node.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..f5387a999aa679f48b2aac8483c17f0ddc0ede6d --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/node.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'kube-prometheus-node-alerting.rules', + rules: [ + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 24 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0 + ||| % $._config, + 'for': '30m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeDiskRunningFull', + annotations: { + description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})', + summary: 'Node disk is running full within 2 hours', + }, + expr: ||| + predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/alerts/prometheus.libsonnet b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..32d8262b2f0472860e7c4b05adad032304fa0c18 --- /dev/null +++ b/jsonnet/kube-prometheus/alerts/prometheus.libsonnet @@ -0,0 +1,151 @@ +{ 
+ prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus.rules', + rules: [ + { + alert: 'PrometheusConfigReloadFailed', + annotations: { + description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}", + summary: "Reloading Promehteus' configuration failed", + }, + expr: ||| + prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotificationQueueRunningFull', + annotations: { + description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}", + summary: "Prometheus' alert notification queue is running full", + }, + expr: ||| + predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s} + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alert from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + annotations: { + description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + summary: 'Errors while sending alerts from Prometheus', + }, + expr: ||| + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + 
}, + }, + { + alert: 'PrometheusNotConnectedToAlertmanagers', + annotations: { + description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers', + summary: 'Prometheus is not connected to any Alertmanagers', + }, + expr: ||| + prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBReloadsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.', + summary: 'Prometheus has issues reloading data blocks from disk', + }, + expr: ||| + increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBCompactionsFailing', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.', + summary: 'Prometheus has issues compacting sample blocks', + }, + expr: ||| + increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTSDBWALCorruptions', + annotations: { + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).', + summary: 'Prometheus write-ahead log is corrupted', + }, + expr: ||| + tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + ||| % $._config, + 'for': '4h', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusNotIngestingSamples', + annotations: { + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + summary: "Prometheus isn't ingesting samples", + }, + expr: ||| + rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0 + ||| % $._config, + 
'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'PrometheusTargetScapesDuplicate', + annotations: { + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', + summary: 'Prometheus has many samples rejected', + }, + expr: ||| + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/jsonnet/kube-prometheus/kube-prometheus.libsonnet index e79b75676b49ad6709eb66c8b9a13796d2bbd372..6c1636de58bae9c1f54ea9d006aa3a8977d99ef1 100644 --- a/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -6,7 +6,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; (import 'alertmanager/alertmanager.libsonnet') + (import 'prometheus-operator/prometheus-operator.libsonnet') + (import 'prometheus/prometheus.libsonnet') + -(import 'kubernetes-mixin/mixin.libsonnet') + { +(import 'kubernetes-mixin/mixin.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'rules/rules.libsonnet') + { kubePrometheus+:: { namespace: k.core.v1.namespace.new($._config.namespace), }, @@ -14,11 +16,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; _config+:: { namespace: 'default', - kubeStateMetricsSelector: 'job="kube-state-metrics"', cadvisorSelector: 'job="kubelet"', - nodeExporterSelector: 'job="node-exporter"', kubeletSelector: 'job="kubelet"', + kubeStateMetricsSelector: 'job="kube-state-metrics"', + nodeExporterSelector: 'job="node-exporter"', notKubeDnsSelector: 'job!="kube-dns"', + kubeSchedulerSelector: 'job="kube-scheduler"', + kubeControllerManagerSelector: 'job="kube-controller-manager"', + kubeApiserverSelector: 'job="apiserver"', + podLabel: 'pod', + + alertmanagerSelector: 'job="alertmanager-main"', + 
prometheusSelector: 'job="prometheus-k8s"', + prometheusOperatorSelector: 'job="prometheus-operator"', + + jobs: { + Kubelet: $._config.kubeletSelector, + KubeScheduler: $._config.kubeSchedulerSelector, + KubeControllerManager: $._config.kubeControllerManagerSelector, + KubeAPI: $._config.kubeApiserverSelector, + KubeStateMetrics: $._config.kubeStateMetricsSelector, + NodeExporter: $._config.nodeExporterSelector, + Alertmanager: $._config.alertmanagerSelector, + Prometheus: $._config.prometheusSelector, + PrometheusOperator: $._config.prometheusOperatorSelector, + }, prometheus+:: { rules: $.prometheusRules + $.prometheusAlerts, diff --git a/jsonnet/kube-prometheus/rules/rules.libsonnet b/jsonnet/kube-prometheus/rules/rules.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..ec3a331e1aa6f1230976bd7cbc950a6fd458056c --- /dev/null +++ b/jsonnet/kube-prometheus/rules/rules.libsonnet @@ -0,0 +1,39 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'kube-prometheus-node-recording.rules', + rules: [ + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)', + record: 'instance:node_cpu:rate:sum', + }, + { + expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)', + record: 'instance:node_filesystem_usage:sum', + }, + { + expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)', + record: 'instance:node_network_receive_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)', + record: 'instance:node_network_transmit_bytes:rate:sum', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)', + record: 'instance:node_cpu:ratio', + }, + { + expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))', + record: 'cluster:node_cpu:sum_rate5m', + }, + { + expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, 
cpu))', + record: 'cluster:node_cpu:ratio', + }, + ], + }, + ], + }, +} diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index af7e274927bd13b313e02a7875de337fae283834..f40585627152d13232354abc2d66db5ef241819d 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -3868,7 +3868,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)", + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4097,7 +4097,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4228,7 +4228,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)", + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container_name}}", @@ -4457,7 +4457,7 @@ data: ], "targets": [ { - "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "expr": 
"sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -5003,7 +5003,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -5206,7 +5206,7 @@ data: "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "percent", "gauge": { "maxValue": 100, @@ -6066,7 +6066,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6145,7 +6145,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6224,7 +6224,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6317,7 +6317,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6397,7 +6397,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6477,7 +6477,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, @@ -6557,7 +6557,7 @@ data: "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "prometheus", + "datasource": "$datasource", "format": "none", "gauge": { "maxValue": 100, diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 
35aaa927fa64aa4493a48c5864990466bbb2ad98..d916ff29d698884c09364669b31e9c690a8bb1ca 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -49,13 +49,13 @@ data: without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n- \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99, - sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) without(instance, + sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\": - |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + |\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) + \ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n- \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info) @@ -122,20 +122,49 @@ data: by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m]) +\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n \ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n - \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-absent\"\n - \ \"rules\": \n - 
\"alert\": \"KubeAPIDown\"\n \"annotations\": \n \"message\": - \"KubeAPI has disappeared from Prometheus target discovery.\"\n \"expr\": |\n - \ absent(up{job=\"kube-apiserver\"} == 1)\n \"for\": \"15m\"\n \"labels\": - \n \"severity\": \"critical\"\n - \"alert\": \"KubeControllerManagerDown\"\n - \ \"annotations\": \n \"message\": \"KubeControllerManager has disappeared - from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-controller-manager\"} + \ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n + \ \"rules\": \n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m])) + BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\": + \"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"})) + BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\": + \"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n + \ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\": + \"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m])) + WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, + cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\": + \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\": + \"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu) + BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\": + \"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\": + \n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n 
\"severity\": \"critical\"\n - \"alert\": \"KubeAPIDown\"\n + \ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus + target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n + \ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\": + \"KubeControllerManager has disappeared from Prometheus target discovery.\"\n + \ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\": + \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeSchedulerDown\"\n + \ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \ - \"alert\": \"KubeSchedulerDown\"\n \"annotations\": \n \"message\": - \"KubeScheduler has disappeared from Prometheus target discovery.\"\n \"expr\": - |\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\": + \ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\": + \"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\": \n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n \ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n + \ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n + \ \"annotations\": \n \"message\": \"NodeExporter has disappeared from + Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus + has disappeared from Prometheus target 
discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"} + == 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n + \ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\": + \"PrometheusOperator has disappeared from Prometheus target discovery.\"\n \"expr\": + |\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n \ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n \ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n \ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container @@ -239,28 +268,116 @@ data: 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\": \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} - {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} + {{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 4\n \"for\": \"10m\"\n \"labels\": \n 
\"severity\": \"critical\"\n \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\": - \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m])) - without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m])) + \"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) + without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n - \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) + \ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring in less 
than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by - (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) - < 86400\n \"labels\": \n \"severity\": \"critical\"" + (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) + < 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n + \ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\": + \n \"description\": \"The configuration of the instances of the Alertmanager + cluster `{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration + out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"}) + BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\": + \"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n + \ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers + are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\": + \"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"}, + \"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT() + sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\": + \n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n + \ \"annotations\": \n \"description\": \"Reloading Alertmanager's configuration + has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\": + \"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"} + == 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n- + 
\"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\": + \n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n + \ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0) + BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n + \ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\": + \n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire + Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n + \ \"expr\": \"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n- + \"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\": + \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device + {{$labels.device}} on node {{$labels.instance}} is running full within the next + 24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk + is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h], + 3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": + \"device {{$labels.device}} on node {{$labels.instance}} is running full within + the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node + disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m], + 3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n- + \"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n + \ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration + has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading + Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"} + == 0\n \"for\": \"10m\"\n 
\"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\": + \n \"description\": \"Prometheus' alert notification queue is running full + for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus' + alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m], + 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\": + \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusErrorSendingAlerts\"\n + \ \"annotations\": \n \"description\": \"Errors while sending alerts from + Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n + \ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\": + |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": + \"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\": + \"Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} + to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while + sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m]) + / rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n + \ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n - + \"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n + \ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is + not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected + to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"} + < 1\n \"for\": \"10m\"\n 
\"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures + over the last two hours.\"\n \"summary\": \"Prometheus has issues reloading + data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction + failures over the last two hours.\"\n \"summary\": \"Prometheus has issues + compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h]) + > 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\": + \"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n + \ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\": + |\n tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\": + \"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n + \ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace + }}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus + isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m]) + <= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n + \ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\": + \"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate + timestamps but different values\"\n \"summary\": \"Prometheus has many samples + rejected\"\n \"expr\": |\n 
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m]) + > 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"" kind: ConfigMap metadata: labels: