diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index e1dc0b5955c2e065eaac9abe98b14852088316df..0873e7e89526fad11cee96c67a8a9184a0af7cca 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -68,7 +68,7 @@ local platformPatch = import './platforms/platforms.libsonnet'; image: $.values.common.images.grafana, prometheusName: $.values.prometheus.name, // TODO(paulfantom) This should be done by iterating over all objects and looking for object.mixin.grafanaDashboards - dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesControlPlane.mixin.grafanaDashboards, + dashboards: $.nodeExporter.mixin.grafanaDashboards + $.prometheus.mixin.grafanaDashboards + $.kubernetesControlPlane.mixin.grafanaDashboards + $.alertmanager.mixin.grafanaDashboards, }, kubeStateMetrics: { namespace: $.values.common.namespace, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index a7b5053d47b8bf4e2add4eb674c78e825b3f5748..f338b75264e56ec81c2b61a0e407d2f639271a4c 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -109,8 +109,8 @@ "subdir": "doc/alertmanager-mixin" } }, - "version": "7301451eb94d2081fb740abf29755861dd122c65", - "sum": "V8jcZQ1Qrlm7AQ6wjbuQQsacPb0NvrcZovKyplmzW5w=", + "version": "8598683b2461fb68e1921735c20163c4c784f9b6", + "sum": "YIWuR6x64SRQSCr8tuuGN1cc0TK5HGR0HWvgot3fc6k=", "name": "alertmanager" }, { diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 267433dbc4f05e9c1c30078c87de4001e1d117e6..de6813ea4f7887770db4c042ee9a19341fa11363 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1,5 +1,604 @@ apiVersion: v1 items: +- apiVersion: v1 + data: + alertmanager-overview.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Received", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Invalid", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts receive rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Total", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Failed", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notifications Send Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} 99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notification Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Notifications", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.5.4 + name: grafana-dashboard-alertmanager-overview + namespace: monitoring - apiVersion: v1 data: apiserver.json: |- diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index edb6623ea45f0c681f5075f68e79d5d6e819f6e5..c889337bf3e32aad8000e3e0a1aad1a7b7574b30 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -53,6 +53,9 @@ spec: - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false + - mountPath: /grafana-dashboard-definitions/0/alertmanager-overview + name: grafana-dashboard-alertmanager-overview + readOnly: false - mountPath: /grafana-dashboard-definitions/0/apiserver name: grafana-dashboard-apiserver readOnly: false @@ -135,6 +138,9 @@ spec: - configMap: name: grafana-dashboards name: grafana-dashboards + - configMap: + name: grafana-dashboard-alertmanager-overview + name: grafana-dashboard-alertmanager-overview - configMap: name: grafana-dashboard-apiserver name: grafana-dashboard-apiserver