From a089ac60fac690bdcda0cd932f0887ba1728a7bd Mon Sep 17 00:00:00 2001
From: Sheogorath <sheogorath@shivering-isles.com>
Date: Sun, 10 Apr 2022 16:03:02 +0200
Subject: [PATCH] feat(cert-manager): Add dashboard and alerts

This patch adds various cert-manager alerts and dashboards. These are
inspired by the uneeq-oss cert-manager-mixin on GitLab and by
monitoring-mixins on GitHub.

References:
https://github.com/monitoring-mixins/website
---
 infrastructure/cert-manager/alerts.yaml       |   64 +
 .../cert-manager/dashboards/cert-manager.json | 1203 +++++++++++++++++
 .../cert-manager/kustomization.yaml           |    8 +
 3 files changed, 1275 insertions(+)
 create mode 100644 infrastructure/cert-manager/alerts.yaml
 create mode 100644 infrastructure/cert-manager/dashboards/cert-manager.json

diff --git a/infrastructure/cert-manager/alerts.yaml b/infrastructure/cert-manager/alerts.yaml
new file mode 100644
index 000000000..e056dac6f
--- /dev/null
+++ b/infrastructure/cert-manager/alerts.yaml
@@ -0,0 +1,64 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: prometheus-cert-manager-rules
+  namespace: cert-manager
+spec:
+  groups:
+    - name: cert-manager
+      rules:
+      - alert: CertManagerAbsent  # no `up` series exists for job "cert-manager"
+        annotations:
+          description: New certificates will not be able to be minted, and existing ones
+            can't be renewed until cert-manager is back.
+          runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
+          summary: Cert Manager has disappeared from Prometheus service discovery.
+        expr: absent(up{job="cert-manager"})
+        for: 10m  # the expression must stay true this long before the alert fires
+        labels:
+          severity: critical
+    - name: certificates
+      rules:
+      - alert: CertManagerCertExpirySoon  # certificate is less than 21 days from expiry
+        expr: |
+          avg by (exported_namespace, namespace, name) (
+            certmanager_certificate_expiration_timestamp_seconds - time()
+          ) < (21 * 24 * 3600) # 21 days in seconds
+        for: 1h  # the expression must stay true this long before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }}
+            from expiry, it should have renewed over a week ago.
+          description: The domain that this cert covers will be unavailable
+            after {{ $value | humanizeDuration }}. Clients using endpoints that this cert
+            protects will start to fail in {{ $value | humanizeDuration }}.
+          runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon
+      - alert: CertManagerCertNotReady  # ready_status is 1 for a condition other than "True"
+        expr: |
+          max by (name, exported_namespace, namespace, condition) (
+            certmanager_certificate_ready_status{condition!="True"} == 1
+          )
+        for: 10m  # keep in step with the "10m" quoted in the description below
+        labels:
+          severity: critical
+        annotations:
+          summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
+          description: This certificate has not been ready to serve traffic for at
+            least 10m. If the cert is being renewed or there is another valid cert, the
+            ingress controller _may_ be able to serve that instead.
+          runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready
+      - alert: CertManagerHittingRateLimits  # ACME client requests with status="429"
+        expr: |
+          sum by (host) (
+            rate(certmanager_http_acme_client_request_count{status="429"}[5m])
+          ) > 0
+        for: 5m  # the expression must stay true this long before the alert fires
+        labels:
+          severity: critical
+        annotations:
+          summary: Cert manager hitting LetsEncrypt rate limits.
+          description: Depending on the rate limit, cert-manager may be unable to
+            generate certificates for up to a week.
+          runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits
diff --git a/infrastructure/cert-manager/dashboards/cert-manager.json b/infrastructure/cert-manager/dashboards/cert-manager.json
new file mode 100644
index 000000000..2d6608b3f
--- /dev/null
+++ b/infrastructure/cert-manager/dashboards/cert-manager.json
@@ -0,0 +1,1203 @@
+{
+   "annotations": {
+      "list": [
+         {
+            "builtIn": 1,
+            "datasource": "-- Grafana --",
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+         }
+      ]
+   },
+   "description": "",
+   "editable": true,
+   "gnetId": null,
+   "graphTooltip": 1,
+   "id": 59,
+   "iteration": 1616445892702,
+   "links": [ ],
+   "panels": [
+      {
+         "datasource": "$datasource",
+         "description": "The number of certificates in the ready state.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 1
+                     }
+                  ]
+               }
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byName",
+                     "options": "True"
+                  },
+                  "properties": [
+                     {
+                        "id": "thresholds",
+                        "value": {
+                           "mode": "absolute",
+                           "steps": [
+                              {
+                                 "color": "green",
+                                 "value": null
+                              }
+                           ]
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 0
+         },
+         "id": 2,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "text": { },
+            "textMode": "auto"
+         },
+         "pluginVersion": "7.4.5",
+         "targets": [
+            {
+               "expr": "sum by (condition) (certmanager_certificate_ready_status)",
+               "interval": "",
+               "legendFormat": "{{condition}}",
+               "refId": "A"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Certificates Ready",
+         "type": "stat"
+      },
+      {
+         "datasource": "$datasource",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "decimals": 1,
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "red",
+                        "value": null
+                     },
+                     {
+                        "color": "#EAB839",
+                        "value": 604800
+                     },
+                     {
+                        "color": "green",
+                        "value": 1209600
+                     }
+                  ]
+               },
+               "unit": "dtdurations"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 0
+         },
+         "id": 4,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "text": { },
+            "textMode": "auto"
+         },
+         "pluginVersion": "7.4.5",
+         "targets": [
+            {
+               "expr": "min(certmanager_certificate_expiration_timestamp_seconds > 0) - time()",
+               "hide": false,
+               "instant": true,
+               "interval": "",
+               "legendFormat": "",
+               "refId": "A"
+            },
+            {
+               "expr": "vector(1250000)",
+               "hide": true,
+               "instant": true,
+               "interval": "",
+               "legendFormat": "",
+               "refId": "B"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Soonest Cert Expiry",
+         "type": "stat"
+      },
+      {
+         "datasource": "$datasource",
+         "description": "Status of the certificates. Values are True, False or Unknown.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "align": null,
+                  "filterable": false
+               },
+               "mappings": [
+                  {
+                     "from": "",
+                     "id": 0,
+                     "operator": "",
+                     "text": "Yes",
+                     "to": "",
+                     "type": 1,
+                     "value": ""
+                  }
+               ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "none"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byName",
+                     "options": "Ready Status"
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.width",
+                        "value": 100
+                     }
+                  ]
+               },
+               {
+                  "matcher": {
+                     "id": "byName",
+                     "options": "Valid Until"
+                  },
+                  "properties": [
+                     {
+                        "id": "unit",
+                        "value": "dateTimeAsIso"
+                     }
+                  ]
+               },
+               {
+                  "matcher": {
+                     "id": "byName",
+                     "options": "Valid Until"
+                  },
+                  "properties": [
+                     {
+                        "id": "unit",
+                        "value": "dateTimeAsIso"
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 8
+         },
+         "id": 9,
+         "options": {
+            "showHeader": true,
+            "sortBy": [
+               {
+                  "desc": false,
+                  "displayName": "Valid Until"
+               }
+            ]
+         },
+         "pluginVersion": "7.4.5",
+         "targets": [
+            {
+               "expr": "label_join(avg by (name, namespace, condition, exported_namespace) (certmanager_certificate_ready_status == 1), \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")",
+               "format": "table",
+               "instant": true,
+               "interval": "",
+               "legendFormat": "",
+               "refId": "A"
+            },
+            {
+               "expr": "label_join(avg by (name, namespace, exported_namespace) (certmanager_certificate_expiration_timestamp_seconds) * 1000, \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")",
+               "format": "table",
+               "instant": true,
+               "interval": "",
+               "legendFormat": "",
+               "refId": "B"
+            }
+         ],
+         "timeFrom": null,
+         "timeShift": null,
+         "title": "Certificates",
+         "transformations": [
+            {
+               "id": "seriesToColumns",
+               "options": {
+                  "byField": "namespaced_name"
+               }
+            },
+            {
+               "id": "organize",
+               "options": {
+                  "excludeByName": {
+                     "Time": true,
+                     "Time 1": true,
+                     "Time 2": true,
+                     "Value #A": true,
+                     "exported_namespace": false,
+                     "exported_namespace 1": false,
+                     "exported_namespace 2": true,
+                     "name 1": true,
+                     "namespace 2": true,
+                     "namespaced_name": true
+                  },
+                  "indexByName": {
+                     "Time 1": 8,
+                     "Time 2": 10,
+                     "Value #A": 6,
+                     "Value #B": 5,
+                     "condition": 4,
+                     "exported_namespace 1": 1,
+                     "exported_namespace 2": 11,
+                     "name 1": 9,
+                     "name 2": 3,
+                     "namespace": 0,
+                     "namespace 1": 2,
+                     "namespaced_name": 7
+                  },
+                  "renameByName": {
+                     "Time 1": "",
+                     "Value #B": "Valid Until",
+                     "condition": "Ready Status",
+                     "exported_namespace": "Certificate Namespace",
+                     "exported_namespace 1": "Certificate Namespace",
+                     "exported_namespace 2": "",
+                     "name": "Certificate",
+                     "name 2": "Certificate",
+                     "namespace": "Namespace",
+                     "namespace 1": "Namespace",
+                     "namespaced_name": ""
+                  }
+               }
+            }
+         ],
+         "type": "table"
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "The rate of controller sync requests.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 1,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 8
+         },
+         "hiddenSeries": false,
+         "id": 7,
+         "interval": "20s",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "maxDataPoints": 250,
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [ ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum by (controller) (\n  rate(certmanager_controller_sync_call_count[$__rate_interval])\n)",
+               "interval": "",
+               "legendFormat": "{{controller}}",
+               "refId": "A"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "Controller Sync Requests/sec",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "reqps",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "Rate of requests to ACME provider.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 1,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 16
+         },
+         "hiddenSeries": false,
+         "id": 6,
+         "interval": "20s",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "hideEmpty": true,
+            "hideZero": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "maxDataPoints": 250,
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [ ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum by (method, path, status) (\n  rate(certmanager_http_acme_client_request_count[$__rate_interval])\n)",
+               "interval": "",
+               "legendFormat": "{{method}} {{path}} {{status}}",
+               "refId": "A"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "ACME HTTP Requests/sec",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "reqps",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": { },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "Average duration of requests to ACME provider. ",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 1,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 16
+         },
+         "hiddenSeries": false,
+         "id": 10,
+         "interval": "30s",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "hideEmpty": true,
+            "hideZero": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "maxDataPoints": 250,
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [ ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "sum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_sum[$__rate_interval]))\n/\nsum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_count[$__rate_interval]))",
+               "interval": "",
+               "legendFormat": "{{method}} {{path}} {{status}}",
+               "refId": "A"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "ACME HTTP Request avg duration",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "s",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": {
+            "max": "dark-yellow"
+         },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "CPU Usage and limits, as percent of a vCPU core. ",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 0,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 24
+         },
+         "hiddenSeries": false,
+         "id": 12,
+         "interval": "1m",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "maxDataPoints": 250,
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [
+            {
+               "alias": "/CPU.*/",
+               "fill": 1,
+               "fillGradient": 5
+            },
+            {
+               "alias": "/Request.*/",
+               "color": "#FF9830",
+               "dashes": true
+            },
+            {
+               "alias": "/Limit.*/",
+               "color": "#F2495C",
+               "dashes": true
+            }
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "avg by (pod) (rate(container_cpu_usage_seconds_total{container=\"cert-manager\"}[$__rate_interval]))",
+               "format": "time_series",
+               "hide": false,
+               "interval": "",
+               "intervalFactor": 2,
+               "legendFormat": "CPU {{pod}}",
+               "refId": "A"
+            },
+            {
+               "expr": "avg by (pod) (kube_pod_container_resource_limits{resource=\"cpu\", container=\"cert-manager\"})",
+               "format": "time_series",
+               "hide": true,
+               "interval": "",
+               "intervalFactor": 1,
+               "legendFormat": "Limit {{pod}}",
+               "refId": "B"
+            },
+            {
+               "expr": "avg by (pod) (kube_pod_container_resource_requests{resource=\"cpu\", container=\"cert-manager\"})",
+               "format": "time_series",
+               "hide": true,
+               "interval": "",
+               "intervalFactor": 1,
+               "legendFormat": "Request {{pod}}",
+               "refId": "C"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "CPU",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "percentunit",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": {
+            "max": "dark-yellow"
+         },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "Percent of the time that the CPU is being throttled. Higher is worse.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 0,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 24
+         },
+         "hiddenSeries": false,
+         "id": 14,
+         "interval": "1m",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "maxDataPoints": 250,
+         "nullPointMode": "connected",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [
+            {
+               "alias": "/cert-manager.*/",
+               "fill": 1,
+               "fillGradient": 5
+            }
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "avg by (pod) (\n  rate(container_cpu_cfs_throttled_periods_total{container=\"cert-manager\"}[$__rate_interval])\n  /\n  rate(container_cpu_cfs_periods_total{container=\"cert-manager\"}[$__rate_interval])\n)",
+               "format": "time_series",
+               "hide": false,
+               "interval": "",
+               "intervalFactor": 2,
+               "legendFormat": "{{pod}}",
+               "refId": "A"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "CPU Throttling",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "percentunit",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": {
+            "max": "dark-yellow"
+         },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "Memory utilisation and limits.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 0,
+         "fillGradient": 0,
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 24
+         },
+         "hiddenSeries": false,
+         "id": 16,
+         "interval": "1m",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "maxDataPoints": 250,
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [
+            {
+               "alias": "/Memory.*/",
+               "fill": 1,
+               "fillGradient": 5
+            },
+            {
+               "alias": "/Request.*/",
+               "color": "#FF9830",
+               "dashes": true
+            },
+            {
+               "alias": "/Limit.*/",
+               "color": "#F2495C",
+               "dashes": true
+            }
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "avg by (pod) (container_memory_usage_bytes{container=\"cert-manager\"})",
+               "format": "time_series",
+               "hide": false,
+               "interval": "",
+               "intervalFactor": 1,
+               "legendFormat": "Memory {{pod}}",
+               "refId": "A"
+            },
+            {
+               "expr": "avg by (pod) (kube_pod_container_resource_limits{container=\"cert-manager\", resource=\"memory\"})",
+               "format": "time_series",
+               "interval": "",
+               "intervalFactor": 1,
+               "legendFormat": "Limit {{pod}}",
+               "refId": "B"
+            },
+            {
+               "expr": "avg by (pod) (kube_pod_container_resource_requests{container=\"cert-manager\", resource=\"memory\"})",
+               "format": "time_series",
+               "interval": "",
+               "intervalFactor": 1,
+               "legendFormat": "Request {{pod}}",
+               "refId": "C"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "Memory",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "bytes",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": "0",
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      },
+      {
+         "aliasColors": {
+            "max": "dark-yellow"
+         },
+         "bars": false,
+         "dashLength": 10,
+         "dashes": false,
+         "datasource": "$datasource",
+         "description": "Network ingress/egress.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": { },
+               "links": [ ]
+            },
+            "overrides": [ ]
+         },
+         "fill": 1,
+         "fillGradient": 5,
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 24
+         },
+         "hiddenSeries": false,
+         "id": 18,
+         "interval": "1m",
+         "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+         },
+         "lines": true,
+         "linewidth": 1,
+         "links": [ ],
+         "nullPointMode": "null",
+         "options": {
+            "alertThreshold": true
+         },
+         "percentage": false,
+         "pluginVersion": "7.4.5",
+         "pointradius": 2,
+         "points": false,
+         "renderer": "flot",
+         "seriesOverrides": [
+            {
+               "alias": "transmit",
+               "transform": "negative-Y"
+            }
+         ],
+         "spaceLength": 10,
+         "stack": false,
+         "steppedLine": false,
+         "targets": [
+            {
+               "expr": "avg(\n  sum without (interface) (\n    rate(container_network_receive_bytes_total{namespace=\"cert-manager\"}[$__rate_interval])\n  )\n)",
+               "format": "time_series",
+               "hide": false,
+               "interval": "",
+               "intervalFactor": 2,
+               "legendFormat": "receive",
+               "refId": "A"
+            },
+            {
+               "expr": "avg(\n  sum without (interface) (\n    rate(container_network_transmit_bytes_total{namespace=\"cert-manager\"}[$__rate_interval])\n  )\n)",
+               "format": "time_series",
+               "hide": false,
+               "interval": "",
+               "intervalFactor": 2,
+               "legendFormat": "transmit",
+               "refId": "B"
+            }
+         ],
+         "thresholds": [ ],
+         "timeFrom": null,
+         "timeRegions": [ ],
+         "timeShift": null,
+         "title": "Network",
+         "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+         },
+         "type": "graph",
+         "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": [ ]
+         },
+         "yaxes": [
+            {
+               "format": "Bps",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            },
+            {
+               "format": "short",
+               "label": null,
+               "logBase": 1,
+               "max": null,
+               "min": null,
+               "show": true
+            }
+         ],
+         "yaxis": {
+            "align": false,
+            "alignLevel": null
+         }
+      }
+   ],
+   "refresh": "1m",
+   "schemaVersion": 27,
+   "style": "dark",
+   "tags": [
+      "cert-manager",
+      "infra"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "selected": false,
+               "text": "prometheus",
+               "value": "prometheus"
+            },
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "Data Source",
+            "multi": false,
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "queryValue": "",
+            "refresh": 1,
+            "regex": "",
+            "skipUrlSync": false,
+            "type": "datasource"
+         }
+      ]
+   },
+   "time": {
+      "from": "now-24h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ]
+   },
+   "timezone": "",
+   "title": "Cert Manager",
+   "uid": "TvuRo2iMk",
+   "version": 1
+}
diff --git a/infrastructure/cert-manager/kustomization.yaml b/infrastructure/cert-manager/kustomization.yaml
index fd51ad3b1..dd8fcaea5 100644
--- a/infrastructure/cert-manager/kustomization.yaml
+++ b/infrastructure/cert-manager/kustomization.yaml
@@ -6,8 +6,16 @@ resources:
   - repository.yaml
   - release.yaml
   - networkpolicy.yaml
+  - alerts.yaml
   - ../../shared/networkpolicies/allow-from-monitoring.yaml
   - ../../shared/networkpolicies/allow-from-same-namespace.yaml
   - ../../shared/networkpolicies/allow-from-all-namespaces.yaml
 patchesStrategicMerge:
   - networkpolicy-patch.yaml
+configMapGenerator:
+  - name: cert-manager-grafana-dashboards  # ConfigMap built from the dashboard file listed below
+    files:
+      - ./dashboards/cert-manager.json
+    options:
+      labels:
+        grafana_dashboard: cert-manager  # NOTE(review): presumably the label a Grafana dashboard sidecar selects on; confirm
-- 
GitLab