From 7d8bbb0576c2fe3921a90c3a9772511e71689fe1 Mon Sep 17 00:00:00 2001
From: Sheogorath <sheogorath@shivering-isles.com>
Date: Sat, 21 Aug 2021 20:30:42 +0200
Subject: [PATCH] rook: Add prometheus rules for rook monitoring

---
 infrastructure/rook/kustomization.yaml       |   2 +
 .../rook/prometheus-ceph-rules-external.yaml |  37 ++
 .../rook/prometheus-ceph-rules.yaml          | 351 ++++++++++++++++++
 3 files changed, 390 insertions(+)
 create mode 100644 infrastructure/rook/prometheus-ceph-rules-external.yaml
 create mode 100644 infrastructure/rook/prometheus-ceph-rules.yaml

diff --git a/infrastructure/rook/kustomization.yaml b/infrastructure/rook/kustomization.yaml
index 7dfca2104..01dd686dc 100644
--- a/infrastructure/rook/kustomization.yaml
+++ b/infrastructure/rook/kustomization.yaml
@@ -11,3 +11,5 @@ resources:
   - storageclass.yaml
   - https://git.shivering-isles.com/github-mirror/rook/rook/-/raw/v1.7.1/cluster/examples/kubernetes/ceph/monitoring/service-monitor.yaml
   - https://git.shivering-isles.com/github-mirror/rook/rook/-/raw/v1.7.1/cluster/examples/kubernetes/ceph/monitoring/csi-metrics-service-monitor.yaml
+  - prometheus-ceph-rules.yaml
+  - prometheus-ceph-rules-external.yaml
diff --git a/infrastructure/rook/prometheus-ceph-rules-external.yaml b/infrastructure/rook/prometheus-ceph-rules-external.yaml
new file mode 100644
index 000000000..dc212eb36
--- /dev/null
+++ b/infrastructure/rook/prometheus-ceph-rules-external.yaml
@@ -0,0 +1,37 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    role: alert-rules
+  name: prometheus-ceph-rules-external
+  namespace: rook-ceph
+spec:
+  groups:
+  - name: persistent-volume-alert.rules
+    rules:
+    - alert: PersistentVolumeUsageNearFull
+      annotations:
+        description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+          75%. Free up some space.
+        message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
+          is required.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
+      for: 5s
+      labels:
+        severity: warning
+    - alert: PersistentVolumeUsageCritical
+      annotations:
+        description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+          85%. Free up some space immediately.
+        message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data
+          deletion is required.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
+      for: 5s
+      labels:
+        severity: critical
diff --git a/infrastructure/rook/prometheus-ceph-rules.yaml b/infrastructure/rook/prometheus-ceph-rules.yaml
new file mode 100644
index 000000000..77f0fa87c
--- /dev/null
+++ b/infrastructure/rook/prometheus-ceph-rules.yaml
@@ -0,0 +1,351 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    role: alert-rules
+  name: prometheus-ceph-rules
+  namespace: rook-ceph
+spec:
+  groups:
+  - name: ceph.rules
+    rules:
+    - expr: |
+        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
+      record: cluster:ceph_node_down:join_kube
+    - expr: |
+        avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
+      record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
+  - name: telemeter.rules
+    rules:
+    - expr: |
+        count(ceph_osd_metadata{job="rook-ceph-mgr"})
+      record: job:ceph_osd_metadata:count
+    - expr: |
+        count(kube_persistentvolume_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})
+      record: job:kube_pv:count
+    - expr: |
+        sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"})
+      record: job:ceph_pools_iops:total
+    - expr: |
+        sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"})
+      record: job:ceph_pools_iops_bytes:total
+    - expr: |
+        count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))
+      record: job:ceph_versions_running:count
+  - name: ceph-mgr-status
+    rules:
+    - alert: CephMgrIsAbsent
+      annotations:
+        description: Ceph Manager has disappeared from Prometheus target discovery.
+        message: Storage metrics collector service is not available anymore.
+        severity_level: critical
+        storage_type: ceph
+      expr: |
+        absent(up{job="rook-ceph-mgr"} == 1)
+      for: 5m
+      labels:
+        severity: critical
+    - alert: CephMgrIsMissingReplicas
+      annotations:
+        description: Ceph Manager is missing replicas.
+        message: Storage metrics collector service doesn't have the required number of replicas.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        sum(up{job="rook-ceph-mgr"}) < 1
+      for: 5m
+      labels:
+        severity: warning
+  - name: ceph-mds-status
+    rules:
+    - alert: CephMdsMissingReplicas
+      annotations:
+        description: Minimum required replicas for storage metadata service not available.
+          Might affect the working of the storage cluster.
+        message: Insufficient replicas for storage metadata service.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
+      for: 5m
+      labels:
+        severity: warning
+  - name: quorum-alert.rules
+    rules:
+    - alert: CephMonQuorumAtRisk
+      annotations:
+        description: Storage cluster quorum is low. Contact Support.
+        message: Storage quorum at risk
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= ((count(ceph_mon_metadata{job="rook-ceph-mgr"}) % 2) + 1)
+      for: 15m
+      labels:
+        severity: critical
+    - alert: CephMonHighNumberOfLeaderChanges
+      annotations:
+        description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
+          }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
+        message: Storage cluster has seen many leader changes recently.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
+      for: 5m
+      labels:
+        severity: warning
+  - name: ceph-node-alert.rules
+    rules:
+    - alert: CephNodeDown
+      annotations:
+        description: Storage node {{ $labels.node }} went down. Please check the node
+          immediately.
+        message: Storage node {{ $labels.node }} went down
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        cluster:ceph_node_down:join_kube == 0
+      for: 30s
+      labels:
+        severity: critical
+  - name: osd-alert.rules
+    rules:
+    - alert: CephOSDCriticallyFull
+      annotations:
+        description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class
+          type {{$labels.device_class}} has crossed 80% on host {{ $labels.hostname
+          }}. Immediately free up some space or add capacity of type {{$labels.device_class}}.
+        message: Back-end storage device is critically full.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        (ceph_osd_metadata * on (ceph_daemon) group_right(device_class) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80
+      for: 40s
+      labels:
+        severity: critical
+    - alert: CephOSDFlapping
+      annotations:
+        description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times
+          in the last 5 minutes. Please check the pod events or ceph status to find out
+          the cause.
+        message: Ceph storage OSD flapping.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        changes(ceph_osd_up[5m]) >= 10
+      for: 0s
+      labels:
+        severity: critical
+    - alert: CephOSDNearFull
+      annotations:
+        description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class
+          type {{$labels.device_class}} has crossed 75% on host {{ $labels.hostname
+          }}. Immediately free up some space or add capacity of type {{$labels.device_class}}.
+        message: Back-end storage device is nearing full.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        (ceph_osd_metadata * on (ceph_daemon) group_right(device_class) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
+      for: 40s
+      labels:
+        severity: warning
+    - alert: CephOSDDiskNotResponding
+      annotations:
+        description: Disk device {{ $labels.device }} not responding on host {{ $labels.host
+          }}.
+        message: Disk not responding
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
+      for: 1m
+      labels:
+        severity: critical
+    - alert: CephOSDDiskUnavailable
+      annotations:
+        description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
+          }}.
+        message: Disk not accessible
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
+      for: 1m
+      labels:
+        severity: critical
+    - alert: CephOSDSlowOps
+      annotations:
+        description: '{{ $value }} Ceph OSD requests are taking too long to process.
+          Please check ceph status to find out the cause.'
+        message: OSD requests are taking too long to process.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        ceph_healthcheck_slow_ops > 0
+      for: 30s
+      labels:
+        severity: warning
+    - alert: CephDataRecoveryTakingTooLong
+      annotations:
+        description: Data recovery has been active for too long. Contact Support.
+        message: Data recovery is slow
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        ceph_pg_undersized > 0
+      for: 2h
+      labels:
+        severity: warning
+    - alert: CephPGRepairTakingTooLong
+      annotations:
+        description: Self-heal operations are taking too long. Contact Support.
+        message: Self-heal problems detected
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        ceph_pg_inconsistent > 0
+      for: 1h
+      labels:
+        severity: warning
+  - name: persistent-volume-alert.rules
+    rules:
+    - alert: PersistentVolumeUsageNearFull
+      annotations:
+        description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+          75%. Free up some space or expand the PVC.
+        message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
+          or PVC expansion is required.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
+      for: 5s
+      labels:
+        severity: warning
+    - alert: PersistentVolumeUsageCritical
+      annotations:
+        description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+          85%. Free up some space or expand the PVC immediately.
+        message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data
+          deletion or PVC expansion is required.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
+      for: 5s
+      labels:
+        severity: critical
+  - name: cluster-state-alert.rules
+    rules:
+    - alert: CephClusterErrorState
+      annotations:
+        description: Storage cluster is in error state for more than 10m.
+        message: Storage cluster is in error state
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        ceph_health_status{job="rook-ceph-mgr"} > 1
+      for: 10m
+      labels:
+        severity: critical
+    - alert: CephClusterWarningState
+      annotations:
+        description: Storage cluster is in warning state for more than 10m.
+        message: Storage cluster is in degraded state
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        ceph_health_status{job="rook-ceph-mgr"} == 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: CephOSDVersionMismatch
+      annotations:
+        description: There are {{ $value }} different versions of Ceph OSD components
+          running.
+        message: There are multiple versions of storage services running.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: CephMonVersionMismatch
+      annotations:
+        description: There are {{ $value }} different versions of Ceph Mon components
+          running.
+        message: There are multiple versions of storage services running.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
+      for: 10m
+      labels:
+        severity: warning
+  - name: cluster-utilization-alert.rules
+    rules:
+    - alert: CephClusterNearFull
+      annotations:
+        description: Storage cluster utilization has crossed 75% and will become read-only
+          at 85%. Free up some space or expand the storage cluster.
+        message: Storage cluster is nearing full. Data deletion or cluster expansion
+          is required.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75
+      for: 5s
+      labels:
+        severity: warning
+    - alert: CephClusterCriticallyFull
+      annotations:
+        description: Storage cluster utilization has crossed 80% and will become read-only
+          at 85%. Free up some space or expand the storage cluster immediately.
+        message: Storage cluster is critically full and needs immediate data deletion
+          or cluster expansion.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80
+      for: 5s
+      labels:
+        severity: critical
+    - alert: CephClusterReadOnly
+      annotations:
+        description: Storage cluster utilization has crossed 85% and will become read-only
+          now. Free up some space or expand the storage cluster immediately.
+        message: Storage cluster is read-only now and needs immediate data deletion
+          or cluster expansion.
+        severity_level: error
+        storage_type: ceph
+      expr: |
+        ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85
+      for: 0s
+      labels:
+        severity: critical
+  - name: pool-quota.rules
+    rules:
+    - alert: CephPoolQuotaBytesNearExhaustion
+      annotations:
+        description: Storage pool {{ $labels.name }} quota usage has crossed 70%.
+        message: Storage pool quota (bytes) is near exhaustion.
+        severity_level: warning
+        storage_type: ceph
+      expr: |
+        (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.70
+      for: 1m
+      labels:
+        severity: warning
+    - alert: CephPoolQuotaBytesCriticallyExhausted
+      annotations:
+        description: Storage pool {{ $labels.name }} quota usage has crossed 90%.
+        message: Storage pool quota (bytes) is critically exhausted.
+        severity_level: critical
+        storage_type: ceph
+      expr: |
+        (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.90
+      for: 1m
+      labels:
+        severity: critical
--
GitLab