From b54ad2ea7151a2ed392b3dcad5dbd975b6c40fc6 Mon Sep 17 00:00:00 2001 From: Arunprasad Rajkumar <arajkuma@redhat.com> Date: Wed, 13 Apr 2022 12:01:06 +0530 Subject: [PATCH] Adjust NodeFilesystemSpaceFillingUp thresholds according to default kubelet GC behavior Previously[1] we attempted to do the same, but there was a misunderstanding about the GC behavior and it caused the alert to be fired even before GC comes into play. According to [2][3], kubelet GC kicks in only when `imageGCHighThresholdPercent` is hit, which is set to 85% by default. However, `NodeFilesystemSpaceFillingUp` is set to fire as soon as 80% usage is hit. This commit changes the `fsSpaceFillingUpWarningThreshold` to 15% so that we give ample time to GC to reclaim unwanted images. This commit also changes `fsSpaceFillingUpCriticalThreshold` to 10%, which gives admins more time to react to the warning before a critical alert is sent. [1] https://github.com/prometheus-operator/kube-prometheus/pull/1357 [2] https://docs.openshift.com/container-platform/4.10/nodes/nodes/nodes-nodes-garbage-collection.html#nodes-nodes-garbage-collection-images_nodes-nodes-configuring [3] https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/ Signed-off-by: Arunprasad Rajkumar <arajkuma@redhat.com> (cherry picked from commit 6ff8bfbb0256f35640c97104ac2d3b880ac681a4) --- jsonnet/kube-prometheus/components/node-exporter.libsonnet | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jsonnet/kube-prometheus/components/node-exporter.libsonnet b/jsonnet/kube-prometheus/components/node-exporter.libsonnet index 863cd12b..2da5dd84 100644 --- a/jsonnet/kube-prometheus/components/node-exporter.libsonnet +++ b/jsonnet/kube-prometheus/components/node-exporter.libsonnet @@ -35,9 +35,12 @@ local defaults = { // GC values, // imageGCLowThresholdPercent: 80 // imageGCHighThresholdPercent: 85 + // GC kicks in when imageGCHighThresholdPercent is hit and attempts to free space until usage drops to imageGCLowThresholdPercent. 
// See https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/ for more details. - fsSpaceFillingUpWarningThreshold: 20, - fsSpaceFillingUpCriticalThreshold: 15, + // Warn only after imageGCHighThresholdPercent is hit, but filesystem is not freed up for a prolonged duration. + fsSpaceFillingUpWarningThreshold: 15, + // Send critical alert only after (imageGCHighThresholdPercent + 5) is hit, but filesystem is not freed up for a prolonged duration. + fsSpaceFillingUpCriticalThreshold: 10, diskDeviceSelector: 'device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"', runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/node/%s', }, -- GitLab