From 463ad065d39022b507399ac9f15b441ec7bbfc69 Mon Sep 17 00:00:00 2001
From: Philip Gough <philip.p.gough@gmail.com>
Date: Tue, 20 Jul 2021 10:43:02 +0100
Subject: [PATCH] jsonnet: Drop cAdvisor metrics with no (pod, namespace)
 labels while preserving ability to monitor system services resource usage

The following lists each dropped metric with a short description and a cardinality estimate based on tests in a local cluster:

container_blkio_device_usage_total - useful for containers, but not for system services (nodes*disks*services*operations*2)
container_fs_.*                    - filesystem read/write data (nodes*disks*services*4)
container_file_descriptors         - file descriptor limits and global numbers are exposed elsewhere (nodes*services)
container_threads_max              - max number of threads in cgroup. Usually not limited for system services (nodes*services)
container_threads                  - used threads in cgroup. Usually not important for system services (nodes*services)
container_sockets                  - used sockets in cgroup. Usually not important for system services (nodes*services)
container_start_time_seconds       - container start. Possibly not needed for system services (nodes*services)
container_last_seen                - Not needed as system services are always running (nodes*services)
container_spec_.*                  - Everything related to cgroup specification and thus static data (nodes*services*5)
---
 .../components/k8s-control-plane.libsonnet      | 17 +++++++++++++++++
 manifests/kubernetes-serviceMonitorKubelet.yaml |  6 ++++++
 2 files changed, 23 insertions(+)

diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
index dbf3474b..475a3e5a 100644
--- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
+++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet
@@ -120,6 +120,23 @@ function(params) {
               regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
               action: 'drop',
             },
+            // Drop cAdvisor metrics that carry no (pod, namespace) labels, while preserving the ability to monitor system services' resource usage. Per-metric notes end with the estimated cardinality.
+            {
+              sourceLabels: ['__name__', 'pod', 'namespace'],
+              action: 'drop',
+              regex: '(' + std.join('|',
+                                    [
+                                      'container_fs_.*',  // filesystem read/write data (nodes*disks*services*4)
+                                      'container_spec_.*',  // everything related to cgroup specification and thus static data (nodes*services*5)
+                                      'container_blkio_device_usage_total',  // useful for containers, but not for system services (nodes*disks*services*operations*2)
+                                      'container_file_descriptors',  // file descriptor limits and global numbers are exposed elsewhere (original note was truncated after "exposed via" - TODO name the source) (nodes*services)
+                                      'container_sockets',  // used sockets in cgroup. Usually not important for system services (nodes*services)
+                                      'container_threads_max',  // max number of threads in cgroup. Usually not limited for system services (nodes*services)
+                                      'container_threads',  // used threads in cgroup. Usually not important for system services (nodes*services)
+                                      'container_start_time_seconds',  // container start. Possibly not needed for system services (nodes*services)
+                                      'container_last_seen',  // not needed as system services are always running (nodes*services)
+                                    ]) + ');;',  // source label values are joined with ';' (Prometheus default separator), so the trailing ';;' only matches when both pod and namespace are empty
+            },
           ],
         },
         {
diff --git a/manifests/kubernetes-serviceMonitorKubelet.yaml b/manifests/kubernetes-serviceMonitorKubelet.yaml
index 9e3ec0d5..ea050274 100644
--- a/manifests/kubernetes-serviceMonitorKubelet.yaml
+++ b/manifests/kubernetes-serviceMonitorKubelet.yaml
@@ -60,6 +60,12 @@ spec:
       regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
       sourceLabels:
       - __name__
+    - action: drop
+      regex: (container_fs_.*|container_spec_.*|container_blkio_device_usage_total|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
+      sourceLabels:
+      - __name__
+      - pod
+      - namespace
     path: /metrics/cadvisor
     port: https-metrics
     relabelings:
-- 
GitLab