From b7fe018d29836975ed9efa2e344daf781aaae2c8 Mon Sep 17 00:00:00 2001 From: Maxime Brunet <maxime.brunet@paytm.com> Date: Sat, 31 Jul 2021 11:37:12 -0700 Subject: [PATCH] eks: Revert back to `awscni_total_ip_addresses`-based alert --- .../kube-prometheus/platforms/eks.libsonnet | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/jsonnet/kube-prometheus/platforms/eks.libsonnet b/jsonnet/kube-prometheus/platforms/eks.libsonnet index d99885d8..ad188837 100644 --- a/jsonnet/kube-prometheus/platforms/eks.libsonnet +++ b/jsonnet/kube-prometheus/platforms/eks.libsonnet @@ -1,8 +1,13 @@ (import '../addons/managed-cluster.libsonnet') + { values+:: { - eks: { - minimumAvailableIPs: 10, - minimumAvailableIPsTime: '10m', + awsVpcCni: { + // `minimumWarmIPs` should be less than or equal to `WARM_IP_TARGET`. + // + // References: + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/docs/eni-and-ip-target.md + // https://github.com/aws/amazon-vpc-cni-k8s/blob/v1.9.0/pkg/ipamd/ipamd.go#L61-L71 + minimumWarmIPs: 10, + minimumWarmIPsTime: '10m', }, }, kubernetesControlPlane+: { @@ -17,7 +22,8 @@ ], }, }, - AwsEksCniMetricService: { + + serviceAwsVpcCniMetrics: { apiVersion: 'v1', kind: 'Service', metadata: { @@ -38,14 +44,14 @@ }, }, - serviceMonitorAwsEksCNI: { + serviceMonitorAwsVpcCni: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: { - name: 'awsekscni', + name: 'aws-node', namespace: $.values.common.namespace, labels: { - 'app.kubernetes.io/name': 'eks-cni', + 'app.kubernetes.io/name': 'aws-node', }, }, spec: { @@ -78,30 +84,34 @@ ], }, }, - prometheusRuleEksCNI: { + + prometheusRuleAwsVpcCni: { apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', metadata: { labels: $.prometheus._config.commonLabels + $.prometheus._config.mixin.ruleLabels, - name: 'eks-rules', + name: 'aws-vpc-cni-rules', namespace: $.prometheus._config.namespace, }, spec: { groups: [ { - name: 'kube-prometheus-eks.rules', + name:
'kube-prometheus-aws-vpc-cni.rules', rules: [ { - expr: 'sum by(instance) (awscni_ip_max) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $.values.eks.minimumAvailableIPs, + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $.values.awsVpcCni.minimumWarmIPs, labels: { severity: 'critical', }, annotations: { - summary: 'EKS CNI is running low on available IPs', - description: 'Instance {{ $labels.instance }} has only {{ $value }} IPs available which is lower than set threshold of %s' % $.values.eks.minimumAvailableIPs, + summary: 'AWS VPC CNI has a low warm IP pool', + description: ||| + Instance {{ $labels.instance }} has only {{ $value }} warm IPs which is lower than set threshold of %s. + It could mean the current subnet is out of available IP addresses or the CNI is unable to request them from the EC2 API. + ||| % $.values.awsVpcCni.minimumWarmIPs, }, - 'for': $.values.eks.minimumAvailableIPsTime, - alert: 'EksCNILowAvailableIPs', + 'for': $.values.awsVpcCni.minimumWarmIPsTime, + alert: 'AwsVpcCniWarmIPsLow', }, ], }, -- GitLab