diff --git a/README.md b/README.md index 50659f951e20f0ec3d90e939e27411a98bd76fc0..9a31c1277d4850fb147c6889cc6213a422bc2532 100644 --- a/README.md +++ b/README.md @@ -653,6 +653,7 @@ As described in the [Prerequisites](#prerequisites) section, in order to retriev If you are using Google's GKE product, see [cAdvisor support](docs/GKE-cadvisor-support.md). +If you are using AWS EKS, see [AWS EKS CNI support](docs/EKS-cni-support.md). #### Authentication problem The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations. diff --git a/docs/EKS-cni-support.md b/docs/EKS-cni-support.md new file mode 100644 index 0000000000000000000000000000000000000000..eafa74ad15b93071c89e0ff70db5bc6938d8c6cd --- /dev/null +++ b/docs/EKS-cni-support.md @@ -0,0 +1,42 @@ +# CNI monitoring special configuration updates for EKS + +AWS EKS uses the [CNI](https://github.com/aws/amazon-vpc-cni-k8s) networking plugin for pod networking in Kubernetes using Elastic Network Interfaces on AWS. + +One fatal issue that can occur is running out of IP addresses in your EKS cluster. (This generally happens due to misconfigurations that cause pods to keep being scheduled.) 
+ +You can monitor the `awscni` metrics using kube-prometheus with: +[embedmd]:# (../examples/eks-cni-example.jsonnet) +```jsonnet +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + prometheusRules+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } +``` + +After you have generated the required YAML file, please run: + +``` +kubectl apply -f manifests/prometheus-serviceMonitorAwsEksCNI.yaml +``` diff --git a/examples/eks-cni-example.jsonnet b/examples/eks-cni-example.jsonnet new file mode 100644 index 0000000000000000000000000000000000000000..df6ca07238bddc0b54a6d7019deebf8b7f566cf1 --- /dev/null +++ b/examples/eks-cni-example.jsonnet @@ -0,0 +1,26 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'kube-prometheus/kube-prometheus-eks.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + prometheusRules+:: { + groups+: [ + { + name: 'example-group', + rules: [ + { + record: 'aws_eks_available_ip', + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) 
(awscni_assigned_ip_addresses) < 10', + }, + ], + }, + ], + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } diff --git a/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet new file mode 100644 index 0000000000000000000000000000000000000000..74c50e2f5761bdf32ebdb235f953b42ff7ed8973 --- /dev/null +++ b/jsonnet/kube-prometheus/kube-prometheus-eks.libsonnet @@ -0,0 +1,65 @@ +local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; + +{ + prometheus+: { + AwsEksCniMetricService: + service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) + + service.mixin.spec.withClusterIp('None'), + serviceMonitorAwsEksCNI: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'awsekscni', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'eks-cni', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'aws-node', + }, + }, + namespaceSelector: { + matchNames: [ + 'kube-system', + ], + }, + endpoints: [ + { + port: 'cni-metrics-port', + interval: '30s', + path: '/metrics', + }, + ], 
+ }, + }, + }, + prometheusRules+: { + groups+: [ + { + name: 'kube-prometheus-eks.rules', + rules: [ + { + expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Instance {{ $labels.instance }} has less than 10 IPs available.' + }, + 'for': '10m', + alert: 'EksAvailableIPs' + }, + ], + }, + ], + }, +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index daa3bb8f951cb77534cf53fad450ca093f2cd4d0..3ce78152a8812d4be9e4ac077d920ae336930bc8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "fa972cf29666e821c44195c51df15b6e28ed29c4", + "version": "cbc1340af53f50728181f97f6bce442ac33d8993", "sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw=" }, { @@ -30,7 +30,7 @@ "subdir": "grafana-builder" } }, - "version": "1f273dd3c7a619bcd05c3e1c2650204104a273d8", + "version": "67ab3dc52f3cdbc3b29d30afd3261375b5ad13fd", "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" }, { @@ -83,8 +83,8 @@ "subdir": "docs/node-mixin" } }, - "version": "5a7b85876d6108a91f0d8673c0d7eca38687671b", - "sum": "3N77msMjqClzQHbZOxn4GTlV+FZpU+y1gCekvCvxwy0=" + "version": "20fe5bfb5be4caf3c8c11533b7fb35cb97d810f5", + "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg=" }, { "name": "prometheus", @@ -94,7 +94,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "74726367cf7a7e8d0332238defd2e7f4169030bd", + "version": "e94503ff5c412590ce7616accdd3c62a2189bcd3", "sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM=" }, { diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 9fcb6c5ca382cbe1efc1b158f87fb7eee604b04f..1ac6ae6621aa529ca2e83248dadc8935fa75835b 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -20280,7 +20280,7 @@ items: "steppedLine": false, "targets": 
[ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n sum without (instance) (instance:node_num_cpu:sum{job=\"node-exporter\"})\n)\n", + "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20366,7 +20366,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_load1_per_cpu:ratio{job=\"node-exporter\"})\n)\n", + "expr": "instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20464,7 +20464,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ ignoring (instance) group_left\n count without (instance) (instance:node_memory_utilisation:ratio{job=\"node-exporter\"})\n)\n", + "expr": "instance:node_memory_utilisation:ratio{job=\"node-exporter\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -20864,7 +20864,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"})\n)\n", + "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ 
scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -20950,7 +20950,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ ignoring (instance, device) group_left\n count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"})\n)\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -21048,7 +21048,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n ) \n/ ignoring (instance) group_left\n sum without (instance, device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n )\n) \n", + "expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\"})))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}",