From 26769e4c1b109390b50dcf09db1d83842bcd3c30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20K=C5=82obuszewski?= <danielmk@google.com>
Date: Wed, 9 Feb 2022 11:25:38 +0100
Subject: [PATCH] Expose nodes with unready GPU in CA status

This change simplifies debugging GPU issues: without it, all nodes can be
Ready as far as the Kubernetes API is concerned, but CA will still report
some of them as unready if they are missing a GPU resource. Explicitly
calling them out in the status ConfigMap points in the right direction.
---
 cluster-autoscaler/FAQ.md                     |  2 +-
 .../clusterstate/clusterstate.go              | 23 ++++++---
 .../customresources/gpu_processor.go          |  2 +-
 cluster-autoscaler/utils/kubernetes/ready.go  | 51 +++++++++++++++++--
 cluster-autoscaler/utils/taints/taints.go     |  2 +-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index 16722edb66..94ded9a4a8 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -836,7 +836,7 @@ Most likely it's due to a problem with the cluster. Steps to debug:
 
 * Check if cluster autoscaler is up and running. In version 0.5 and later, it periodically publishes the kube-system/cluster-autoscaler-status config map. Check last update time annotation. It should be no more than 3 min (usually 10 sec old).
 
-* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes.
+* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes. If some nodes appear unready despite being Ready in the Node object, check the `resourceUnready` count. If any nodes are marked as `resourceUnready`, the most likely cause is a device driver failing to install a new resource (e.g. GPU). The `resourceUnready` count is only available in CA version 1.24 and later.
 
 If both the cluster and CA appear healthy:
 
diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 7eaa2ce330..322d6df34c 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -537,6 +537,10 @@ type Readiness struct {
 	Unregistered int
 	// Time when the readiness was measured.
 	Time time.Time
+	// Number of nodes that are Unready due to missing resources.
+	// This field is only used for exposing information externally and
+	// doesn't influence CA behavior.
+	ResourceUnready int
 }
 
 func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
@@ -544,23 +548,26 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
 	perNodeGroup := make(map[string]Readiness)
 	total := Readiness{Time: currentTime}
 
-	update := func(current Readiness, node *apiv1.Node, ready bool) Readiness {
+	update := func(current Readiness, node *apiv1.Node, nr kube_util.NodeReadiness) Readiness {
 		current.Registered++
 		if deletetaint.HasToBeDeletedTaint(node) {
 			current.Deleted++
-		} else if ready {
+		} else if nr.Ready {
 			current.Ready++
 		} else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
 			current.NotStarted++
 		} else {
 			current.Unready++
+			if nr.Reason == kube_util.ResourceUnready {
+				current.ResourceUnready++
+			}
 		}
 		return current
 	}
 
 	for _, node := range csr.nodes {
 		nodeGroup, errNg := csr.cloudProvider.NodeGroupForNode(node)
-		ready, _, errReady := kube_util.GetReadinessState(node)
+		nr, errReady := kube_util.GetNodeReadiness(node)
 
 		// Node is most likely not autoscaled, however check the errors.
 		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
@@ -571,9 +578,9 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
 				klog.Warningf("Failed to get readiness info for %s: %v", node.Name, errReady)
 			}
 		} else {
 			perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, ready)
-			perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, ready)
+			perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, nr)
 		}
-		total = update(total, node, ready)
+		total = update(total, node, nr)
 	}
 
 	var longUnregisteredNodeNames []string
@@ -740,9 +747,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
 func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
 	condition := api.ClusterAutoscalerCondition{
 		Type: api.ClusterAutoscalerHealth,
-		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
+		Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
 			readiness.Ready,
 			readiness.Unready,
+			readiness.ResourceUnready,
 			readiness.NotStarted,
 			readiness.Registered,
 			readiness.LongUnregistered,
@@ -794,9 +802,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
 func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
 	condition := api.ClusterAutoscalerCondition{
 		Type: api.ClusterAutoscalerHealth,
-		Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
+		Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
 			readiness.Ready,
 			readiness.Unready,
+			readiness.ResourceUnready,
 			readiness.NotStarted,
 			readiness.Registered,
 			readiness.LongUnregistered,
diff --git a/cluster-autoscaler/processors/customresources/gpu_processor.go b/cluster-autoscaler/processors/customresources/gpu_processor.go
index cbe60e58a7..a694d3e5a3 100644
--- a/cluster-autoscaler/processors/customresources/gpu_processor.go
+++ b/cluster-autoscaler/processors/customresources/gpu_processor.go
@@ -49,7 +49,7 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 		if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
-			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
 		} else {
 			newReadyNodes = append(newReadyNodes, node)
 		}
diff --git a/cluster-autoscaler/utils/kubernetes/ready.go b/cluster-autoscaler/utils/kubernetes/ready.go
index 0bd5e6f99a..09175de422 100644
--- a/cluster-autoscaler/utils/kubernetes/ready.go
+++ b/cluster-autoscaler/utils/kubernetes/ready.go
@@ -23,6 +23,24 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 )
 
+// NodeNotReadyReason represents a reason for a node to be unready. While it is
+// simply a string on the node object, a custom type ensures no one accidentally
+// performs any string operation on variables of this type and allows them to
+// be treated as enums.
+type NodeNotReadyReason string
+
+const (
+	// ResourceUnready is a fake identifier used internally by Cluster Autoscaler
+	// to indicate nodes that appear Ready in the API, but are treated as
+	// still upcoming due to a missing resource (e.g. GPU).
+	ResourceUnready NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/resource-not-ready"
+
+	// IgnoreTaint is a fake identifier used internally by Cluster Autoscaler
+	// to indicate nodes that appear Ready in the API, but are treated as
+	// still upcoming due to an applied ignore taint.
+	IgnoreTaint NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/ignore-taint"
+)
+
 // IsNodeReadyAndSchedulable returns true if the node is ready and schedulable.
 func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
 	ready, _, _ := GetReadinessState(node)
@@ -36,10 +54,29 @@ func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
 	return true
 }
 
+// NodeReadiness represents the last known node readiness.
+type NodeReadiness struct {
+	// Is the node ready or not.
+	Ready bool
+	// Time of the last state transition related to readiness.
+	LastTransitionTime time.Time
+	// Reason for the node to be unready. Defined only when Ready is false.
+	Reason NodeNotReadyReason
+}
+
 // GetReadinessState gets readiness state for the node
+//
+// Deprecated: Use GetNodeReadiness instead.
 func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime time.Time, err error) {
+	nr, err := GetNodeReadiness(node)
+	return nr.Ready, nr.LastTransitionTime, err
+}
+
+// GetNodeReadiness gets readiness for the node
+func GetNodeReadiness(node *apiv1.Node) (NodeReadiness, error) {
 	canNodeBeReady, readyFound := true, false
-	lastTransitionTime = time.Time{}
+	lastTransitionTime := time.Time{}
+	var reason NodeNotReadyReason
 
 	for _, cond := range node.Status.Conditions {
 		switch cond.Type {
@@ -47,6 +84,7 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
 			readyFound = true
 			if cond.Status == apiv1.ConditionFalse || cond.Status == apiv1.ConditionUnknown {
 				canNodeBeReady = false
+				reason = NodeNotReadyReason(cond.Reason)
 			}
 			if lastTransitionTime.Before(cond.LastTransitionTime.Time) {
 				lastTransitionTime = cond.LastTransitionTime.Time
@@ -83,18 +121,23 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
 	}
 
 	if !readyFound {
-		return false, time.Time{}, fmt.Errorf("readiness information not found")
+		return NodeReadiness{}, fmt.Errorf("readiness information not found")
 	}
-	return canNodeBeReady, lastTransitionTime, nil
+	return NodeReadiness{
+		Ready:              canNodeBeReady,
+		LastTransitionTime: lastTransitionTime,
+		Reason:             reason,
+	}, nil
 }
 
 // GetUnreadyNodeCopy create a copy of the given node and override its NodeReady condition to False
-func GetUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
+func GetUnreadyNodeCopy(node *apiv1.Node, reason NodeNotReadyReason) *apiv1.Node {
 	newNode := node.DeepCopy()
 	newReadyCondition := apiv1.NodeCondition{
 		Type:               apiv1.NodeReady,
 		Status:             apiv1.ConditionFalse,
 		LastTransitionTime: node.CreationTimestamp,
+		Reason:             string(reason),
 	}
 	newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
 	for _, condition := range newNode.Status.Conditions {
diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go
index ff11c53c51..e4ac1ab433 100644
--- a/cluster-autoscaler/utils/taints/taints.go
+++ b/cluster-autoscaler/utils/taints/taints.go
@@ -115,7 +115,7 @@ func FilterOutNodesWithIgnoredTaints(ignoredTaints TaintKeySet, allNodes, readyN
 			_, hasIgnoredTaint := ignoredTaints[t.Key]
 			if hasIgnoredTaint || strings.HasPrefix(t.Key, IgnoreTaintPrefix) {
 				ready = false
-				nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+				nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.IgnoreTaint)
 				klog.V(3).Infof("Overriding status of node %v, which seems to have ignored taint %q", node.Name, t.Key)
 				break
 			}
-- 
GitLab
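
Reviewer note, not part of the patch: the sketch below is a minimal, self-contained Go example of how the new API pieces fit together. It assumes the post-patch signatures of GetUnreadyNodeCopy and GetNodeReadiness from k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes; the node object, its name, and the timestamps are invented for illustration. A processor fakes an unready copy of a seemingly Ready node with the ResourceUnready reason, and reading the readiness back yields the reason that clusterstate counts as resourceUnready in the status ConfigMap Health message (ready=... unready=... (resourceUnready=...)).

package main

import (
	"fmt"
	"time"

	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

func main() {
	// A node that looks Ready as far as the Kubernetes API is concerned.
	// The name and timestamps are made up for this illustration.
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:              "gpu-node-1",
			CreationTimestamp: metav1.NewTime(time.Now().Add(-20 * time.Minute)),
		},
		Status: apiv1.NodeStatus{
			Conditions: []apiv1.NodeCondition{
				{
					Type:               apiv1.NodeReady,
					Status:             apiv1.ConditionTrue,
					LastTransitionTime: metav1.NewTime(time.Now()),
				},
			},
		},
	}

	// A custom-resource processor that decides the node is missing its GPU
	// fakes an unready copy and records why, as gpu_processor.go now does.
	unready := kube_util.GetUnreadyNodeCopy(node, kube_util.ResourceUnready)

	// clusterstate later reads the readiness back; the ResourceUnready reason
	// is what bumps the resourceUnready counter in the status ConfigMap.
	nr, err := kube_util.GetNodeReadiness(unready)
	if err != nil {
		fmt.Println("failed to get readiness:", err)
		return
	}
	fmt.Printf("ready=%v reason=%s\n", nr.Ready, nr.Reason)
	// Prints: ready=false reason=cluster-autoscaler.kubernetes.io/resource-not-ready
}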