diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index 8317293556e0c800dc28422380cfeb4348d20156..db649c5e9df5aa02c1a074769194284dc850b98c 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -843,7 +843,7 @@ Most likely it's due to a problem with the cluster. Steps to debug:
 
 * Check if cluster autoscaler is up and running. In version 0.5 and later, it periodically publishes the kube-system/cluster-autoscaler-status config map. Check last update time annotation. It should be no more than 3 min (usually 10 sec old).
 
-* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes.
+* Check in the above config map if cluster and node groups are in the healthy state. If not, check if there are unready nodes. If some nodes appear unready despite being Ready in the Node object, check the `resourceUnready` count. If any nodes are marked as `resourceUnready`, it is most likely a problem with the device driver failing to install a new resource (e.g. GPU). The `resourceUnready` count is only available in CA version 1.24 and later.
 
 If both the cluster and CA appear healthy:
 
diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 7eaa2ce330a4a4c5b725fd0fd728e7cd93517348..322d6df34c5f9af28b15ea1fabc020562db7aa6a 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -537,6 +537,10 @@ type Readiness struct {
     Unregistered int
     // Time when the readiness was measured.
     Time time.Time
+    // Number of nodes that are Unready due to missing resources.
+    // This field is only used for exposing information externally and
+    // doesn't influence CA behavior.
+    ResourceUnready int
 }
 
 func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
@@ -544,23 +548,26 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
     perNodeGroup := make(map[string]Readiness)
     total := Readiness{Time: currentTime}
 
-    update := func(current Readiness, node *apiv1.Node, ready bool) Readiness {
+    update := func(current Readiness, node *apiv1.Node, nr kube_util.NodeReadiness) Readiness {
         current.Registered++
         if deletetaint.HasToBeDeletedTaint(node) {
             current.Deleted++
-        } else if ready {
+        } else if nr.Ready {
             current.Ready++
         } else if node.CreationTimestamp.Time.Add(MaxNodeStartupTime).After(currentTime) {
             current.NotStarted++
         } else {
             current.Unready++
+            if nr.Reason == kube_util.ResourceUnready {
+                current.ResourceUnready++
+            }
         }
         return current
     }
 
     for _, node := range csr.nodes {
         nodeGroup, errNg := csr.cloudProvider.NodeGroupForNode(node)
-        ready, _, errReady := kube_util.GetReadinessState(node)
+        nr, errReady := kube_util.GetNodeReadiness(node)
 
         // Node is most likely not autoscaled, however check the errors.
         if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
@@ -571,9 +578,9 @@ func (csr *ClusterStateRegistry) updateReadinessStats(currentTime time.Time) {
                 klog.Warningf("Failed to get readiness info for %s: %v", node.Name, errReady)
             }
         } else {
-            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, ready)
+            perNodeGroup[nodeGroup.Id()] = update(perNodeGroup[nodeGroup.Id()], node, nr)
         }
-        total = update(total, node, ready)
+        total = update(total, node, nr)
     }
 
     var longUnregisteredNodeNames []string
@@ -740,9 +747,10 @@ func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness {
 func buildHealthStatusNodeGroup(isReady bool, readiness Readiness, acceptable AcceptableRange, minSize, maxSize int) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type:    api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d cloudProviderTarget=%d (minSize=%d, maxSize=%d)",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
@@ -794,9 +802,10 @@ func buildScaleDownStatusNodeGroup(candidates []string, lastProbed time.Time) ap
 func buildHealthStatusClusterwide(isReady bool, readiness Readiness) api.ClusterAutoscalerCondition {
     condition := api.ClusterAutoscalerCondition{
         Type:    api.ClusterAutoscalerHealth,
-        Message: fmt.Sprintf("ready=%d unready=%d notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
+        Message: fmt.Sprintf("ready=%d unready=%d (resourceUnready=%d) notStarted=%d longNotStarted=0 registered=%d longUnregistered=%d",
             readiness.Ready,
             readiness.Unready,
+            readiness.ResourceUnready,
             readiness.NotStarted,
             readiness.Registered,
             readiness.LongUnregistered,
diff --git a/cluster-autoscaler/processors/customresources/gpu_processor.go b/cluster-autoscaler/processors/customresources/gpu_processor.go
index cbe60e58a7819a60f412c3c543e6330c6c36f769..a694d3e5a3a4aec513806c5c31b37f0933689186 100644
--- a/cluster-autoscaler/processors/customresources/gpu_processor.go
+++ b/cluster-autoscaler/processors/customresources/gpu_processor.go
@@ -49,7 +49,7 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 
         if hasGpuLabel && (!hasGpuAllocatable || gpuAllocatable.IsZero()) {
             klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU", node.Name)
-            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+            nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
         } else {
             newReadyNodes = append(newReadyNodes, node)
         }
diff --git a/cluster-autoscaler/utils/kubernetes/ready.go b/cluster-autoscaler/utils/kubernetes/ready.go
index 0bd5e6f99a07cc0fb6729501d37883b4d50e6ec6..09175de42297fc828705b8fe31843907150522d5 100644
--- a/cluster-autoscaler/utils/kubernetes/ready.go
+++ b/cluster-autoscaler/utils/kubernetes/ready.go
@@ -23,6 +23,24 @@ import (
     apiv1 "k8s.io/api/core/v1"
 )
 
+// NodeNotReadyReason represents a reason for a node to be unready. While it is
+// simply a string on the node object, a custom type ensures no one accidentally
+// performs any string operation on variables of this type and allows them to
+// be treated as enums.
+type NodeNotReadyReason string
+
+const (
+    // ResourceUnready is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to a missing resource (e.g. GPU).
+    ResourceUnready NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/resource-not-ready"
+
+    // IgnoreTaint is a fake identifier used internally by Cluster Autoscaler
+    // to indicate nodes that appear Ready in the API, but are treated as
+    // still upcoming due to applied ignore taint.
+    IgnoreTaint NodeNotReadyReason = "cluster-autoscaler.kubernetes.io/ignore-taint"
+)
+
 // IsNodeReadyAndSchedulable returns true if the node is ready and schedulable.
 func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     ready, _, _ := GetReadinessState(node)
@@ -36,10 +54,29 @@ func IsNodeReadyAndSchedulable(node *apiv1.Node) bool {
     return true
 }
 
+// NodeReadiness represents the last known node readiness.
+type NodeReadiness struct {
+    // Is the node ready or not.
+    Ready bool
+    // Time of the last state transition related to readiness.
+    LastTransitionTime time.Time
+    // Reason for the node to be unready. Defined only when Ready is false.
+    Reason NodeNotReadyReason
+}
+
 // GetReadinessState gets readiness state for the node
+//
+// Deprecated: Use GetNodeReadiness instead.
 func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime time.Time, err error) {
+    nr, err := GetNodeReadiness(node)
+    return nr.Ready, nr.LastTransitionTime, err
+}
+
+// GetNodeReadiness gets readiness for the node
+func GetNodeReadiness(node *apiv1.Node) (NodeReadiness, error) {
     canNodeBeReady, readyFound := true, false
-    lastTransitionTime = time.Time{}
+    lastTransitionTime := time.Time{}
+    var reason NodeNotReadyReason
 
     for _, cond := range node.Status.Conditions {
         switch cond.Type {
@@ -47,6 +84,7 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
             readyFound = true
             if cond.Status == apiv1.ConditionFalse || cond.Status == apiv1.ConditionUnknown {
                 canNodeBeReady = false
+                reason = NodeNotReadyReason(cond.Reason)
             }
             if lastTransitionTime.Before(cond.LastTransitionTime.Time) {
                 lastTransitionTime = cond.LastTransitionTime.Time
@@ -83,18 +121,23 @@ func GetReadinessState(node *apiv1.Node) (isNodeReady bool, lastTransitionTime t
     }
 
     if !readyFound {
-        return false, time.Time{}, fmt.Errorf("readiness information not found")
+        return NodeReadiness{}, fmt.Errorf("readiness information not found")
     }
-    return canNodeBeReady, lastTransitionTime, nil
+    return NodeReadiness{
+        Ready:              canNodeBeReady,
+        LastTransitionTime: lastTransitionTime,
+        Reason:             reason,
+    }, nil
 }
 
 // GetUnreadyNodeCopy create a copy of the given node and override its NodeReady condition to False
-func GetUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
+func GetUnreadyNodeCopy(node *apiv1.Node, reason NodeNotReadyReason) *apiv1.Node {
     newNode := node.DeepCopy()
     newReadyCondition := apiv1.NodeCondition{
         Type:               apiv1.NodeReady,
         Status:             apiv1.ConditionFalse,
         LastTransitionTime: node.CreationTimestamp,
+        Reason:             string(reason),
     }
     newNodeConditions := []apiv1.NodeCondition{newReadyCondition}
     for _, condition := range newNode.Status.Conditions {
diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go
index ff11c53c516d03db8f7b3056cd675dec52c216c4..e4ac1ab433416577e333bc608ef7c78d1bac5a2b 100644
--- a/cluster-autoscaler/utils/taints/taints.go
+++ b/cluster-autoscaler/utils/taints/taints.go
@@ -115,7 +115,7 @@ func FilterOutNodesWithIgnoredTaints(ignoredTaints TaintKeySet, allNodes, readyN
             _, hasIgnoredTaint := ignoredTaints[t.Key]
             if hasIgnoredTaint || strings.HasPrefix(t.Key, IgnoreTaintPrefix) {
                 ready = false
-                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node)
+                nodesWithIgnoredTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.IgnoreTaint)
                 klog.V(3).Infof("Overriding status of node %v, which seems to have ignored taint %q", node.Name, t.Key)
                 break
             }
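
Illustration (not part of the change): a minimal sketch of how the new readiness API fits together. It assumes the `k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes` import path and a hypothetical `countReadiness` helper, and it mirrors the tallying done by the `update` closure in `updateReadinessStats` while deliberately skipping the deletion-taint and `MaxNodeStartupTime` branches handled there.

```go
package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
)

// countReadiness is a hypothetical helper (not part of this PR) that tallies
// nodes the same way the update closure in updateReadinessStats does, minus
// the deletion-taint and startup-time cases.
func countReadiness(nodes []*apiv1.Node) (ready, unready, resourceUnready int) {
	for _, node := range nodes {
		nr, err := kube_util.GetNodeReadiness(node)
		if err != nil {
			// No Ready condition found on the node; count it as unready here.
			unready++
			continue
		}
		if nr.Ready {
			ready++
			continue
		}
		unready++
		// Reason is only defined when Ready is false.
		if nr.Reason == kube_util.ResourceUnready {
			resourceUnready++
		}
	}
	return
}

func main() {
	// A node that reports Ready in the API.
	node := &apiv1.Node{}
	node.Status.Conditions = []apiv1.NodeCondition{
		{Type: apiv1.NodeReady, Status: apiv1.ConditionTrue},
	}

	// GetUnreadyNodeCopy stamps the fake reason into the overridden NodeReady
	// condition, which GetNodeReadiness later surfaces as nr.Reason.
	unreadyCopy := kube_util.GetUnreadyNodeCopy(node, kube_util.ResourceUnready)

	ready, unready, resourceUnready := countReadiness([]*apiv1.Node{node, unreadyCopy})
	fmt.Printf("ready=%d unready=%d (resourceUnready=%d)\n", ready, unready, resourceUnready)
}
```

With the two nodes above this prints `ready=1 unready=1 (resourceUnready=1)`, the same shape used by the updated health-status messages in clusterstate.go.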