diff --git a/cluster-autoscaler/README.md b/cluster-autoscaler/README.md index 6eed4ebc95c53b7e3defef185a4fa37b27754ab3..ba96d94a9a22ef15b1c71035fad93833d0d79977 100644 --- a/cluster-autoscaler/README.md +++ b/cluster-autoscaler/README.md @@ -21,6 +21,9 @@ there is a big chance that it won't work as expected. # Notable changes: +CA Version 0.5.4: +* Fixes problems with node drain when pods are ignoring SIGTERM. + CA Version 0.5.3: * Fixes problems with pod anti-affinity in scale up https://github.com/kubernetes/autoscaler/issues/33. diff --git a/cluster-autoscaler/core/scale_down.go b/cluster-autoscaler/core/scale_down.go index ea514fd2c2b46668f4b26bdecbdbdb27a66df68b..73fa16e049624fc1d7a1176b8a3443573073dfe1 100644 --- a/cluster-autoscaler/core/scale_down.go +++ b/cluster-autoscaler/core/scale_down.go @@ -62,6 +62,9 @@ const ( MaxPodEvictionTime = 2 * time.Minute // EvictionRetryTime is the time after CA retries failed pod eviction. EvictionRetryTime = 10 * time.Second + // PodEvictionHeadroom is the extra time we wait to catch situations when the pod is ignoring SIGTERM and + // is killed with SIGKILL after MaxGracefulTerminationTime + PodEvictionHeadroom = 20 * time.Second ) // ScaleDown is responsible for maintaining the state needed to perform unneded node removals. @@ -431,9 +434,9 @@ func drainNode(node *apiv1.Node, pods []*apiv1.Pod, client kube_client.Interface return fmt.Errorf("Failed to drain node %s/%s, due to following errors: %v", node.Namespace, node.Name, evictionErrs) } - // Evictions created successfully, wait maxGratefulTerminationSec to see if nodes really disappeared + // Evictions created successfully, wait maxGratefulTerminationSec + PodEvictionHeadroom to see if pods really disappeared. allGone := true - for start := time.Now(); time.Now().Sub(start) < time.Duration(maxGratefulTerminationSec)*time.Second; time.Sleep(5 * time.Second) { + for start := time.Now(); time.Now().Sub(start) < time.Duration(maxGratefulTerminationSec)*time.Second+PodEvictionHeadroom; time.Sleep(5 * time.Second) { allGone = true for _, pod := range pods { podreturned, err := client.Core().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}) diff --git a/cluster-autoscaler/version.go b/cluster-autoscaler/version.go index c9f6815fbdcdbd3fbe5552ddcd37b94408fbd58a..b56e5b3937bbacc552738e1d3b3df5c2a6da0c96 100644 --- a/cluster-autoscaler/version.go +++ b/cluster-autoscaler/version.go @@ -17,4 +17,4 @@ limitations under the License. package main // ClusterAutoscalerVersion contains version of CA. -const ClusterAutoscalerVersion = "0.5.3" +const ClusterAutoscalerVersion = "0.5.4"