From d2569dedb4b0e8b59fe6e5009c21317c223ff486 Mon Sep 17 00:00:00 2001
From: Connor Kuehl <connor.kuehl@suse.com>
Date: Tue, 20 Jun 2023 13:27:57 -0500
Subject: [PATCH] Allow Plan to specify Job deadline

If a workload is known to be slow in excess of the
system-upgrade-controller deployment's default deadline, allow the Plan
creator to specify a job deadline with the plan so that the entailing
job may run to completion.

Signed-off-by: Connor Kuehl <connor.kuehl@suse.com>
---
 pkg/apis/upgrade.cattle.io/v1/types.go |  7 ++-
 pkg/upgrade/job/job.go                 | 27 ++++++++-
 pkg/upgrade/job/job_suite_test.go      | 78 ++++++++++++++++++++++++++
 3 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/pkg/apis/upgrade.cattle.io/v1/types.go b/pkg/apis/upgrade.cattle.io/v1/types.go
index e0b53116..85883e11 100644
--- a/pkg/apis/upgrade.cattle.io/v1/types.go
+++ b/pkg/apis/upgrade.cattle.io/v1/types.go
@@ -33,9 +33,10 @@ type Plan struct {
 
 // PlanSpec represents the user-configurable details of a Plan.
 type PlanSpec struct {
-	Concurrency        int64                 `json:"concurrency,omitempty"`
-	NodeSelector       *metav1.LabelSelector `json:"nodeSelector,omitempty"`
-	ServiceAccountName string                `json:"serviceAccountName,omitempty"`
+	Concurrency           int64                 `json:"concurrency,omitempty"`
+	JobActiveDeadlineSecs int64                 `json:"jobActiveDeadlineSecs,omitempty"`
+	NodeSelector          *metav1.LabelSelector `json:"nodeSelector,omitempty"`
+	ServiceAccountName    string                `json:"serviceAccountName,omitempty"`
 
 	Channel string       `json:"channel,omitempty"`
 	Version string       `json:"version,omitempty"`
diff --git a/pkg/upgrade/job/job.go b/pkg/upgrade/job/job.go
index 3b3ff330..dca975bc 100644
--- a/pkg/upgrade/job/job.go
+++ b/pkg/upgrade/job/job.go
@@ -41,6 +41,17 @@ var (
 		return defaultValue
 	}(defaultActiveDeadlineSeconds)
 
+	ActiveDeadlineSecondsMax = func(defaultValue int64) int64 {
+		if str, ok := os.LookupEnv("SYSTEM_UPGRADE_JOB_ACTIVE_DEADLINE_SECONDS_MAX"); ok {
+			if i, err := strconv.ParseInt(str, 10, 64); err != nil {
+				logrus.Errorf("failed to parse $%s: %v", "SYSTEM_UPGRADE_JOB_ACTIVE_DEADLINE_SECONDS_MAX", err)
+			} else {
+				return i
+			}
+		}
+		return defaultValue
+	}(0 /* no maximum */)
+
 	BackoffLimit = func(defaultValue int32) int32 {
 		if str, ok := os.LookupEnv("SYSTEM_UPGRADE_JOB_BACKOFF_LIMIT"); ok {
 			if i, err := strconv.ParseInt(str, 10, 32); err != nil {
@@ -327,8 +338,20 @@ func New(plan *upgradeapiv1.Plan, node *corev1.Node, controllerName string) *bat
 		),
 	}
 
-	if ActiveDeadlineSeconds > 0 {
-		job.Spec.ActiveDeadlineSeconds = &ActiveDeadlineSeconds
+	activeDeadlineSeconds := ActiveDeadlineSeconds
+
+	if plan.Spec.JobActiveDeadlineSecs > 0 {
+		activeDeadlineSeconds = plan.Spec.JobActiveDeadlineSecs
+	}
+
+	// If configured with a maximum deadline via "SYSTEM_UPGRADE_JOB_ACTIVE_DEADLINE_SECONDS_MAX",
+	// clamp the Plan's given deadline to the maximum.
+	if ActiveDeadlineSecondsMax > 0 && activeDeadlineSeconds > ActiveDeadlineSecondsMax {
+		activeDeadlineSeconds = ActiveDeadlineSecondsMax
+	}
+
+	if activeDeadlineSeconds > 0 {
+		job.Spec.ActiveDeadlineSeconds = &activeDeadlineSeconds
 		if drain != nil && drain.Timeout != nil && drain.Timeout.Milliseconds() > ActiveDeadlineSeconds*1000 {
 			logrus.Warnf("drain timeout exceeds active deadline seconds")
 		}
diff --git a/pkg/upgrade/job/job_suite_test.go b/pkg/upgrade/job/job_suite_test.go
index 8ec2e6ed..183fc2c6 100644
--- a/pkg/upgrade/job/job_suite_test.go
+++ b/pkg/upgrade/job/job_suite_test.go
@@ -5,9 +5,87 @@ import (
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+
+	upgradev1 "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io/v1"
+	sucjob "github.com/rancher/system-upgrade-controller/pkg/upgrade/job"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 func TestJob(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "Job Suite")
 }
+
+var _ = Describe("Jobs", func() {
+	var plan *upgradev1.Plan
+	var node *corev1.Node
+
+	BeforeEach(func() {
+		plan = &upgradev1.Plan{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "test-1",
+				Namespace: "default",
+			},
+			Spec: upgradev1.PlanSpec{
+				Concurrency:        1,
+				ServiceAccountName: "system-upgrade-controller-foo",
+				Upgrade: &upgradev1.ContainerSpec{
+					Image: "test-image:latest",
+				},
+			},
+		}
+
+		node = &corev1.Node{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "prod.test.local",
+			},
+		}
+	})
+
+	Describe("Setting the batchv1.Job ActiveDeadlineSeconds field", func() {
+		Context("When the Plan has a positive non-zero value for deadline", func() {
+			It("Constructs the batchv1.Job with the Plan's given value", func() {
+				plan.Spec.JobActiveDeadlineSecs = 12345
+				job := sucjob.New(plan, node, "foo")
+				Expect(*job.Spec.ActiveDeadlineSeconds).To(Equal(int64(12345)))
+			})
+		})
+
+		Context("When the Plan has a zero-value given as its deadline", func() {
+			It("Constructs the batchv1.Job with a global default", func() {
+				oldActiveDeadlineSeconds := sucjob.ActiveDeadlineSeconds
+				sucjob.ActiveDeadlineSeconds = 300
+				defer func() { sucjob.ActiveDeadlineSeconds = oldActiveDeadlineSeconds }()
+
+				plan.Spec.JobActiveDeadlineSecs = 0
+				job := sucjob.New(plan, node, "bar")
+				Expect(*job.Spec.ActiveDeadlineSeconds).To(Equal(int64(300)))
+			})
+		})
+
+		Context("When the Plan has a negative value given as its deadline", func() {
+			It("Constructs the batchv1.Job with a global default", func() {
+				oldActiveDeadlineSeconds := sucjob.ActiveDeadlineSeconds
+				sucjob.ActiveDeadlineSeconds = 3600
+				defer func() { sucjob.ActiveDeadlineSeconds = oldActiveDeadlineSeconds }()
+
+				plan.Spec.JobActiveDeadlineSecs = -1
+				job := sucjob.New(plan, node, "baz")
+				Expect(*job.Spec.ActiveDeadlineSeconds).To(Equal(int64(3600)))
+			})
+		})
+
+		Context("When cluster has a maximum deadline and the Plan deadline exceeds that value", func() {
+			It("Constructs the batchv1.Job with the cluster's maximum deadline value", func() {
+				oldActiveDeadlineSecondsMax := sucjob.ActiveDeadlineSecondsMax
+				sucjob.ActiveDeadlineSecondsMax = 300
+				defer func() { sucjob.ActiveDeadlineSecondsMax = oldActiveDeadlineSecondsMax }()
+
+				plan.Spec.JobActiveDeadlineSecs = 600
+				job := sucjob.New(plan, node, "foobar")
+				Expect(*job.Spec.ActiveDeadlineSeconds).To(Equal(int64(300)))
+			})
+		})
+	})
+})
-- 
GitLab