From b0cfeb30ead0abbd2f347b5f04d01ba60cf268a3 Mon Sep 17 00:00:00 2001
From: Polina Bungina <27892524+hughcapet@users.noreply.github.com>
Date: Thu, 23 Jan 2025 18:35:33 +0300
Subject: [PATCH] Partially revert #2810 (#2849)

Only schedule switchover for pod migration, consider maintenance window for
PGVERSION env change
---
 docs/administrator.md              |  3 ---
 docs/reference/cluster_manifest.md |  2 +-
 e2e/tests/test_e2e.py              | 21 +++++++++--------
 pkg/cluster/cluster.go             | 38 ++++++++++++++++++------------
 pkg/cluster/pod.go                 | 11 ++++++---
 pkg/cluster/sync.go                | 10 ++++----
 6 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/docs/administrator.md b/docs/administrator.md
index 9f8e8657..55abebc8 100644
--- a/docs/administrator.md
+++ b/docs/administrator.md
@@ -208,9 +208,6 @@ Note that, changes in `SPILO_CONFIGURATION` env variable under `bootstrap.dcs`
 path are ignored for the diff. They will be applied through Patroni's rest api
 interface, following a restart of all instances.
 
-Rolling update is postponed until the next maintenance window if any is defined
-under the `maintenanceWindows` cluster manifest parameter.
-
 The operator also supports lazy updates of the Spilo image. In this case the
 StatefulSet is only updated, but no rolling update follows. This feature saves
 you a switchover - and hence downtime - when you know pods are re-started later
diff --git a/docs/reference/cluster_manifest.md b/docs/reference/cluster_manifest.md
index d45bc094..19ea8d77 100644
--- a/docs/reference/cluster_manifest.md
+++ b/docs/reference/cluster_manifest.md
@@ -116,7 +116,7 @@ These parameters are grouped directly under the `spec` key in the manifest.
 
 * **maintenanceWindows**
   a list which defines specific time frames when certain maintenance operations
-  such as automatic major upgrades or rolling updates are allowed. Accepted formats
+  such as automatic major upgrades or master pod migration are allowed. Accepted formats
   are "01:00-06:00" for daily maintenance windows or "Sat:00:00-04:00" for
   specific days, with all times in UTC.
 
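The two `maintenanceWindows` formats quoted above ("01:00-06:00" for daily windows, "Sat:00:00-04:00" for specific days, both UTC) drive the in-window check that the rest of this patch relies on. Below is a minimal, self-contained sketch of such a check; it is not the operator's actual isInMaintenanceWindow implementation (which is not part of this diff), and the helper name and strict HH:MM parsing are assumptions:

    package main

    import (
        "fmt"
        "strings"
        "time"
    )

    // inWindow reports whether t falls inside a window given either as
    // "01:00-06:00" (daily) or "Sat:00:00-04:00" (day-specific), in UTC.
    func inWindow(window string, t time.Time) (bool, error) {
        span := window
        // a three-letter prefix before the first colon is a weekday selector
        if parts := strings.SplitN(window, ":", 2); len(parts) == 2 && len(parts[0]) == 3 {
            if t.Format("Mon") != parts[0] {
                return false, nil
            }
            span = parts[1]
        }
        bounds := strings.Split(span, "-")
        if len(bounds) != 2 {
            return false, fmt.Errorf("expected HH:MM-HH:MM, got %q", span)
        }
        from, err := time.Parse("15:04", bounds[0])
        if err != nil {
            return false, err
        }
        to, err := time.Parse("15:04", bounds[1])
        if err != nil {
            return false, err
        }
        // compare wall-clock time only, ignoring the date
        cur, _ := time.Parse("15:04", t.Format("15:04"))
        return !cur.Before(from) && !cur.After(to), nil
    }

    func main() {
        ok, err := inWindow("01:00-06:00", time.Now().UTC())
        fmt.Println(ok, err)
    }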
diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py
index 4743bb4b..b29fdae7 100644
--- a/e2e/tests/test_e2e.py
+++ b/e2e/tests/test_e2e.py
@@ -1187,7 +1187,7 @@ class EndToEndTestCase(unittest.TestCase):
         Test major version upgrade: with full upgrade, maintenance window, and annotation
         """
         def check_version():
-            p = k8s.patroni_rest("acid-upgrade-test-0", "")
+            p = k8s.patroni_rest("acid-upgrade-test-0", "") or {}
             version = p.get("server_version", 0) // 10000
             return version
 
@@ -1237,7 +1237,7 @@ class EndToEndTestCase(unittest.TestCase):
         # should not upgrade because current time is not in maintenanceWindow
         current_time = datetime.now()
         maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
-        pg_patch_version_15 = {
+        pg_patch_version_15_outside_mw = {
             "spec": {
                 "postgresql": {
                     "version": "15"
@@ -1248,7 +1248,7 @@ class EndToEndTestCase(unittest.TestCase):
             }
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_outside_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         # no pod replacement outside of the maintenance window
@@ -1259,12 +1259,12 @@ class EndToEndTestCase(unittest.TestCase):
         second_annotations = get_annotations()
         self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
 
-        # change the version again to trigger operator sync
+        # change maintenanceWindows to current
         maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
-        pg_patch_version_16 = {
+        pg_patch_version_15_in_mw = {
             "spec": {
                 "postgresql": {
-                    "version": "16"
+                    "version": "15"
                 },
                 "maintenanceWindows": [
                     maintenance_window_current
@@ -1273,13 +1273,13 @@ class EndToEndTestCase(unittest.TestCase):
                 ]
             }
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
-        self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
+        self.eventuallyEqual(check_version, 15, "Version should be upgraded from 14 to 15")
 
         # check if annotation for last upgrade's success is updated after second upgrade
         third_annotations = get_annotations()
@@ -1306,16 +1306,17 @@ class EndToEndTestCase(unittest.TestCase):
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
-        self.eventuallyEqual(check_version, 16, "Version should not be upgraded because annotation for last upgrade's failure is set")
+        self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
 
         # change the version back to 15 and should remove failure annotation
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 15, "Version should not be upgraded from 15")
 
         fourth_annotations = get_annotations()
         self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")
 
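For orientation, the f-strings in the test above only format clock times relative to "now", so that one window excludes the current time (upgrade must be postponed) and the other contains it (upgrade may proceed). A hypothetical Go equivalent of that window construction; note the Python test uses the server's local time via datetime.now(), so the helper below mirrors that rather than forcing UTC:

    package main

    import (
        "fmt"
        "time"
    )

    // windowAround builds an "HH:MM-HH:MM" string offset from a reference time,
    // mirroring the maintenance_window_future/_current values in the test.
    func windowAround(ref time.Time, start, end time.Duration) string {
        return fmt.Sprintf("%s-%s", ref.Add(start).Format("15:04"), ref.Add(end).Format("15:04"))
    }

    func main() {
        now := time.Now()
        // window that excludes "now": the version bump must be postponed
        fmt.Println(windowAround(now, 60*time.Minute, 120*time.Minute))
        // window that contains "now": the version bump may be applied
        fmt.Println(windowAround(now, -30*time.Minute, 30*time.Minute))
    }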
diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go
index f835eaa0..55355cf3 100644
--- a/pkg/cluster/cluster.go
+++ b/pkg/cluster/cluster.go
@@ -957,6 +957,11 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
 	defer c.mu.Unlock()
 
 	c.KubeClient.SetPostgresCRDStatus(c.clusterName(), acidv1.ClusterStatusUpdating)
+
+	if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
+		// do not apply any major version related changes yet
+		newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
+	}
 	c.setSpec(newSpec)
 
 	defer func() {
@@ -1761,28 +1766,28 @@ func (c *Cluster) GetSwitchoverSchedule() string {
 }
 
 // Switchover does a switchover (via Patroni) to a candidate pod
-func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
+func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName, scheduled bool) error {
 	var err error
-	c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
-
-	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
-		c.logger.Infof("postponing switchover, not in maintenance window")
-		schedule := c.GetSwitchoverSchedule()
-
-		if err := c.patroni.Switchover(curMaster, candidate.Name, schedule); err != nil {
-			return fmt.Errorf("could not schedule switchover: %v", err)
-		}
-		c.logger.Infof("switchover is scheduled at %s", schedule)
-		return nil
-	}
-
-	c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
 	stopCh := make(chan struct{})
 	ch := c.registerPodSubscriber(candidate)
 	defer c.unregisterPodSubscriber(candidate)
 	defer close(stopCh)
 
-	if err = c.patroni.Switchover(curMaster, candidate.Name, ""); err == nil {
+	var scheduled_at string
+	if scheduled {
+		scheduled_at = c.GetSwitchoverSchedule()
+	} else {
+		c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
+		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
+		scheduled_at = ""
+	}
+
+	if err = c.patroni.Switchover(curMaster, candidate.Name, scheduled_at); err == nil {
+		if scheduled {
+			c.logger.Infof("switchover from %q to %q is scheduled at %s", curMaster.Name, candidate, scheduled_at)
+			return nil
+		}
 		c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
 		_, err = c.waitForPodLabel(ch, stopCh, nil)
@@ -1790,6 +1795,9 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) e
 			err = fmt.Errorf("could not get master pod label: %v", err)
 		}
 	} else {
+		if scheduled {
+			return fmt.Errorf("could not schedule switchover: %v", err)
+		}
 		err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
 	}
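The net effect of the rewritten Switchover is that callers now decide whether a switchover is immediate or scheduled, instead of the function consulting the maintenance windows itself; a scheduled switchover hands a target timestamp to Patroni and returns without waiting for the master pod label to move. A toy model of that branching, using stub names and fmt output rather than the operator's real types:

    package main

    import (
        "fmt"
        "time"
    )

    // switchover mimics the branching introduced above: when scheduled is true,
    // a timestamp is passed to Patroni and the function returns early;
    // otherwise the switchover runs immediately and is waited on.
    func switchover(curMaster, candidate string, scheduled bool) error {
        scheduledAt := ""
        if scheduled {
            // stand-in for GetSwitchoverSchedule(): next maintenance window start
            scheduledAt = time.Now().UTC().Add(time.Hour).Format(time.RFC3339)
        }
        // stand-in for c.patroni.Switchover(curMaster, candidate, scheduledAt)
        if scheduledAt != "" {
            fmt.Printf("switchover from %s to %s scheduled at %s\n", curMaster, candidate, scheduledAt)
            return nil // no pod-label wait for a scheduled switchover
        }
        fmt.Printf("switching over from %s to %s now\n", curMaster, candidate)
        // the real code now waits for the master label to appear on the candidate
        return nil
    }

    func main() {
        inWindow := false // e.g. the result of isInMaintenanceWindow(...)
        _ = switchover("acid-test-0", "acid-test-1", !inWindow)
    }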
switchover: %v", err) + } err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err) } diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go index bd2172c1..7fc95090 100644 --- a/pkg/cluster/pod.go +++ b/pkg/cluster/pod.go @@ -280,11 +280,16 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error { return fmt.Errorf("could not move pod: %v", err) } + scheduleSwitchover := false + if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) { + c.logger.Infof("postponing switchover, not in maintenance window") + scheduleSwitchover = true + } err = retryutil.Retry(1*time.Minute, 5*time.Minute, func() (bool, error) { - err := c.Switchover(oldMaster, masterCandidateName) + err := c.Switchover(oldMaster, masterCandidateName, scheduleSwitchover) if err != nil { - c.logger.Errorf("could not failover to pod %q: %v", masterCandidateName, err) + c.logger.Errorf("could not switchover to pod %q: %v", masterCandidateName, err) return false, nil } return true, nil @@ -445,7 +450,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp // do not recreate master now so it will keep the update flag and switchover will be retried on next sync return fmt.Errorf("skipping switchover: %v", err) } - if err := c.Switchover(masterPod, masterCandidate); err != nil { + if err := c.Switchover(masterPod, masterCandidate, false); err != nil { return fmt.Errorf("could not perform switch over: %v", err) } } else if newMasterPod == nil && len(replicas) == 0 { diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index 32aae605..dada0499 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -97,6 +97,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error { } } + if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) { + // do not apply any major version related changes yet + newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion + } + if err = c.syncStatefulSet(); err != nil { if !k8sutil.ResourceAlreadyExists(err) { err = fmt.Errorf("could not sync statefulsets: %v", err) @@ -658,11 +663,6 @@ func (c *Cluster) syncStatefulSet() error { isSafeToRecreatePods = false } - if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) { - postponeReasons = append(postponeReasons, "not in maintenance window") - isSafeToRecreatePods = false - } - // if we get here we also need to re-create the pods (either leftovers from the old // statefulset or those that got their configuration from the outdated statefulset) if len(podsToRecreate) > 0 { -- GitLab