From 3ca86678ccfc7dfedfb49794ca072dff9a1b8983 Mon Sep 17 00:00:00 2001
From: Polina Bungina <27892524+hughcapet@users.noreply.github.com>
Date: Fri, 11 Oct 2024 17:11:46 +0200
Subject: [PATCH] Add major upgrade prechecks (#2772)

Skip the major version upgrade, without marking it as failed (the
failure annotation is not set), if any replica is not (yet) streaming
or its replication lag is too high.
---
 go.mod                             |  1 +
 go.sum                             |  2 ++
 pkg/cluster/majorversionupgrade.go | 41 +++++++++++++++++++++++++++---
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 69037040..d6390f45 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,7 @@ require (
 )
 
 require (
+	github.com/Masterminds/semver v1.5.0
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
 	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
diff --git a/go.sum b/go.sum
index d90bfdb5..c7992fea 100644
--- a/go.sum
+++ b/go.sum
@@ -1,3 +1,5 @@
+github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
+github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
 github.com/aws/aws-sdk-go v1.53.8 h1:eoqGb1WOHIrCFKo1d51cMcnt1ralfLFaEqRkC5Zzv8k=
diff --git a/pkg/cluster/majorversionupgrade.go b/pkg/cluster/majorversionupgrade.go
index 1c5a670e..e8876dc4 100644
--- a/pkg/cluster/majorversionupgrade.go
+++ b/pkg/cluster/majorversionupgrade.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"strings"
 
+	"github.com/Masterminds/semver"
 	"github.com/zalando/postgres-operator/pkg/spec"
 	"github.com/zalando/postgres-operator/pkg/util"
 	v1 "k8s.io/api/core/v1"
@@ -170,6 +171,38 @@ func (c *Cluster) majorVersionUpgrade() error {
 		return nil
 	}
 
+	members, err := c.patroni.GetClusterMembers(masterPod)
+	if err != nil {
+		c.logger.Error("could not get cluster members data from Patroni API, skipping major version upgrade")
+		return err
+	}
+	patroniData, err := c.patroni.GetMemberData(masterPod)
+	if err != nil {
+		c.logger.Error("could not get members data from Patroni API, skipping major version upgrade")
+		return err
+	}
+	patroniVer, err := semver.NewVersion(patroniData.Patroni.Version)
+	if err != nil {
+		c.logger.Error("error parsing Patroni version")
+		patroniVer, _ = semver.NewVersion("3.0.4")
+	}
+	verConstraint, _ := semver.NewConstraint(">= 3.0.4")
+	checkStreaming, _ := verConstraint.Validate(patroniVer)
+
+	for _, member := range members {
+		if PostgresRole(member.Role) == Leader {
+			continue
+		}
+		if checkStreaming && member.State != "streaming" {
+			c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
+			return nil
+		}
+		if member.Lag > 16*1024*1024 {
+			c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
+			return nil
+		}
+	}
+
 	isUpgradeSuccess := true
 	numberOfPods := len(pods)
 	if allRunning && masterPod != nil {
@@ -187,19 +220,21 @@ func (c *Cluster) majorVersionUpgrade() error {
 			}
 
 			resultIdCheck = strings.TrimSuffix(resultIdCheck, "\n")
-			var result string
+			var result, scriptErrMsg string
 			if resultIdCheck != "0" {
 				c.logger.Infof("user id was identified as: %s, hence default user is non-root already", resultIdCheck)
 				result, err = c.ExecCommand(podName, "/bin/bash", "-c", upgradeCommand)
+				scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
 			} else {
 				c.logger.Infof("user id was identified as: %s, using su to reach the postgres user", resultIdCheck)
 				result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
+				scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
 			}
 			if err != nil {
 				isUpgradeSuccess = false
 				c.annotatePostgresResource(isUpgradeSuccess)
-				c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err)
-				return err
+				c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, scriptErrMsg)
+				return fmt.Errorf(scriptErrMsg)
 			}
 
 			c.annotatePostgresResource(isUpgradeSuccess)
-- 
GitLab