From acdb957d8ebf6ba784f72cd545af40def6973791 Mon Sep 17 00:00:00 2001
From: fahed dorgaa <fahed.dorgaa@gmail.com>
Date: Fri, 1 Nov 2024 17:06:20 +0100
Subject: [PATCH] fix switchover candidate retrieval (#2760)

* fix switchover candidate retrieval

Signed-off-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com>

---------

Signed-off-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com>
Co-authored-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com>
Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
---
 pkg/cluster/pod.go      | 33 +++++++++++++++------------------
 pkg/cluster/pod_test.go |  4 ++--
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go
index 890b6012..bd2172c1 100644
--- a/pkg/cluster/pod.go
+++ b/pkg/cluster/pod.go
@@ -480,6 +480,9 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
 				if PostgresRole(member.Role) == SyncStandby {
 					syncCandidates = append(syncCandidates, member)
 				}
+				if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && slices.Contains([]string{"running", "streaming", "in archive recovery"}, member.State) {
+					candidates = append(candidates, member)
+				}
 			}
 
 			// if synchronous mode is enabled and no SyncStandy was found
@@ -489,6 +492,12 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
 				return false, nil
 			}
 
+			// retry also in asynchronous mode when no replica candidate was found
+			if !c.Spec.Patroni.SynchronousMode && len(candidates) == 0 {
+				c.logger.Warnf("no replica candidate found - retrying fetching cluster members")
+				return false, nil
+			}
+
 			return true, nil
 		},
 	)
@@ -502,24 +511,12 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
 			return syncCandidates[i].Lag < syncCandidates[j].Lag
 		})
 		return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
-	} else {
-		// in asynchronous mode find running replicas
-		for _, member := range members {
-			if PostgresRole(member.Role) == Leader || PostgresRole(member.Role) == StandbyLeader {
-				continue
-			}
-
-			if slices.Contains([]string{"running", "streaming", "in archive recovery"}, member.State) {
-				candidates = append(candidates, member)
-			}
-		}
-
-		if len(candidates) > 0 {
-			sort.Slice(candidates, func(i, j int) bool {
-				return candidates[i].Lag < candidates[j].Lag
-			})
-			return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
-		}
+	}
+	if len(candidates) > 0 {
+		sort.Slice(candidates, func(i, j int) bool {
+			return candidates[i].Lag < candidates[j].Lag
+		})
+		return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
 	}
 
 	return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")
diff --git a/pkg/cluster/pod_test.go b/pkg/cluster/pod_test.go
index e64e7eee..6816b4d7 100644
--- a/pkg/cluster/pod_test.go
+++ b/pkg/cluster/pod_test.go
@@ -62,7 +62,7 @@ func TestGetSwitchoverCandidate(t *testing.T) {
 			expectedError:     nil,
 		},
 		{
-			subtest:           "choose first replica when lag is equal evrywhere",
+			subtest:           "choose first replica when lag is equal everywhere",
 			clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "streaming", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
 			syncModeEnabled:   false,
 			expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
@@ -73,7 +73,7 @@ func TestGetSwitchoverCandidate(t *testing.T) {
 			clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
 			syncModeEnabled:   false,
 			expectedCandidate: spec.NamespacedName{},
-			expectedError:     fmt.Errorf("no switchover candidate found"),
+			expectedError:     fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"),
 		},
 		{
 			subtest:           "replicas with different status",
-- 
GitLab