From acdb957d8ebf6ba784f72cd545af40def6973791 Mon Sep 17 00:00:00 2001 From: fahed dorgaa <fahed.dorgaa@gmail.com> Date: Fri, 1 Nov 2024 17:06:20 +0100 Subject: [PATCH] fix switch over candidate retrieving (#2760) * fix switch over candidate retrieving Signed-off-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com> --------- Signed-off-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com> Co-authored-by: fahed dorgaa <fahed.dorgaa.ext@corp.ovh.com> Co-authored-by: Felix Kunde <felix-kunde@gmx.de> --- pkg/cluster/pod.go | 33 +++++++++++++++------------------ pkg/cluster/pod_test.go | 4 ++-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go index 890b6012..bd2172c1 100644 --- a/pkg/cluster/pod.go +++ b/pkg/cluster/pod.go @@ -480,6 +480,9 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e if PostgresRole(member.Role) == SyncStandby { syncCandidates = append(syncCandidates, member) } + if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && slices.Contains([]string{"running", "streaming", "in archive recovery"}, member.State) { + candidates = append(candidates, member) + } } // if synchronous mode is enabled and no SyncStandy was found @@ -489,6 +492,12 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e return false, nil } + // retry also in asynchronous mode when no replica candidate was found + if !c.Spec.Patroni.SynchronousMode && len(candidates) == 0 { + c.logger.Warnf("no replica candidate found - retrying fetching cluster members") + return false, nil + } + return true, nil }, ) @@ -502,24 +511,12 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e return syncCandidates[i].Lag < syncCandidates[j].Lag }) return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil - } else { - // in asynchronous mode find running replicas - for _, member := range members { - if PostgresRole(member.Role) == Leader || PostgresRole(member.Role) == StandbyLeader { - continue - } - - if slices.Contains([]string{"running", "streaming", "in archive recovery"}, member.State) { - candidates = append(candidates, member) - } - } - - if len(candidates) > 0 { - sort.Slice(candidates, func(i, j int) bool { - return candidates[i].Lag < candidates[j].Lag - }) - return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil - } + } + if len(candidates) > 0 { + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].Lag < candidates[j].Lag + }) + return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil } return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found") diff --git a/pkg/cluster/pod_test.go b/pkg/cluster/pod_test.go index e64e7eee..6816b4d7 100644 --- a/pkg/cluster/pod_test.go +++ b/pkg/cluster/pod_test.go @@ -62,7 +62,7 @@ func TestGetSwitchoverCandidate(t *testing.T) { expectedError: nil, }, { - subtest: "choose first replica when lag is equal evrywhere", + subtest: "choose first replica when lag is equal everywhere", clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "streaming", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`, syncModeEnabled: false, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"}, @@ -73,7 +73,7 @@ func TestGetSwitchoverCandidate(t *testing.T) { clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`, syncModeEnabled: false, expectedCandidate: spec.NamespacedName{}, - expectedError: fmt.Errorf("no switchover candidate found"), + expectedError: fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"), }, { subtest: "replicas with different status", -- GitLab