diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index b51755a8f4d0520758699e86afea14d89b908231..78690e4aae779af8aa88642446474db809cf5c30 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -102,17 +102,15 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // Sentinel has not death nodes // Sentinel knows the correct slave number - err := r.rfChecker.CheckRedisNumber(rf) - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") + if !r.rfChecker.IsRedisRunning(rf) { + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } - err = r.rfChecker.CheckSentinelNumber(rf) - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + if !r.rfChecker.IsSentinelRunning(rf) { + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } @@ -122,7 +120,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } switch nMasters { case 0: - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("No masters detected")) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected")) redisesIP, err := r.rfChecker.GetRedisesIPs(rf) if err != nil { return err @@ -138,21 +136,21 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return err2 } if minTime > timeToPrepare { - r.logger.Debugf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) // We can consider there's an error if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { return err2 } } else { // We'll wait until failover is done - r.logger.Debug("No master found, wait until failover") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover") return nil } case 1: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) default: - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected")) - return errors.New("More than one master, fix manually") + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected")) + return errors.New("more than one master, fix manually") } master, err := r.rfChecker.GetMasterIP(rf) @@ -160,12 +158,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return err } - err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf) + err = r.rfChecker.CheckAllSlavesFromMaster(master, rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err) - if err2 != nil { - r.logger.Debug("Not all slaves have the same master") - if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil { - return err3 + if err != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error()) + if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil { + return err } } @@ -190,7 +188,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err = r.rfChecker.CheckSentinelMonitor(sip, master, port) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) if err != nil { - r.logger.Debug("Sentinel is not monitoring the correct master") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { return err } @@ -203,7 +201,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err := r.rfChecker.CheckRedisNumber(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) if err != nil { - r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } @@ -226,7 +224,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfChecker.CheckSentinelNumber(rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) if err != nil { - r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } @@ -238,7 +236,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) if err != nil { - r.logger.Debug("Sentinel is not monitoring the correct master") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil { return err } @@ -267,7 +265,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err) if err != nil { - r.logger.Debug("Sentinel has more sentinel in memory than spected") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of sentinels in memory. resetting", sip) if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } @@ -278,7 +276,7 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err) if err != nil { - r.logger.Debug("Sentinel has more slaves in memory than spected") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Sentinel %s mismatch number of expected slaves in memory. resetting", sip) if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index ad86e858a564c49b31bd6382d3d65565f65e919b..63946a25559f59828ea150694501809bcfb8908f 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -168,7 +168,7 @@ func (r *RedisFailoverChecker) CheckSentinelMonitor(sentinel string, monitor ... return err } if actualMonitorIP != monitorIP || (monitorPort != "" && monitorPort != actualMonitorPort) { - return errors.New("the monitor on the sentinel config does not match with the expected one") + return fmt.Errorf("sentinel monitoring %s:%s instead %s:%s", actualMonitorIP, actualMonitorPort, monitorIP, monitorPort) } return nil } @@ -209,11 +209,13 @@ func (r *RedisFailoverChecker) GetNumberMasters(rf *redisfailoverv1.RedisFailove nMasters := 0 rips, err := r.GetRedisesIPs(rf) if err != nil { + r.logger.Errorf(err.Error()) return nMasters, err } password, err := k8s.GetRedisPassword(r.k8sService, rf) if err != nil { + r.logger.Errorf("Error getting password: %s", err.Error()) return nMasters, err } @@ -274,7 +276,7 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisF } start := redisNode.Status.StartTime.Round(time.Second) alive := time.Since(start) - r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) + r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) if alive < minTime { minTime = alive } diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 22a3ffd9f38bd6ea05b6ab75585d7993b2371c37..7c87da427f54c5bcbfb8616b4875a401437e4be5 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -203,7 +203,6 @@ func (r *RedisFailoverHealer) SetExternalMasterOnAll(masterIP, masterPort string // NewSentinelMonitor changes the master that Sentinel has to monitor func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -217,7 +216,6 @@ func (r *RedisFailoverHealer) NewSentinelMonitor(ip string, monitor string, rf * // NewSentinelMonitorWithPort changes the master that Sentinel has to monitor by the provided IP and Port func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor string, monitorPort string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Sentinel is not monitoring the correct master, changing...") quorum := strconv.Itoa(int(getQuorum(rf))) password, err := k8s.GetRedisPassword(r.k8sService, rf) @@ -230,13 +228,13 @@ func (r *RedisFailoverHealer) NewSentinelMonitorWithPort(ip string, monitor stri // RestoreSentinel clear the number of sentinels on memory func (r *RedisFailoverHealer) RestoreSentinel(ip string) error { - r.logger.Infof("Restoring sentinel %s...", ip) + r.logger.Debugf("Restoring sentinel %s", ip) return r.redisClient.ResetSentinel(ip) } // SetSentinelCustomConfig will call sentinel to set the configuration given in config func (r *RedisFailoverHealer) SetSentinelCustomConfig(ip string, rf *redisfailoverv1.RedisFailover) error { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Setting the custom config on sentinel %s...", ip) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Setting the custom config on sentinel %s...", ip) return r.redisClient.SetCustomSentinelConfig(ip, rf.Spec.Sentinel.CustomConfig) }