From 2ddc201a6134f91a11e555d6fcbc2d2048b669a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20T=C3=B6lle?= <julian.toelle@hetzner-cloud.de> Date: Thu, 30 Nov 2023 14:36:24 +0100 Subject: [PATCH] feat(robot): handle ratelimiting with constant backoff (#572) Add a constant backoff in case we are being rate limited by the Robot API. This is done through an opaque wrapping of the `robot.Client` interface, so callers do not need to do this manually. Co-authored-by: Janis Kemper <janis.kemper@syself.com> --- hcloud/cloud.go | 6 +- internal/config/config.go | 22 +++--- internal/config/config_test.go | 14 ++-- internal/robot/{client.go => cache.go} | 5 +- internal/robot/ratelimit.go | 93 ++++++++++++++++++++++++++ internal/robot/ratelimit_test.go | 68 +++++++++++++++++++ 6 files changed, 191 insertions(+), 17 deletions(-) rename internal/robot/{client.go => cache.go} (92%) create mode 100644 internal/robot/ratelimit.go create mode 100644 internal/robot/ratelimit_test.go diff --git a/hcloud/cloud.go b/hcloud/cloud.go index bd467e6d..17f43f8e 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -85,7 +85,11 @@ func newCloud(_ io.Reader) (cloudprovider.Interface, error) { var robotClient robot.Client if cfg.Robot.Enabled { c := hrobot.NewBasicAuthClient(cfg.Robot.User, cfg.Robot.Password) - robotClient = robot.NewClient(c, cfg.Robot.CacheTimeout) + + robotClient = robot.NewRateLimitedClient( + cfg.Robot.RateLimitWaitTime, + robot.NewCachedClient(cfg.Robot.CacheTimeout, c), + ) } var networkID int64 diff --git a/internal/config/config.go b/internal/config/config.go index dacfab4f..9c97e2a6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,10 +14,11 @@ const ( hcloudNetwork = "HCLOUD_NETWORK" hcloudDebug = "HCLOUD_DEBUG" - robotEnabled = "ROBOT_ENABLED" - robotUser = "ROBOT_USER" - robotPassword = "ROBOT_PASSWORD" - robotCacheTimeout = "ROBOT_CACHE_TIMEOUT" + robotEnabled = "ROBOT_ENABLED" + robotUser = "ROBOT_USER" + robotPassword = "ROBOT_PASSWORD" + robotCacheTimeout = "ROBOT_CACHE_TIMEOUT" + robotRateLimitWaitTime = "ROBOT_RATE_LIMIT_WAIT_TIME" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" @@ -43,10 +44,11 @@ type HCloudClientConfiguration struct { } type RobotConfiguration struct { - Enabled bool - User string - Password string - CacheTimeout time.Duration + Enabled bool + User string + Password string + CacheTimeout time.Duration + RateLimitWaitTime time.Duration } type MetricsConfiguration struct { @@ -124,6 +126,10 @@ func Read() (HCCMConfiguration, error) { if cfg.Robot.CacheTimeout == 0 { cfg.Robot.CacheTimeout = 5 * time.Minute } + cfg.Robot.RateLimitWaitTime, err = getEnvDuration(robotRateLimitWaitTime) + if err != nil { + errs = append(errs, err) + } cfg.Metrics.Enabled, err = getEnvBool(hcloudMetricsEnabled, true) if err != nil { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 860e4647..367b640d 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,14 +74,16 @@ func TestRead(t *testing.T) { "ROBOT_ENABLED", "true", "ROBOT_USER", "foobar", "ROBOT_PASSWORD", "secret-password", + "ROBOT_RATE_LIMIT_WAIT_TIME", "5m", "ROBOT_CACHE_TIMEOUT", "1m", }, want: HCCMConfiguration{ Robot: RobotConfiguration{ - Enabled: true, - User: "foobar", - Password: "secret-password", - CacheTimeout: 1 * time.Minute, + Enabled: true, + User: "foobar", + Password: "secret-password", + CacheTimeout: 1 * time.Minute, + RateLimitWaitTime: 5 * time.Minute, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, @@ -194,8 +196,10 @@ failed to parse HCLOUD_NETWORK_ROUTES_ENABLED: strconv.ParseBool: parsing "si": name: "error parsing duration values", env: []string{ "ROBOT_CACHE_TIMEOUT", "biweekly", + "ROBOT_RATE_LIMIT_WAIT_TIME", "42fortnights", }, - wantErr: errors.New(`failed to parse ROBOT_CACHE_TIMEOUT: time: invalid duration "biweekly"`), + wantErr: errors.New(`failed to parse ROBOT_CACHE_TIMEOUT: time: invalid duration "biweekly" +failed to parse ROBOT_RATE_LIMIT_WAIT_TIME: time: unknown unit "fortnights" in duration "42fortnights"`), }, } diff --git a/internal/robot/client.go b/internal/robot/cache.go similarity index 92% rename from internal/robot/client.go rename to internal/robot/cache.go index aa3522bf..2bb96c71 100644 --- a/internal/robot/client.go +++ b/internal/robot/cache.go @@ -4,12 +4,11 @@ import ( "sync" "time" - hrobot "github.com/syself/hrobot-go" hrobotmodels "github.com/syself/hrobot-go/models" ) type cacheRobotClient struct { - robotClient hrobot.RobotClient + robotClient Client timeout time.Duration lastUpdate time.Time @@ -21,7 +20,7 @@ type cacheRobotClient struct { serversByID map[int]*hrobotmodels.Server } -func NewClient(robotClient hrobot.RobotClient, cacheTimeout time.Duration) Client { +func NewCachedClient(cacheTimeout time.Duration, robotClient Client) Client { return &cacheRobotClient{ timeout: cacheTimeout, robotClient: robotClient, diff --git a/internal/robot/ratelimit.go b/internal/robot/ratelimit.go new file mode 100644 index 00000000..81dfd869 --- /dev/null +++ b/internal/robot/ratelimit.go @@ -0,0 +1,93 @@ +package robot + +import ( + "fmt" + "strings" + "time" + + hrobotmodels "github.com/syself/hrobot-go/models" +) + +type rateLimitClient struct { + robotClient Client + + waitTime time.Duration + exceeded bool + lastChecked time.Time +} + +func NewRateLimitedClient(rateLimitWaitTime time.Duration, robotClient Client) Client { + return &rateLimitClient{ + robotClient: robotClient, + + waitTime: rateLimitWaitTime, + } +} + +func (c *rateLimitClient) ServerGet(id int) (*hrobotmodels.Server, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + server, err := c.robotClient.ServerGet(id) + c.handleError(err) + return server, err +} + +func (c *rateLimitClient) ServerGetList() ([]hrobotmodels.Server, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + servers, err := c.robotClient.ServerGetList() + c.handleError(err) + return servers, err +} + +func (c *rateLimitClient) ResetGet(id int) (*hrobotmodels.Reset, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + reset, err := c.robotClient.ResetGet(id) + c.handleError(err) + return reset, err +} + +func (c *rateLimitClient) set() { + c.exceeded = true + c.lastChecked = time.Now() +} + +func (c *rateLimitClient) isExceeded() bool { + if !c.exceeded { + return false + } + + if time.Now().Before(c.lastChecked.Add(c.waitTime)) { + return true + } + // Waiting time is over. Should try again + c.exceeded = false + c.lastChecked = time.Time{} + return false +} + +func (c *rateLimitClient) handleError(err error) { + if err == nil { + return + } + + if hrobotmodels.IsError(err, hrobotmodels.ErrorCodeRateLimitExceeded) || strings.Contains(err.Error(), "server responded with status code 403") { + c.set() + } +} + +func (c *rateLimitClient) getRateLimitError() error { + if !c.isExceeded() { + return nil + } + + nextPossibleCall := c.lastChecked.Add(c.waitTime) + return fmt.Errorf("rate limit exceeded, next try at %q", nextPossibleCall.String()) +} diff --git a/internal/robot/ratelimit_test.go b/internal/robot/ratelimit_test.go new file mode 100644 index 00000000..aac711c3 --- /dev/null +++ b/internal/robot/ratelimit_test.go @@ -0,0 +1,68 @@ +package robot + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + hrobotmodels "github.com/syself/hrobot-go/models" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks" +) + +func TestRateLimit(t *testing.T) { + mock := mocks.RobotClient{} + mock.On("ServerGetList").Return([]hrobotmodels.Server{}, nil).Once() + + client := NewRateLimitedClient(5*time.Minute, &mock) + + servers, err := client.ServerGetList() + assert.NoError(t, err) + assert.Len(t, servers, 0) + mock.AssertNumberOfCalls(t, "ServerGetList", 1) + + mock.On("ServerGetList").Return(nil, hrobotmodels.Error{Code: hrobotmodels.ErrorCodeRateLimitExceeded, Message: "Rate limit exceeded"}).Once() + _, err = client.ServerGetList() + assert.Error(t, err) + mock.AssertNumberOfCalls(t, "ServerGetList", 2) + + // No further call should be made + _, err = client.ServerGetList() + assert.Error(t, err) + mock.AssertNumberOfCalls(t, "ServerGetList", 2) +} + +func TestRateLimitIsExceeded(t *testing.T) { + client := rateLimitClient{ + waitTime: 5 * time.Minute, + exceeded: true, + lastChecked: time.Now(), + } + // Just exceeded + assert.True(t, client.isExceeded()) + + // Exceeded longer than wait time ago + client.lastChecked = time.Now().Add(-6 * time.Minute) + assert.False(t, client.isExceeded()) + + // Not exceeded ever + client.exceeded = false + client.lastChecked = time.Time{} + assert.False(t, client.isExceeded()) +} + +func TestRateLimitGetRateLimitError(t *testing.T) { + client := rateLimitClient{ + waitTime: 5 * time.Minute, + } + err := client.getRateLimitError() + assert.NoError(t, err) + + client.exceeded = true + client.lastChecked = time.Now() + + err = client.getRateLimitError() + assert.Error(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "rate limit exceeded, next try at ")) +} -- GitLab