aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2020-12-08 13:49:25 +0100
committerMartin Polden <mpolden@mpolden.no>2020-12-08 13:49:25 +0100
commit65a86f6ac1fd4de0c7e3f98ed767709ec8cc3db2 (patch)
tree6f7e83db4241caeb92820ca8c16e3792483b0eac
parentd1e33a9420805f5d416ca55ed79497fd28f20216 (diff)
Use exponential backoff for reconfig retries
Combined with connecting to localhost this should reduce the time it takes for all nodes to complete reconfig.
-rw-r--r--zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ExponentialBackoff.java47
-rw-r--r--zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java32
-rw-r--r--zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ExponentialBackoffTest.java32
3 files changed, 89 insertions, 22 deletions
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ExponentialBackoff.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ExponentialBackoff.java
new file mode 100644
index 00000000000..9f3ca594d38
--- /dev/null
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ExponentialBackoff.java
@@ -0,0 +1,47 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.zookeeper;
+
+import java.time.Duration;
+import java.util.Random;
+
+/**
+ * Calculate a delay using an exponential backoff algorithm. Based on ExponentialBackOff in google-http-client.
+ *
+ * @author mpolden
+ */
+public class ExponentialBackoff {
+
+ private static final double RANDOMIZATION_FACTOR = 0.5;
+
+ private final Duration initialDelay;
+ private final Duration maxDelay;
+ private final Random random;
+
+ public ExponentialBackoff(Duration initialDelay, Duration maxDelay) {
+ this(initialDelay, maxDelay, new Random());
+ }
+
+ ExponentialBackoff(Duration initialDelay, Duration maxDelay, Random random) {
+ this.initialDelay = requireNonNegative(initialDelay);
+ this.maxDelay = requireNonNegative(maxDelay);
+ this.random = random;
+ }
+
+ /** Return the delay of given attempt */
+ public Duration delay(int attempt) {
+ if (attempt < 1) throw new IllegalArgumentException("Attempt must be positive");
+ double currentDelay = attempt * initialDelay.toMillis();
+ double delta = RANDOMIZATION_FACTOR * currentDelay;
+ double lowerDelay = currentDelay - delta;
+ double upperDelay = currentDelay + delta;
+ long millis = (long) Math.min(lowerDelay + (random.nextDouble() * (upperDelay - lowerDelay + 1)),
+ maxDelay.toMillis());
+ return Duration.ofMillis(millis);
+ }
+
+ private static Duration requireNonNegative(Duration d) {
+ if (d.isNegative()) throw new IllegalArgumentException("Invalid duration: " + d);
+ return d;
+ }
+
+}
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
index 7dcb9d2ec11..4ef31f10f18 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
@@ -26,18 +26,12 @@ public class Reconfigurer extends AbstractComponent {
private static final Logger log = java.util.logging.Logger.getLogger(Reconfigurer.class.getName());
- // How long to wait before triggering reconfig. This is multiplied by the node ID
- private static final Duration reconfigInterval = Duration.ofSeconds(5);
-
- // Total timeout for a reconfiguration
- private static final Duration reconfigTimeout = Duration.ofSeconds(30);
-
- // How long to wait between each retry
- private static final Duration retryWait = Duration.ofSeconds(1);
+ private static final Duration RECONFIG_TIMEOUT = Duration.ofMinutes(3);
private ZooKeeperRunner zooKeeperRunner;
private ZookeeperServerConfig activeConfig;
+ private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10));
protected final VespaZooKeeperAdmin vespaZooKeeperAdmin;
@Inject
@@ -86,37 +80,31 @@ public class Reconfigurer extends AbstractComponent {
String joiningServers = String.join(",", difference(servers(newConfig), servers(activeConfig)));
leavingServers = leavingServers.isEmpty() ? null : leavingServers;
joiningServers = joiningServers.isEmpty() ? null : joiningServers;
- log.log(Level.INFO, "Will reconfigure ZooKeeper cluster in " + reconfigWaitPeriod() +
- ". Joining servers: " + joiningServers + ", leaving servers: " + leavingServers);
- sleeper.accept(reconfigWaitPeriod());
+ log.log(Level.INFO, "Will reconfigure ZooKeeper cluster. Joining servers: " + joiningServers +
+ ", leaving servers: " + leavingServers);
String connectionSpec = localConnectionSpec(activeConfig);
- Instant end = Instant.now().plus(reconfigTimeout);
+ Instant end = Instant.now().plus(RECONFIG_TIMEOUT);
// Loop reconfiguring since we might need to wait until another reconfiguration is finished before we can succeed
- for (int attempts = 1; Instant.now().isBefore(end); attempts++) {
+ for (int attempt = 1; Instant.now().isBefore(end); attempt++) {
try {
Instant reconfigStarted = Instant.now();
vespaZooKeeperAdmin.reconfigure(connectionSpec, joiningServers, leavingServers);
Instant reconfigEnded = Instant.now();
log.log(Level.INFO, "Reconfiguration completed in " +
Duration.between(reconfigTriggered, reconfigEnded) +
- ", after " + attempts + " attempt(s). ZooKeeper reconfig call took " +
+ ", after " + attempt + " attempt(s). ZooKeeper reconfig call took " +
Duration.between(reconfigStarted, reconfigEnded));
activeConfig = newConfig;
return;
} catch (ReconfigException e) {
- log.log(Level.INFO, "Reconfiguration failed. Retrying in " + retryWait + ": " +
+ Duration delay = backoff.delay(attempt);
+ log.log(Level.INFO, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay + ": " +
Exceptions.toMessageString(e));
- sleeper.accept(retryWait);
+ sleeper.accept(delay);
}
}
}
- /** Returns how long this node should wait before reconfiguring the cluster */
- private Duration reconfigWaitPeriod() {
- if (activeConfig == null) return Duration.ZERO;
- return reconfigInterval.multipliedBy(activeConfig.myid());
- }
-
private static String localConnectionSpec(ZookeeperServerConfig config) {
return HostName.getLocalhost() + ":" + config.clientPort();
}
diff --git a/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ExponentialBackoffTest.java b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ExponentialBackoffTest.java
new file mode 100644
index 00000000000..65e367f8ae8
--- /dev/null
+++ b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ExponentialBackoffTest.java
@@ -0,0 +1,32 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.zookeeper;
+
+import org.junit.Test;
+
+import java.time.Duration;
+import java.util.List;
+import java.util.Random;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author mpolden
+ */
+public class ExponentialBackoffTest {
+
+ @Test
+ public void delay() {
+ ExponentialBackoff b = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10), new Random(1000));
+ assertEquals(List.of(Duration.ofMillis(1210),
+ Duration.ofMillis(2150),
+ Duration.ofMillis(4340),
+ Duration.ofMillis(2157),
+ Duration.ofMillis(4932)),
+ IntStream.rangeClosed(1, 5)
+ .mapToObj(b::delay)
+ .collect(Collectors.toList()));
+ }
+
+}