diff options
author | Harald Musum <musum@verizonmedia.com> | 2020-12-13 19:19:54 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-12-13 19:19:54 +0100 |
commit | f564cb4c8a05ddfffa6b0ef1c10ca03ff7594cdd (patch) | |
tree | 9fe23e4d1ca9e22ce578060c62b7fdaadfe8159d | |
parent | 1a5f1b49f221a50379006f39b5386d0723f1410b (diff) | |
parent | 382c6017f6fa89dc3288a96230ac3873a7e0502f (diff) |
Merge pull request #15799 from vespa-engine/mpolden/increase-timeout
Adjust total timeout according to joining servers
3 files changed, 69 insertions, 57 deletions
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java index 4ef31f10f18..dfbdad6de5b 100644 --- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java +++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java @@ -11,6 +11,7 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.function.Consumer; import java.util.logging.Level; import java.util.logging.Logger; @@ -26,30 +27,33 @@ public class Reconfigurer extends AbstractComponent { private static final Logger log = java.util.logging.Logger.getLogger(Reconfigurer.class.getName()); - private static final Duration RECONFIG_TIMEOUT = Duration.ofMinutes(3); + private static final Duration MIN_TIMEOUT = Duration.ofMinutes(3); + private static final Duration NODE_TIMEOUT = Duration.ofMinutes(1); + + private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10)); + private final VespaZooKeeperAdmin vespaZooKeeperAdmin; + private final Consumer<Duration> sleeper; private ZooKeeperRunner zooKeeperRunner; private ZookeeperServerConfig activeConfig; - private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10)); - protected final VespaZooKeeperAdmin vespaZooKeeperAdmin; - @Inject public Reconfigurer(VespaZooKeeperAdmin vespaZooKeeperAdmin) { - this.vespaZooKeeperAdmin = vespaZooKeeperAdmin; - log.log(Level.FINE, "Created ZooKeeperReconfigurer"); + this(vespaZooKeeperAdmin, Reconfigurer::defaultSleeper); } - void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) { - startOrReconfigure(newConfig, Reconfigurer::defaultSleeper, server); + Reconfigurer(VespaZooKeeperAdmin vespaZooKeeperAdmin, Consumer<Duration> sleeper) { + this.vespaZooKeeperAdmin = Objects.requireNonNull(vespaZooKeeperAdmin); + this.sleeper = Objects.requireNonNull(sleeper); + log.log(Level.FINE, "Created ZooKeeperReconfigurer"); } - void startOrReconfigure(ZookeeperServerConfig newConfig, Consumer<Duration> sleeper, VespaZooKeeperServer server) { + void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) { if (zooKeeperRunner == null) zooKeeperRunner = startServer(newConfig, server); if (shouldReconfigure(newConfig)) - reconfigure(newConfig, sleeper); + reconfigure(newConfig); } ZookeeperServerConfig activeConfig() { @@ -74,18 +78,20 @@ public class Reconfigurer extends AbstractComponent { return runner; } - private void reconfigure(ZookeeperServerConfig newConfig, Consumer<Duration> sleeper) { + private void reconfigure(ZookeeperServerConfig newConfig) { Instant reconfigTriggered = Instant.now(); + List<String> newServers = difference(servers(newConfig), servers(activeConfig)); String leavingServers = String.join(",", difference(serverIds(activeConfig), serverIds(newConfig))); - String joiningServers = String.join(",", difference(servers(newConfig), servers(activeConfig))); + String joiningServers = String.join(",", newServers); leavingServers = leavingServers.isEmpty() ? null : leavingServers; joiningServers = joiningServers.isEmpty() ? null : joiningServers; log.log(Level.INFO, "Will reconfigure ZooKeeper cluster. Joining servers: " + joiningServers + ", leaving servers: " + leavingServers); String connectionSpec = localConnectionSpec(activeConfig); - Instant end = Instant.now().plus(RECONFIG_TIMEOUT); + Instant now = Instant.now(); + Instant end = now.plus(reconfigTimeout(newServers.size())); // Loop reconfiguring since we might need to wait until another reconfiguration is finished before we can succeed - for (int attempt = 1; Instant.now().isBefore(end); attempt++) { + for (int attempt = 1; now.isBefore(end); attempt++) { try { Instant reconfigStarted = Instant.now(); vespaZooKeeperAdmin.reconfigure(connectionSpec, joiningServers, leavingServers); @@ -98,13 +104,23 @@ public class Reconfigurer extends AbstractComponent { return; } catch (ReconfigException e) { Duration delay = backoff.delay(attempt); - log.log(Level.INFO, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay + ": " + - Exceptions.toMessageString(e)); + log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay + + ", time left " + Duration.between(now, end) + ": " + + Exceptions.toMessageString(e)); sleeper.accept(delay); + } finally { + now = Instant.now(); } } } + /** Returns the timeout to use for the given joining server count */ + private static Duration reconfigTimeout(int joiningServers) { + // For reconfig to succeed, the current ensemble must have a majority. When an ensemble grows and the joining + // servers outnumber the existing ones, we have to wait for enough of them to start to have a majority. + return Duration.ofMillis(Math.max(joiningServers * NODE_TIMEOUT.toMillis(), MIN_TIMEOUT.toMillis())); + } + private static String localConnectionSpec(ZookeeperServerConfig config) { return HostName.getLocalhost() + ":" + config.clientPort(); } diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java index 2f3c08dca26..fb6180976b1 100644 --- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java +++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java @@ -61,7 +61,8 @@ public class ZooKeeperRunner implements Runnable { try { log.log(Level.INFO, "Starting ZooKeeper server with config file " + path.toFile().getAbsolutePath() + ". Trying to establish ZooKeeper quorum (members: " + zookeeperServerHostnames(zookeeperServerConfig) + ")"); - startServer(path); + startServer(path); // Will block in a real implementation of VespaZooKeeperServer + return; } catch (RuntimeException e) { log.log(Level.INFO, "Starting ZooKeeper server failed, will retry"); try { diff --git a/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java index 020c84f74fb..1ca79a885fb 100644 --- a/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java +++ b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java @@ -71,18 +71,22 @@ public class ReconfigurerTest { @Test public void testReconfigureFailsWithReconfigInProgressThenSucceeds() { - reconfigurer = new TestableReconfigurer(new TemporarilyFailVespaZooKeeperAdmin()); - ZookeeperServerConfig initialConfig = createConfig(3, true); - reconfigurer.startOrReconfigure(initialConfig); - assertSame(initialConfig, reconfigurer.activeConfig()); - - ZookeeperServerConfig nextConfig = createConfig(5, true); - reconfigurer.startOrReconfigure(nextConfig); - assertEquals("node1:2181", reconfigurer.connectionSpec()); - assertEquals("3=node3:2182:2183;2181,4=node4:2182:2183;2181", reconfigurer.joiningServers()); - assertNull("No servers are leaving", reconfigurer.leavingServers()); - assertEquals(1, reconfigurer.reconfigurations()); - assertSame(nextConfig, reconfigurer.activeConfig()); + try { + TestableReconfigurer reconfigurer = new TestableReconfigurer(new TestableVespaZooKeeperAdmin().failures(3)); + ZookeeperServerConfig initialConfig = createConfig(3, true); + reconfigurer.startOrReconfigure(initialConfig); + assertSame(initialConfig, reconfigurer.activeConfig()); + + ZookeeperServerConfig nextConfig = createConfig(5, true); + reconfigurer.startOrReconfigure(nextConfig); + assertEquals("node1:2181", reconfigurer.connectionSpec()); + assertEquals("3=node3:2182:2183;2181,4=node4:2182:2183;2181", reconfigurer.joiningServers()); + assertNull("No servers are leaving", reconfigurer.leavingServers()); + assertEquals(1, reconfigurer.reconfigurations()); + assertSame(nextConfig, reconfigurer.activeConfig()); + } finally { + reconfigurer.shutdown(); + } } @Test @@ -119,13 +123,13 @@ public class ReconfigurerTest { return builder; } - private static class TestableReconfigurer extends Reconfigurer implements VespaZooKeeperServer{ + private static class TestableReconfigurer extends Reconfigurer implements VespaZooKeeperServer { - private final TestableVespaZooKeeperAdmin zkReconfigurer; + private final TestableVespaZooKeeperAdmin zooKeeperAdmin; - TestableReconfigurer(TestableVespaZooKeeperAdmin zkReconfigurer) { - super(zkReconfigurer); - this.zkReconfigurer = zkReconfigurer; + TestableReconfigurer(TestableVespaZooKeeperAdmin zooKeeperAdmin) { + super(zooKeeperAdmin, (ignored) -> {}); + this.zooKeeperAdmin = zooKeeperAdmin; HostName.setHostNameForTestingOnly("node1"); } @@ -133,25 +137,20 @@ public class ReconfigurerTest { startOrReconfigure(newConfig, this); } - @Override - void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) { - super.startOrReconfigure(newConfig, l -> {}, this); - } - String connectionSpec() { - return zkReconfigurer.connectionSpec; + return zooKeeperAdmin.connectionSpec; } String joiningServers() { - return zkReconfigurer.joiningServers; + return zooKeeperAdmin.joiningServers; } String leavingServers() { - return zkReconfigurer.leavingServers; + return zooKeeperAdmin.leavingServers; } int reconfigurations() { - return zkReconfigurer.reconfigurations; + return zooKeeperAdmin.reconfigurations; } @Override @@ -166,8 +165,18 @@ public class ReconfigurerTest { String leavingServers; int reconfigurations = 0; + private int failures = 0; + private int attempts = 0; + + public TestableVespaZooKeeperAdmin failures(int failures) { + this.failures = failures; + return this; + } + @Override public void reconfigure(String connectionSpec, String joiningServers, String leavingServers) throws ReconfigException { + if (++attempts < failures) + throw new ReconfigException("Reconfig failed"); this.connectionSpec = connectionSpec; this.joiningServers = joiningServers; this.leavingServers = leavingServers; @@ -176,19 +185,5 @@ public class ReconfigurerTest { } - // Fails 3 times with KeeperException.ReconfigInProgress(), then succeeds - private static class TemporarilyFailVespaZooKeeperAdmin extends TestableVespaZooKeeperAdmin { - - private int attempts = 0; - - public void reconfigure(String connectionSpec, String joiningServers, String leavingServers) throws ReconfigException { - if (++attempts < 3) - throw new ReconfigException("Reconfig failed"); - else - super.reconfigure(connectionSpec, joiningServers, leavingServers); - } - - } - } |