summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@verizonmedia.com>2020-12-13 19:19:54 +0100
committerGitHub <noreply@github.com>2020-12-13 19:19:54 +0100
commitf564cb4c8a05ddfffa6b0ef1c10ca03ff7594cdd (patch)
tree9fe23e4d1ca9e22ce578060c62b7fdaadfe8159d
parent1a5f1b49f221a50379006f39b5386d0723f1410b (diff)
parent382c6017f6fa89dc3288a96230ac3873a7e0502f (diff)
Merge pull request #15799 from vespa-engine/mpolden/increase-timeout
Adjust total timeout according to joining servers
-rw-r--r--zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java48
-rw-r--r--zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java3
-rw-r--r--zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java75
3 files changed, 69 insertions, 57 deletions
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
index 4ef31f10f18..dfbdad6de5b 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
@@ -11,6 +11,7 @@ import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
+import java.util.Objects;
import java.util.function.Consumer;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -26,30 +27,33 @@ public class Reconfigurer extends AbstractComponent {
private static final Logger log = java.util.logging.Logger.getLogger(Reconfigurer.class.getName());
- private static final Duration RECONFIG_TIMEOUT = Duration.ofMinutes(3);
+ private static final Duration MIN_TIMEOUT = Duration.ofMinutes(3);
+ private static final Duration NODE_TIMEOUT = Duration.ofMinutes(1);
+
+ private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10));
+ private final VespaZooKeeperAdmin vespaZooKeeperAdmin;
+ private final Consumer<Duration> sleeper;
private ZooKeeperRunner zooKeeperRunner;
private ZookeeperServerConfig activeConfig;
- private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10));
- protected final VespaZooKeeperAdmin vespaZooKeeperAdmin;
-
@Inject
public Reconfigurer(VespaZooKeeperAdmin vespaZooKeeperAdmin) {
- this.vespaZooKeeperAdmin = vespaZooKeeperAdmin;
- log.log(Level.FINE, "Created ZooKeeperReconfigurer");
+ this(vespaZooKeeperAdmin, Reconfigurer::defaultSleeper);
}
- void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) {
- startOrReconfigure(newConfig, Reconfigurer::defaultSleeper, server);
+ Reconfigurer(VespaZooKeeperAdmin vespaZooKeeperAdmin, Consumer<Duration> sleeper) {
+ this.vespaZooKeeperAdmin = Objects.requireNonNull(vespaZooKeeperAdmin);
+ this.sleeper = Objects.requireNonNull(sleeper);
+ log.log(Level.FINE, "Created ZooKeeperReconfigurer");
}
- void startOrReconfigure(ZookeeperServerConfig newConfig, Consumer<Duration> sleeper, VespaZooKeeperServer server) {
+ void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) {
if (zooKeeperRunner == null)
zooKeeperRunner = startServer(newConfig, server);
if (shouldReconfigure(newConfig))
- reconfigure(newConfig, sleeper);
+ reconfigure(newConfig);
}
ZookeeperServerConfig activeConfig() {
@@ -74,18 +78,20 @@ public class Reconfigurer extends AbstractComponent {
return runner;
}
- private void reconfigure(ZookeeperServerConfig newConfig, Consumer<Duration> sleeper) {
+ private void reconfigure(ZookeeperServerConfig newConfig) {
Instant reconfigTriggered = Instant.now();
+ List<String> newServers = difference(servers(newConfig), servers(activeConfig));
String leavingServers = String.join(",", difference(serverIds(activeConfig), serverIds(newConfig)));
- String joiningServers = String.join(",", difference(servers(newConfig), servers(activeConfig)));
+ String joiningServers = String.join(",", newServers);
leavingServers = leavingServers.isEmpty() ? null : leavingServers;
joiningServers = joiningServers.isEmpty() ? null : joiningServers;
log.log(Level.INFO, "Will reconfigure ZooKeeper cluster. Joining servers: " + joiningServers +
", leaving servers: " + leavingServers);
String connectionSpec = localConnectionSpec(activeConfig);
- Instant end = Instant.now().plus(RECONFIG_TIMEOUT);
+ Instant now = Instant.now();
+ Instant end = now.plus(reconfigTimeout(newServers.size()));
// Loop reconfiguring since we might need to wait until another reconfiguration is finished before we can succeed
- for (int attempt = 1; Instant.now().isBefore(end); attempt++) {
+ for (int attempt = 1; now.isBefore(end); attempt++) {
try {
Instant reconfigStarted = Instant.now();
vespaZooKeeperAdmin.reconfigure(connectionSpec, joiningServers, leavingServers);
@@ -98,13 +104,23 @@ public class Reconfigurer extends AbstractComponent {
return;
} catch (ReconfigException e) {
Duration delay = backoff.delay(attempt);
- log.log(Level.INFO, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay + ": " +
- Exceptions.toMessageString(e));
+ log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay +
+ ", time left " + Duration.between(now, end) + ": " +
+ Exceptions.toMessageString(e));
sleeper.accept(delay);
+ } finally {
+ now = Instant.now();
}
}
}
+ /** Returns the timeout to use for the given joining server count */
+ private static Duration reconfigTimeout(int joiningServers) {
+ // For reconfig to succeed, the current ensemble must have a majority. When an ensemble grows and the joining
+ // servers outnumber the existing ones, we have to wait for enough of them to start to have a majority.
+ return Duration.ofMillis(Math.max(joiningServers * NODE_TIMEOUT.toMillis(), MIN_TIMEOUT.toMillis()));
+ }
+
private static String localConnectionSpec(ZookeeperServerConfig config) {
return HostName.getLocalhost() + ":" + config.clientPort();
}
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java
index 2f3c08dca26..fb6180976b1 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/ZooKeeperRunner.java
@@ -61,7 +61,8 @@ public class ZooKeeperRunner implements Runnable {
try {
log.log(Level.INFO, "Starting ZooKeeper server with config file " + path.toFile().getAbsolutePath() +
". Trying to establish ZooKeeper quorum (members: " + zookeeperServerHostnames(zookeeperServerConfig) + ")");
- startServer(path);
+ startServer(path); // Will block in a real implementation of VespaZooKeeperServer
+ return;
} catch (RuntimeException e) {
log.log(Level.INFO, "Starting ZooKeeper server failed, will retry");
try {
diff --git a/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java
index 020c84f74fb..1ca79a885fb 100644
--- a/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java
+++ b/zookeeper-server/zookeeper-server-common/src/test/java/com/yahoo/vespa/zookeeper/ReconfigurerTest.java
@@ -71,18 +71,22 @@ public class ReconfigurerTest {
@Test
public void testReconfigureFailsWithReconfigInProgressThenSucceeds() {
- reconfigurer = new TestableReconfigurer(new TemporarilyFailVespaZooKeeperAdmin());
- ZookeeperServerConfig initialConfig = createConfig(3, true);
- reconfigurer.startOrReconfigure(initialConfig);
- assertSame(initialConfig, reconfigurer.activeConfig());
-
- ZookeeperServerConfig nextConfig = createConfig(5, true);
- reconfigurer.startOrReconfigure(nextConfig);
- assertEquals("node1:2181", reconfigurer.connectionSpec());
- assertEquals("3=node3:2182:2183;2181,4=node4:2182:2183;2181", reconfigurer.joiningServers());
- assertNull("No servers are leaving", reconfigurer.leavingServers());
- assertEquals(1, reconfigurer.reconfigurations());
- assertSame(nextConfig, reconfigurer.activeConfig());
+ try {
+ TestableReconfigurer reconfigurer = new TestableReconfigurer(new TestableVespaZooKeeperAdmin().failures(3));
+ ZookeeperServerConfig initialConfig = createConfig(3, true);
+ reconfigurer.startOrReconfigure(initialConfig);
+ assertSame(initialConfig, reconfigurer.activeConfig());
+
+ ZookeeperServerConfig nextConfig = createConfig(5, true);
+ reconfigurer.startOrReconfigure(nextConfig);
+ assertEquals("node1:2181", reconfigurer.connectionSpec());
+ assertEquals("3=node3:2182:2183;2181,4=node4:2182:2183;2181", reconfigurer.joiningServers());
+ assertNull("No servers are leaving", reconfigurer.leavingServers());
+ assertEquals(1, reconfigurer.reconfigurations());
+ assertSame(nextConfig, reconfigurer.activeConfig());
+ } finally {
+ reconfigurer.shutdown();
+ }
}
@Test
@@ -119,13 +123,13 @@ public class ReconfigurerTest {
return builder;
}
- private static class TestableReconfigurer extends Reconfigurer implements VespaZooKeeperServer{
+ private static class TestableReconfigurer extends Reconfigurer implements VespaZooKeeperServer {
- private final TestableVespaZooKeeperAdmin zkReconfigurer;
+ private final TestableVespaZooKeeperAdmin zooKeeperAdmin;
- TestableReconfigurer(TestableVespaZooKeeperAdmin zkReconfigurer) {
- super(zkReconfigurer);
- this.zkReconfigurer = zkReconfigurer;
+ TestableReconfigurer(TestableVespaZooKeeperAdmin zooKeeperAdmin) {
+ super(zooKeeperAdmin, (ignored) -> {});
+ this.zooKeeperAdmin = zooKeeperAdmin;
HostName.setHostNameForTestingOnly("node1");
}
@@ -133,25 +137,20 @@ public class ReconfigurerTest {
startOrReconfigure(newConfig, this);
}
- @Override
- void startOrReconfigure(ZookeeperServerConfig newConfig, VespaZooKeeperServer server) {
- super.startOrReconfigure(newConfig, l -> {}, this);
- }
-
String connectionSpec() {
- return zkReconfigurer.connectionSpec;
+ return zooKeeperAdmin.connectionSpec;
}
String joiningServers() {
- return zkReconfigurer.joiningServers;
+ return zooKeeperAdmin.joiningServers;
}
String leavingServers() {
- return zkReconfigurer.leavingServers;
+ return zooKeeperAdmin.leavingServers;
}
int reconfigurations() {
- return zkReconfigurer.reconfigurations;
+ return zooKeeperAdmin.reconfigurations;
}
@Override
@@ -166,8 +165,18 @@ public class ReconfigurerTest {
String leavingServers;
int reconfigurations = 0;
+ private int failures = 0;
+ private int attempts = 0;
+
+ public TestableVespaZooKeeperAdmin failures(int failures) {
+ this.failures = failures;
+ return this;
+ }
+
@Override
public void reconfigure(String connectionSpec, String joiningServers, String leavingServers) throws ReconfigException {
+ if (++attempts < failures)
+ throw new ReconfigException("Reconfig failed");
this.connectionSpec = connectionSpec;
this.joiningServers = joiningServers;
this.leavingServers = leavingServers;
@@ -176,19 +185,5 @@ public class ReconfigurerTest {
}
- // Fails 3 times with KeeperException.ReconfigInProgress(), then succeeds
- private static class TemporarilyFailVespaZooKeeperAdmin extends TestableVespaZooKeeperAdmin {
-
- private int attempts = 0;
-
- public void reconfigure(String connectionSpec, String joiningServers, String leavingServers) throws ReconfigException {
- if (++attempts < 3)
- throw new ReconfigException("Reconfig failed");
- else
- super.reconfigure(connectionSpec, joiningServers, leavingServers);
- }
-
- }
-
}