diff options
2 files changed, 5 insertions, 3 deletions
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterController.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterController.java
index df0e7b7d0b5..bd105c5c6c1 100644
--- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterController.java
+++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterController.java
@@ -16,6 +16,7 @@ import com.yahoo.vespa.zookeeper.VespaZooKeeperServer;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
 import java.util.logging.Logger;
 
 /**
@@ -98,7 +99,8 @@ public class ClusterController extends AbstractComponent
     private void verifyThatZooKeeperWorks(FleetControllerOptions options) throws Exception {
         if (options.zooKeeperServerAddress != null && !"".equals(options.zooKeeperServerAddress)) {
             try (Curator curator = Curator.create(options.zooKeeperServerAddress)) {
-                curator.framework().blockUntilConnected();
+                if ( ! curator.framework().blockUntilConnected(60, TimeUnit.SECONDS))
+                    com.yahoo.protect.Process.logAndDie("Failed to connect to ZK, dying and restarting container");
             }
         }
     }
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
index 4ef73ec2374..5296e3646c8 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
@@ -132,8 +132,8 @@ public class Reconfigurer extends AbstractComponent {
 
     /** Returns the timeout to use for the given joining server count */
     private static Duration reconfigTimeout(int joiningServers) {
-        // For reconfig to succeed, the current ensemble must have a majority. When an ensemble grows and the joining
-        // servers outnumber the existing ones, we have to wait for enough of them to start to have a majority.
+        // For reconfig to succeed, the current and resulting ensembles must have a majority. When an ensemble grows and
+        // the joining servers outnumber the existing ones, we have to wait for enough of them to start to have a majority.
         return Duration.ofMillis(Math.max(joiningServers * NODE_TIMEOUT.toMillis(), MIN_TIMEOUT.toMillis()));
     }
 