diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-18 12:07:45 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-18 12:07:45 +0100 |
commit | c3c4996cb992820bd3543b9355ec1a23bcb5ef71 (patch) | |
tree | efcbb129f519d1e783554402a9552266768cbfc5 /clustercontroller-core/src/main | |
parent | d4dacaaa3a7fa0dc79c467f25bb68de99c023771 (diff) |
Inhibit ZooKeeper connections until our local Slobrok mirror is ready.
Otherwise, if there are transient Slobrok issues during CC startup and
we end up winning the election, we risk publishing a cluster state where
the entire cluster appears down (since we do not have any knowledge of
Slobrok node mapping state). This will adversely affect availability for
all the obvious reasons.
Diffstat (limited to 'clustercontroller-core/src/main')
4 files changed, 28 insertions, 1 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 5d5ffb917d2..898ebe243b0 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -597,6 +597,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd public void tick() throws Exception { synchronized (monitor) { boolean didWork; + // We defer ZooKeeper connections until our local Slobrok mirror is ready. + // Otherwise we risk winning an election without having any clue about the state of + // the nodes in the cluster, causing us to spuriously publish cluster down-states. + database.setConnectionEstablishmentIsAllowed(nodeLookup.isReady()); didWork = database.doNextZooKeeperTask(databaseContext); didWork |= updateMasterElectionState(); didWork |= handleLeadershipEdgeTransitions(); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java index ceb81e91b7d..65b97a3ae82 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java @@ -12,4 +12,11 @@ public interface NodeLookup { boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener); + /** + * Returns whether the lookup instance has been able to bootstrap itself with information about nodes. + * + * Calling updateCluster() _before_ isReady has returned true may not provide any useful data. + */ + boolean isReady(); + } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java index 4f02e31b426..a3ef158c3f4 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java @@ -91,6 +91,7 @@ public class DatabaseHandler { private long lastZooKeeperConnectionAttempt = 0; private static final int minimumWaitBetweenFailedConnectionAttempts = 10000; private boolean lostZooKeeperConnectionEvent = false; + private boolean connectionEstablishmentIsAllowed = false; private Map<Integer, Integer> masterDataEvent = null; public DatabaseHandler(DatabaseFactory databaseFactory, Timer timer, String zooKeeperAddress, int ourIndex, Object monitor) throws InterruptedException @@ -242,7 +243,7 @@ public class DatabaseHandler { didWork = true; } } - if (isDatabaseClosedSafe() && zooKeeperIsConfigured()) { + if (isDatabaseClosedSafe() && zooKeeperIsConfigured() && connectionEstablishmentIsAllowed()) { long currentTime = timer.getCurrentTimeInMillis(); if (currentTime - lastZooKeeperConnectionAttempt < minimumWaitBetweenFailedConnectionAttempts) { return false; // Not time to attempt connection yet. @@ -267,6 +268,14 @@ public class DatabaseHandler { return didWork; } + private boolean connectionEstablishmentIsAllowed() { + return connectionEstablishmentIsAllowed; + } + + public void setConnectionEstablishmentIsAllowed(boolean allowed) { + connectionEstablishmentIsAllowed = allowed; + } + private boolean zooKeeperIsConfigured() { // This should only ever be null during unit testing. return zooKeeperAddress != null; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java index b3bb458ed74..cb3568ca853 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java @@ -59,6 +59,7 @@ public class SlobrokClient implements NodeLookup { freshMirror = true; } + @Override public void shutdown() { if (supervisor != null) { supervisor.transport().shutdown().join(); @@ -67,6 +68,12 @@ public class SlobrokClient implements NodeLookup { public Mirror getMirror() { return mirror; } + @Override + public boolean isReady() { + return mirror.ready(); + } + + @Override public boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener) { if (mirror == null) return false; int mirrorVersion = mirror.updates(); |