diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-18 12:07:45 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-18 12:07:45 +0100 |
commit | c3c4996cb992820bd3543b9355ec1a23bcb5ef71 (patch) | |
tree | efcbb129f519d1e783554402a9552266768cbfc5 | |
parent | d4dacaaa3a7fa0dc79c467f25bb68de99c023771 (diff) |
Inhibit ZooKeeper connections until our local Slobrok mirror is ready.
Otherwise, if there are transient Slobrok issues during CC startup and
we end up winning the election, we risk publishing a cluster state where
the entire cluster appears down (since we do not have any knowledge of
Slobrok node mapping state). This will adversely affect availability for
all the obvious reasons.
6 files changed, 45 insertions, 2 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 5d5ffb917d2..898ebe243b0 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -597,6 +597,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd public void tick() throws Exception { synchronized (monitor) { boolean didWork; + // We defer ZooKeeper connections until our local Slobrok mirror is ready. + // Otherwise we risk winning an election without having any clue about the state of + // the nodes in the cluster, causing us to spuriously publish cluster down-states. + database.setConnectionEstablishmentIsAllowed(nodeLookup.isReady()); didWork = database.doNextZooKeeperTask(databaseContext); didWork |= updateMasterElectionState(); didWork |= handleLeadershipEdgeTransitions(); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java index ceb81e91b7d..65b97a3ae82 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java @@ -12,4 +12,11 @@ public interface NodeLookup { boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener); + /** + * Returns whether the lookup instance has been able to bootstrap itself with information about nodes. + * + * Calling updateCluster() _before_ isReady has returned true may not provide any useful data. + */ + boolean isReady(); + } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java index 4f02e31b426..a3ef158c3f4 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java @@ -91,6 +91,7 @@ public class DatabaseHandler { private long lastZooKeeperConnectionAttempt = 0; private static final int minimumWaitBetweenFailedConnectionAttempts = 10000; private boolean lostZooKeeperConnectionEvent = false; + private boolean connectionEstablishmentIsAllowed = false; private Map<Integer, Integer> masterDataEvent = null; public DatabaseHandler(DatabaseFactory databaseFactory, Timer timer, String zooKeeperAddress, int ourIndex, Object monitor) throws InterruptedException @@ -242,7 +243,7 @@ public class DatabaseHandler { didWork = true; } } - if (isDatabaseClosedSafe() && zooKeeperIsConfigured()) { + if (isDatabaseClosedSafe() && zooKeeperIsConfigured() && connectionEstablishmentIsAllowed()) { long currentTime = timer.getCurrentTimeInMillis(); if (currentTime - lastZooKeeperConnectionAttempt < minimumWaitBetweenFailedConnectionAttempts) { return false; // Not time to attempt connection yet. @@ -267,6 +268,14 @@ public class DatabaseHandler { return didWork; } + private boolean connectionEstablishmentIsAllowed() { + return connectionEstablishmentIsAllowed; + } + + public void setConnectionEstablishmentIsAllowed(boolean allowed) { + connectionEstablishmentIsAllowed = allowed; + } + private boolean zooKeeperIsConfigured() { // This should only ever be null during unit testing. return zooKeeperAddress != null; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java index b3bb458ed74..cb3568ca853 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java @@ -59,6 +59,7 @@ public class SlobrokClient implements NodeLookup { freshMirror = true; } + @Override public void shutdown() { if (supervisor != null) { supervisor.transport().shutdown().join(); @@ -67,6 +68,12 @@ public class SlobrokClient implements NodeLookup { public Mirror getMirror() { return mirror; } + @Override + public boolean isReady() { + return mirror.ready(); + } + + @Override public boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener) { if (mirror == null) return false; int mirrorVersion = mirror.updates(); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java index 9c0a94309a5..7f664e65dc0 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java @@ -10,6 +10,7 @@ import org.junit.Test; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; @@ -65,11 +66,22 @@ public class DatabaseHandlerTest { } DatabaseHandler createHandler() throws Exception { - return new DatabaseHandler(mockDbFactory, mockTimer, databaseAddress, 0, monitor); + var handler = new DatabaseHandler(mockDbFactory, mockTimer, databaseAddress, 0, monitor); + handler.setConnectionEstablishmentIsAllowed(true); + return handler; } } @Test + public void can_not_connect_to_database_if_connectivity_is_not_allowed() throws Exception { + Fixture f = new Fixture(); + DatabaseHandler handler = f.createHandler(); + handler.setConnectionEstablishmentIsAllowed(false); + handler.doNextZooKeeperTask(f.createMockContext()); + assertTrue(handler.isClosed()); // No connectivity allowed yet + } + + @Test public void can_store_latest_cluster_state_bundle() throws Exception { Fixture f = new Fixture(); DatabaseHandler handler = f.createHandler(); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java index b322c62967a..d7bca47026f 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java @@ -154,4 +154,8 @@ public class DummyCommunicator implements Communicator, NodeLookup { return false; } + @Override + public boolean isReady() { + return true; + } } |