diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-22 14:46:34 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-22 14:46:34 +0100 |
commit | df817de66efaff56e2032fc4723a77af445b0aad (patch) | |
tree | 33c0f0b64917d523835f0b36270bd6c06f4970e9 | |
parent | 3bcb78fd70ad0307183f9689c3dc05840d30a775 (diff) | |
parent | d8f449b620f572599dfa224a7536b8aa111f3b98 (diff) |
Merge pull request #17082 from vespa-engine/vekterli/inhibit-db-connectivity-until-slobrok-is-ready-2
More consistent cluster controller ZooKeeper and election handling [run-systemtest]
7 files changed, 50 insertions, 12 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 5d5ffb917d2..bdc8b8497aa 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -208,7 +208,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd public boolean isMaster() { synchronized (monitor) { - return masterElectionHandler.isMaster(); + return isMaster; } } @@ -386,7 +386,9 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd // Iff master, always store new version in ZooKeeper _before_ publishing to any // nodes so that a cluster controller crash after publishing but before a successful // ZK store will not risk reusing the same version number. - if (masterElectionHandler.isMaster()) { + // Use isMaster instead of election handler state, as isMaster is set _after_ we have + // completed a leadership event edge, so we know we have read from ZooKeeper. + if (isMaster) { storeClusterStateMetaDataToZooKeeper(stateBundle); } } @@ -438,7 +440,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd */ public void lostDatabaseConnection() { verifyInControllerThread(); - boolean wasMaster = masterElectionHandler.isMaster(); + boolean wasMaster = isMaster; masterElectionHandler.lostDatabaseConnection(); if (wasMaster) { // Enforce that we re-fetch all state information from ZooKeeper upon the next tick if we're still master. @@ -521,6 +523,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd masterElectionHandler.setFleetControllerCount(options.fleetControllerCount); masterElectionHandler.setMasterZooKeeperCooldownPeriod(options.masterZooKeeperCooldownPeriod); + masterElectionHandler.setUsingZooKeeper(options.zooKeeperServerAddress != null && !options.zooKeeperServerAddress.isEmpty()); if (rpcServer != null) { rpcServer.setMasterElectionHandler(masterElectionHandler); @@ -617,7 +620,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd if ( ! isRunning()) { return; } didWork |= systemStateBroadcaster.processResponses(); if ( ! isRunning()) { return; } - if (masterElectionHandler.isMaster()) { + if (isMaster) { didWork |= broadcastClusterStateToEligibleNodes(); systemStateBroadcaster.checkIfClusterStateIsAckedByAllDistributors(database, databaseContext, this); } @@ -769,7 +772,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private boolean taskMayBeCompletedImmediately(RemoteClusterControllerTask task) { // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). - return (!task.hasVersionAckDependency() || task.isFailed() || !masterElectionHandler.isMaster()); + return (!task.hasVersionAckDependency() || task.isFailed() || !isMaster); } private RemoteClusterControllerTask.Context createRemoteTaskProcessingContext() { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java index 5f391b7b8e7..0dd26026c5d 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java @@ -25,6 +25,7 @@ public class MasterElectionHandler implements MasterInterface { private Map<Integer, Integer> nextMasterData; private long masterGoneFromZooKeeperTime; // Set to time master fleet controller disappears from zookeeper private long masterZooKeeperCooldownPeriod; // The period in ms that we won't take over unless master come back. + private boolean usingZooKeeper = false; // Unit tests may not use ZooKeeper at all. public MasterElectionHandler(int index, int totalCount, Object monitor, Timer timer) { this.monitor = monitor; @@ -42,7 +43,7 @@ public class MasterElectionHandler implements MasterInterface { public void setFleetControllerCount(int count) { totalCount = count; - if (count == 1) { + if (count == 1 && !usingZooKeeper) { masterCandidate = 0; followers = 1; nextInLineCount = 0; @@ -53,6 +54,14 @@ public class MasterElectionHandler implements MasterInterface { masterZooKeeperCooldownPeriod = period; } + public void setUsingZooKeeper(boolean usingZK) { + if (!usingZooKeeper && usingZK) { + // Reset any shortcuts taken by non-ZK election logic. + resetElectionProgress(); + } + usingZooKeeper = usingZK; + } + @Override public boolean isMaster() { Integer master = getMaster(); @@ -106,7 +115,9 @@ public class MasterElectionHandler implements MasterInterface { public boolean watchMasterElection(DatabaseHandler database, DatabaseHandler.Context dbContext) throws InterruptedException { - if (totalCount == 1) return false; // No point in doing master election with only one node configured to be cluster controller + if (totalCount == 1 && !usingZooKeeper) { + return false; // Allow single configured node to become master implicitly if no ZK configured + } if (nextMasterData == null) { if (masterCandidate == null) { log.log(Level.FINEST, "Cluster controller " + index + ": No current master candidate. Waiting for data to do master election."); @@ -217,15 +228,19 @@ public class MasterElectionHandler implements MasterInterface { } public void lostDatabaseConnection() { - if (totalCount > 1) { + if (totalCount > 1 || usingZooKeeper) { log.log(Level.INFO, "Cluster controller " + index + ": Clearing master data as we lost connection on node " + index); - masterData = null; - masterCandidate = null; - followers = 0; - nextMasterData = null; + resetElectionProgress(); } } + private void resetElectionProgress() { + masterData = null; + masterCandidate = null; + followers = 0; + nextMasterData = null; + } + public void writeHtmlState(StringBuilder sb, int stateGatherCount) { sb.append("<h2>Master state</h2>\n"); Integer master = getMaster(); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java index ceb81e91b7d..65b97a3ae82 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeLookup.java @@ -12,4 +12,11 @@ public interface NodeLookup { boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener); + /** + * Returns whether the lookup instance has been able to bootstrap itself with information about nodes. + * + * Calling updateCluster() _before_ isReady has returned true may not provide any useful data. + */ + boolean isReady(); + } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java index 3f04bbd9200..d19425a7c95 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java @@ -91,6 +91,7 @@ public class DatabaseHandler { private long lastZooKeeperConnectionAttempt = 0; private static final int minimumWaitBetweenFailedConnectionAttempts = 10000; private boolean lostZooKeeperConnectionEvent = false; + private boolean connectionEstablishmentIsAllowed = false; private Map<Integer, Integer> masterDataEvent = null; public DatabaseHandler(DatabaseFactory databaseFactory, Timer timer, String zooKeeperAddress, int ourIndex, Object monitor) throws InterruptedException diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java index b3bb458ed74..8649e7cc11a 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/rpc/SlobrokClient.java @@ -59,6 +59,7 @@ public class SlobrokClient implements NodeLookup { freshMirror = true; } + @Override public void shutdown() { if (supervisor != null) { supervisor.transport().shutdown().join(); @@ -67,6 +68,12 @@ public class SlobrokClient implements NodeLookup { public Mirror getMirror() { return mirror; } + @Override + public boolean isReady() { + return mirror != null && mirror.ready(); + } + + @Override public boolean updateCluster(ContentCluster cluster, NodeAddedOrRemovedListener listener) { if (mirror == null) return false; int mirrorVersion = mirror.updates(); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java index 9c0a94309a5..f6add77423a 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DatabaseHandlerTest.java @@ -10,6 +10,7 @@ import org.junit.Test; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java index b322c62967a..d7bca47026f 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DummyCommunicator.java @@ -154,4 +154,8 @@ public class DummyCommunicator implements Communicator, NodeLookup { return false; } + @Override + public boolean isReady() { + return true; + } } |