diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-08 13:11:46 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-03-08 14:37:20 +0100 |
commit | af8915becf15db9d2c2c1dbb3408e89d54dd074f (patch) | |
tree | d42e517235f0bbfdc3f8c753bc643b6f1cc2e6b3 /clustercontroller-core | |
parent | b583805ecb68e347776097b6f4ac3622b4d53004 (diff) |
Better handling of ZK connectivity issues concurrent with elections
Adds the following safeguards/improvements:
- Do not clear pending (non-persisted) writes over a `connect()` edge.
Avoids having the controller eternally wait for a doomed pending
write to be completed when it has no other events that can trigger
a new write.
- Trigger `lostDatabaseConnection()` whenever ZK is reconfigured to
ensure we reload the newest state before trying to compute/publish
any new states.
- Explicitly drop leadership in `lostDatabaseConnection()` to immediately
prevent controller from trying any funny leader-related business
since it no longer can depend on ZK watches triggering.
- When falling back to default state/cluster bundle, ensure that any
subsequent dependent znode write is predicated on the pre-existing
znode version being 0, i.e. did not previously exist.
Diffstat (limited to 'clustercontroller-core')
3 files changed, 59 insertions, 28 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index e8222d73a63..bafd7168f03 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -284,7 +284,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd } log.log(Level.INFO, "Fleetcontroller done shutting down event thread."); controllerThreadId = Thread.currentThread().getId(); - database.shutdown(this); + database.shutdown(databaseContext); if (statusPageServer != null) { statusPageServer.shutdown(); @@ -436,7 +436,13 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd */ public void lostDatabaseConnection() { verifyInControllerThread(); + boolean wasMaster = masterElectionHandler.isMaster(); masterElectionHandler.lostDatabaseConnection(); + if (wasMaster) { + // Enforce that we re-fetch all state information from ZooKeeper upon the next tick if we're still master. + dropLeadershipState(); + metricUpdater.updateMasterState(false); + } } private void failAllVersionDependentTasks() { @@ -501,8 +507,8 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd cluster.setPollingFrequency(options.statePollingFrequency); cluster.setDistribution(options.storageDistribution); cluster.setNodes(options.nodes); - database.setZooKeeperAddress(options.zooKeeperServerAddress); - database.setZooKeeperSessionTimeout(options.zooKeeperSessionTimeout); + database.setZooKeeperAddress(options.zooKeeperServerAddress, databaseContext); + database.setZooKeeperSessionTimeout(options.zooKeeperSessionTimeout, databaseContext); stateGatherer.setMaxSlobrokDisconnectGracePeriod(options.maxSlobrokDisconnectGracePeriod); stateGatherer.setNodeStateRequestTimeout(options.nodeStateRequestTimeoutMS); @@ -1066,18 +1072,22 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd wantedStateChanged = false; } } else { - if (isMaster) { - eventLog.add(new ClusterEvent(ClusterEvent.Type.MASTER_ELECTION, "This node is no longer fleetcontroller master.", timer.getCurrentTimeInMillis())); - firstAllowedStateBroadcast = Long.MAX_VALUE; - failAllVersionDependentTasks(); - } - wantedStateChanged = false; - isMaster = false; + dropLeadershipState(); } metricUpdater.updateMasterState(isMaster); return didWork; } + private void dropLeadershipState() { + if (isMaster) { + eventLog.add(new ClusterEvent(ClusterEvent.Type.MASTER_ELECTION, "This node is no longer fleetcontroller master.", timer.getCurrentTimeInMillis())); + firstAllowedStateBroadcast = Long.MAX_VALUE; + failAllVersionDependentTasks(); + } + wantedStateChanged = false; + isMaster = false; + } + public void run() { controllerThreadId = Thread.currentThread().getId(); try { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java index 5d5bb674d4f..94a76763f34 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java @@ -43,11 +43,15 @@ public class DatabaseHandler { ClusterStateBundle clusterStateBundle; void clear() { - masterVote = null; + clearNonClusterStateFields(); lastSystemStateVersion = null; + clusterStateBundle = null; + } + + void clearNonClusterStateFields() { + masterVote = null; wantedStates = null; startTimestamps = null; - clusterStateBundle = null; } } private class DatabaseListener implements Database.DatabaseListener { @@ -96,7 +100,7 @@ public class DatabaseHandler { this.nodeIndex = ourIndex; pendingStore.masterVote = ourIndex; // To begin with we'll vote for ourselves. this.monitor = monitor; - // TODO: Require non-null, not possible now since at least ClusterFeedBlockTest usese null address + // TODO: Require non-null, not possible now since at least ClusterFeedBlockTest uses null address this.zooKeeperAddress = zooKeeperAddress; } @@ -106,8 +110,8 @@ public class DatabaseHandler { } } - public void shutdown(FleetController fleetController) { - relinquishDatabaseConnectivity(fleetController); + public void shutdown(Context context) { + relinquishDatabaseConnectivity(context); } public boolean isClosed() { return database == null || database.isClosed(); } @@ -116,7 +120,7 @@ public class DatabaseHandler { return lastKnownStateBundleVersionWrittenBySelf; } - public void reset() { + public void reset(Context context) { final boolean wasRunning; synchronized (databaseMonitor) { wasRunning = database != null; @@ -126,37 +130,46 @@ public class DatabaseHandler { database = null; } } - clearSessionMetaData(); + clearSessionMetaData(true); + context.getFleetController().lostDatabaseConnection(); if (wasRunning) { log.log(Level.INFO, "Fleetcontroller " + nodeIndex + ": Done resetting database state"); } } - private void clearSessionMetaData() { + private void clearSessionMetaData(boolean clearPendingStateWrites) { // Preserve who we want to vote for Integer currentVote = (pendingStore.masterVote != null ? pendingStore.masterVote : currentlyStored.masterVote); currentlyStored.clear(); - pendingStore.clear(); + if (clearPendingStateWrites) { + pendingStore.clear(); + } else { + // If we have pending cluster state writes we cannot drop these on the floor, as otherwise the + // core CC logic may keep thinking it has persisted writes it really has not. Clearing pending + // state writes woudl also prevent the controller from detecting itself being out of sync by + // triggering CaS violations upon znode writes. + pendingStore.clearNonClusterStateFields(); + } pendingStore.masterVote = currentVote; log.log(Level.FINE, "Cleared session metadata. Pending master vote is now " + pendingStore.masterVote); } - public void setZooKeeperAddress(String address) { + public void setZooKeeperAddress(String address, Context context) { if (address == null && zooKeeperAddress == null) return; if (address != null && address.equals(zooKeeperAddress)) return; if (zooKeeperAddress != null) { log.log(Level.INFO, "Fleetcontroller " + nodeIndex + ": " + (address == null ? "Stopped using ZooKeeper." : "Got new ZooKeeper address to use: " + address)); } zooKeeperAddress = address; - reset(); + reset(context); } - public void setZooKeeperSessionTimeout(int timeout) { + public void setZooKeeperSessionTimeout(int timeout, Context context) { if (timeout == zooKeeperSessionTimeout) return; log.log(Level.FINE, "Fleetcontroller " + nodeIndex + ": Got new ZooKeeper session timeout of " + timeout + " milliseconds."); zooKeeperSessionTimeout = timeout; - reset(); + reset(context); } private boolean usingZooKeeper() { return (zooKeeperAddress != null); } @@ -169,7 +182,9 @@ public class DatabaseHandler { database.close(); } // We still hold the database lock while calling this, we want to block callers. - clearSessionMetaData(); + // Don't clear pending state writes in case they were attempted prior to connect() + // being called, but after receiving a database loss event. + clearSessionMetaData(false); log.log(Level.INFO, "Fleetcontroller " + nodeIndex + ": Setting up new ZooKeeper session at " + zooKeeperAddress); DatabaseFactory.Params params = new DatabaseFactory.Params() @@ -247,7 +262,7 @@ public class DatabaseHandler { "has likely taken over ownership: %s", e.getMessage())); // Clear DB and master election state. This shall trigger a full re-fetch of all // version and election-related metadata. - relinquishDatabaseConnectivity(context.getFleetController()); + relinquishDatabaseConnectivity(context); } return didWork; } @@ -257,9 +272,9 @@ public class DatabaseHandler { return zooKeeperAddress != null; } - private void relinquishDatabaseConnectivity(FleetController fleetController) { - reset(); - fleetController.lostDatabaseConnection(); + private void relinquishDatabaseConnectivity(Context context) { + // reset() will handle both session clearing and trigger a database loss callback into the CC. + reset(context); } private boolean performZooKeeperWrites() throws InterruptedException { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/ZooKeeperDatabase.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/ZooKeeperDatabase.java index 60835d5bb4f..4cdbb49dedc 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/ZooKeeperDatabase.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/ZooKeeperDatabase.java @@ -238,6 +238,9 @@ public class ZooKeeperDatabase extends Database { } catch (InterruptedException e) { throw (InterruptedException) new InterruptedException("Interrupted").initCause(e); } catch (Exception e) { + // If we return a default, empty version, writes dependent on this bundle should only + // succeed if the previous znode version is 0, i.e. not yet created. + lastKnownStateVersionZNodeVersion = 0; maybeLogExceptionWarning(e, "Failed to retrieve latest system state version used. Returning null"); return null; } @@ -390,6 +393,9 @@ public class ZooKeeperDatabase extends Database { maybeLogExceptionWarning(e, "Failed to retrieve last published cluster state bundle from " + "ZooKeeper, will use an empty state as baseline"); } + // If we return a default, empty bundle, writes dependent on this bundle should only + // succeed if the previous znode version is 0, i.e. not yet created. + lastKnownStateBundleZNodeVersion = 0; return ClusterStateBundle.ofBaselineOnly(AnnotatedClusterState.emptyState()); } |