diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-07-05 16:42:39 +0200 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-07-05 16:42:39 +0200 |
commit | e363dc115148782e882ae9789344f2fa4997c120 (patch) | |
tree | 15ff3ca123d0e3cca0fbbb38ab7a788832eadfaa /clustercontroller-core | |
parent | 23303b94e954b29d10350a5ad3dc585daac95a48 (diff) |
Do not allow states to be published when they have pending ZK writes
Avoids a race condition where a bundle ZK write fails but we have not yet
detected that ZK connectivity has been lost. This could lead to violating
the invariant that published state versions are strictly increasing.
Diffstat (limited to 'clustercontroller-core')
2 files changed, 15 insertions, 0 deletions
```diff
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index ba35243c14d..364184331a8 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -661,6 +661,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
     }
 
     private boolean broadcastClusterStateToEligibleNodes() {
+        // If there's a pending DB store we have not yet been able to store the
+        // current state bundle to ZK and must therefore _not_ allow it to be published.
+        if (database.hasPendingClusterStateMetaDataStore()) {
+            log.log(LogLevel.DEBUG, "Can't publish current cluster state as it has one or more pending ZooKeeper stores");
+            return false;
+        }
         boolean sentAny = false;
         // Give nodes a fair chance to respond first time to state gathering requests, so we don't
         // disturb system when we take over. Allow anyways if we have states from all nodes.
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java
index f2b1b523aba..f30b86130c2 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java
@@ -352,6 +352,15 @@ public class DatabaseHandler {
         doNextZooKeeperTask(context);
     }
 
+    // TODO should we expand this to cover _any_ pending ZK write?
+    public boolean hasPendingClusterStateMetaDataStore() {
+        synchronized (databaseMonitor) {
+            return ((zooKeeperAddress != null) &&
+                    ((pendingStore.clusterStateBundle != null) ||
+                     (pendingStore.lastSystemStateVersion != null)));
+        }
+    }
+
     public ClusterStateBundle getLatestClusterStateBundle() throws InterruptedException {
         log.log(LogLevel.DEBUG, () -> String.format("Fleetcontroller %d: Retrieving latest cluster state bundle from ZooKeeper", nodeIndex));
         synchronized (databaseMonitor) {
```