diff options
author | Håkon Hallingstad <hakon.hallingstad@gmail.com> | 2023-06-02 12:55:24 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-02 12:55:24 +0200 |
commit | 3a847df3386a724be512b748280f466d7b17435b (patch) | |
tree | 28209e9fa2e3f3f3b963c447969c65b2a1dd91d6 | |
parent | 449cf2278a6f7dd5c546a4efe7029766b1450fa6 (diff) | |
parent | 19dfcf47e91559ab0b29fe121e58a937aa3bcfb4 (diff) |
Merge pull request #27266 from vespa-engine/hmusum/cluster-controller-cleanup-6
Hmusum/cluster controller cleanup 6
16 files changed, 187 insertions, 264 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 42460b5943e..8027cec4e3c 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -485,11 +485,9 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta // TODO: remove as many temporal parameter dependencies as possible here. Currently duplication of state. stateChangeHandler.reconfigureFromOptions(options); - stateChangeHandler.setStateChangedFlag(); // Always trigger state recomputation after reconfig masterElectionHandler.setFleetControllerCount(options.fleetControllerCount()); masterElectionHandler.setMasterZooKeeperCooldownPeriod(options.masterZooKeeperCooldownPeriod()); - masterElectionHandler.setUsingZooKeeper(options.zooKeeperServerAddress() != null && !options.zooKeeperServerAddress().isEmpty()); if (rpcServer != null) { rpcServer.setMasterElectionHandler(masterElectionHandler); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index bac6a838300..e116bb28e46 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -178,7 +178,7 @@ public class FleetControllerOptions { this.distributionBits = distributionBits; this.zooKeeperSessionTimeout = zooKeeperSessionTimeout; this.masterZooKeeperCooldownPeriod = masterZooKeeperCooldownPeriod; - this.zooKeeperServerAddress = zooKeeperServerAddress; + this.zooKeeperServerAddress = 
Objects.requireNonNull(zooKeeperServerAddress, "zooKeeperServerAddress cannot be null"); this.maxTransitionTime = maxTransitionTime; this.maxInitProgressTime = maxInitProgressTime; this.maxPrematureCrashes = maxPrematureCrashes; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java index fa303533355..fc8a6a05573 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MasterElectionHandler.java @@ -26,7 +26,6 @@ public class MasterElectionHandler implements MasterInterface { private Map<Integer, Integer> nextMasterData; private long masterGoneFromZooKeeperTime; // Set to time master fleet controller disappears from zookeeper private long masterZooKeeperCooldownPeriod; // The period in ms that we won't take over unless master come back. - private boolean usingZooKeeper = false; // Unit tests may not use ZooKeeper at all. public MasterElectionHandler(FleetControllerContext context, int index, int totalCount, Object monitor, Timer timer) { this.context = context; @@ -34,7 +33,8 @@ public class MasterElectionHandler implements MasterInterface { this.timer = timer; this.index = index; this.totalCount = totalCount; - this.nextInLineCount = Integer.MAX_VALUE; + // nextInLineCount should/will always be 0 when we have one controller + this.nextInLineCount = totalCount == 1 ? 0 : Integer.MAX_VALUE; if (cannotBecomeMaster()) context.log(logger, Level.FINE, () -> "We can never become master and will always stay a follower."); // Tag current time as when we have not seen any other master. 
Make sure we're not taking over at once for master that is on the way down @@ -43,25 +43,12 @@ public class MasterElectionHandler implements MasterInterface { public void setFleetControllerCount(int count) { totalCount = count; - if (count == 1 && !usingZooKeeper) { - masterCandidate = 0; - followers = 1; - nextInLineCount = 0; - } } public void setMasterZooKeeperCooldownPeriod(int period) { masterZooKeeperCooldownPeriod = period; } - public void setUsingZooKeeper(boolean usingZK) { - if (!usingZooKeeper && usingZK) { - // Reset any shortcuts taken by non-ZK election logic. - resetElectionProgress(); - } - usingZooKeeper = usingZK; - } - @Override public boolean isMaster() { Integer master = getMaster(); @@ -121,15 +108,13 @@ public class MasterElectionHandler implements MasterInterface { public boolean isFirstInLine() { return (nextInLineCount < 1); } public boolean watchMasterElection(DatabaseHandler database, DatabaseHandler.DatabaseContext dbContext) { - if (totalCount == 1 && !usingZooKeeper) { - return false; // Allow single configured node to become master implicitly if no ZK configured - } if (nextMasterData == null) { if (masterCandidate == null) { context.log(logger, Level.FINEST, () -> "No current master candidate. Waiting for data to do master election."); } return false; // Nothing have happened since last time. } + // Move next data to temporary, such that we don't need to keep lock, and such that we don't retry // if we happen to fail processing the data. Map<Integer, Integer> state; @@ -140,6 +125,7 @@ public class MasterElectionHandler implements MasterInterface { } context.log(logger, Level.INFO, "Got master election state " + toString(state) + "."); if (state.isEmpty()) throw new IllegalStateException("Database has no master data. 
We should at least have data for ourselves."); + Map.Entry<Integer, Integer> first = state.entrySet().iterator().next(); Integer currentMaster = getMaster(); if (currentMaster != null && first.getKey().intValue() != currentMaster.intValue()) { @@ -238,10 +224,8 @@ public class MasterElectionHandler implements MasterInterface { } public void lostDatabaseConnection() { - if (totalCount > 1 || usingZooKeeper) { - context.log(logger, Level.INFO, "Clearing master data as we lost connection on node " + index); - resetElectionProgress(); - } + context.log(logger, Level.INFO, "Clearing master data as we lost connection on node " + index); + resetElectionProgress(); } private void resetElectionProgress() { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java index 28149477e36..2317777e43d 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateChangeHandler.java @@ -234,6 +234,7 @@ public class StateChangeHandler { setMaxInitProgressTime(options.maxInitProgressTime()); setMaxSlobrokDisconnectGracePeriod(options.maxSlobrokDisconnectGracePeriod()); setMaxTransitionTime(options.maxTransitionTime()); + setStateChangedFlag(); // Always trigger state recomputation after reconfig } // TODO too many hidden behavior dependencies between this and the actually diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java index efb97a4a69e..ed194776d78 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java +++ 
b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/database/DatabaseHandler.java @@ -15,6 +15,7 @@ import org.apache.zookeeper.KeeperException; import java.io.PrintWriter; import java.io.StringWriter; import java.util.Map; +import java.util.Objects; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; @@ -101,8 +102,7 @@ public class DatabaseHandler { this.timer = timer; pendingStore.masterVote = fleetControllerContext.id().index(); // To begin with we'll vote for ourselves. this.monitor = monitor; - // TODO: Require non-null, not possible now since at least ClusterFeedBlockTest uses null address - this.zooKeeperAddress = zooKeeperAddress; + this.zooKeeperAddress = Objects.requireNonNull(zooKeeperAddress, "zooKeeperAddress cannot be null"); } private boolean isDatabaseClosedSafe() { @@ -161,11 +161,9 @@ public class DatabaseHandler { } public void setZooKeeperAddress(String address, DatabaseContext databaseContext) { - if (address == null && zooKeeperAddress == null) return; - if (address != null && address.equals(zooKeeperAddress)) return; - if (zooKeeperAddress != null) { - fleetControllerContext.log(logger, Level.INFO, "" + (address == null ? "Stopped using ZooKeeper." 
: "Got new ZooKeeper address to use: " + address)); - } + Objects.requireNonNull(address, "address cannot be null"); + if (address.equals(zooKeeperAddress)) return; + fleetControllerContext.log(logger, Level.INFO, "Got new ZooKeeper address to use: " + address); zooKeeperAddress = address; reset(databaseContext); } @@ -177,8 +175,6 @@ public class DatabaseHandler { reset(databaseContext); } - private boolean usingZooKeeper() { return (zooKeeperAddress != null); } - private void connect(long currentTime) { try { lastZooKeeperConnectionAttempt = currentTime; @@ -245,7 +241,7 @@ public class DatabaseHandler { didWork = true; } } - if (isDatabaseClosedSafe() && zooKeeperIsConfigured()) { + if (isDatabaseClosedSafe()) { long currentTime = timer.getCurrentTimeInMillis(); if (currentTime - lastZooKeeperConnectionAttempt < minimumWaitBetweenFailedConnectionAttempts) { return false; // Not time to attempt connection yet. @@ -270,11 +266,6 @@ public class DatabaseHandler { return didWork; } - private boolean zooKeeperIsConfigured() { - // This should only ever be null during unit testing. - return zooKeeperAddress != null; - } - private void relinquishDatabaseConnectivity(DatabaseContext databaseContext) { // reset() will handle both session clearing and trigger a database loss callback into the CC. reset(databaseContext); @@ -383,9 +374,7 @@ public class DatabaseHandler { } Integer version = currentlyStored.lastSystemStateVersion; if (version == null) { - if (usingZooKeeper()) { - fleetControllerContext.log(logger, Level.WARNING, "Failed to retrieve latest system state version from ZooKeeper. Returning version 0."); - } + fleetControllerContext.log(logger, Level.WARNING, "Failed to retrieve latest system state version from ZooKeeper. Returning version 0."); return 0; // FIXME "fail-oblivious" is not a good error handling mode for such a critical component! 
} return version; @@ -395,22 +384,13 @@ public class DatabaseHandler { fleetControllerContext.log(logger, Level.FINE, () -> "Scheduling bundle " + clusterStateBundle + " to be saved to ZooKeeper"); pendingStore.clusterStateBundle = clusterStateBundle; doNextZooKeeperTask(databaseContext); - // FIXME this is a nasty hack to get around the fact that a massive amount of unit tests - // set up the system with a null ZooKeeper server address. If we don't fake that we have - // written the state version, the tests will never progress past waiting for state broadcasts. - if (zooKeeperAddress == null) { - logger.warning(() -> "Simulating ZK write of version " + clusterStateBundle.getVersion() + - ". This should not happen in production!"); - lastKnownStateBundleVersionWrittenBySelf = clusterStateBundle.getVersion(); - } } // TODO should we expand this to cover _any_ pending ZK write? public boolean hasPendingClusterStateMetaDataStore() { synchronized (databaseMonitor) { - return ((zooKeeperAddress != null) && - ((pendingStore.clusterStateBundle != null) || - (pendingStore.lastSystemStateVersion != null))); + return ((pendingStore.clusterStateBundle != null) || + (pendingStore.lastSystemStateVersion != null)); } } @@ -458,11 +438,9 @@ public class DatabaseHandler { } Map<Node, NodeState> wantedStates = currentlyStored.wantedStates; if (wantedStates == null) { - if (usingZooKeeper()) { - // We get here if the ZooKeeper client has lost the connection to ZooKeeper. - // TODO: Should instead fail the tick until connected!? - fleetControllerContext.log(logger, Level.FINE, () -> "Failed to retrieve wanted states from ZooKeeper. Assuming UP for all nodes."); - } + // We get here if the ZooKeeper client has lost connection to ZooKeeper. + // TODO: Should instead fail the tick until connected!? + fleetControllerContext.log(logger, Level.FINE, () -> "Failed to retrieve wanted states from ZooKeeper. 
Assuming UP for all nodes."); wantedStates = new TreeMap<>(); } boolean altered = false; @@ -510,9 +488,7 @@ public class DatabaseHandler { } Map<Node, Long> startTimestamps = currentlyStored.startTimestamps; if (startTimestamps == null) { - if (usingZooKeeper()) { - fleetControllerContext.log(logger, Level.WARNING, "Failed to retrieve start timestamps from ZooKeeper. Cluster state will be bloated with timestamps until we get them set."); - } + fleetControllerContext.log(logger, Level.WARNING, "Failed to retrieve start timestamps from ZooKeeper. Cluster state will be bloated with timestamps until we get them set."); startTimestamps = new TreeMap<>(); } for (Map.Entry<Node, Long> e : startTimestamps.entrySet()) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java index 5aae401e157..7a9bea91b9c 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/status/LegacyIndexPageRequestHandler.java @@ -228,7 +228,7 @@ public class LegacyIndexPageRequestHandler implements StatusPageServer.RequestHa sb.append("<tr><td><nobr>RPC port</nobr></td><td align=\"right\">").append(options.rpcPort() == 0 ? "Pick random available" : options.rpcPort()).append("</td></tr>"); sb.append("<tr><td><nobr>HTTP port</nobr></td><td align=\"right\">").append(options.httpPort() == 0 ? "Pick random available" : options.httpPort()).append("</td></tr>"); sb.append("<tr><td><nobr>Master cooldown period</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.masterZooKeeperCooldownPeriod())).append("</td></tr>"); - String zooKeeperAddress = (options.zooKeeperServerAddress() == null ? 
"Not using Zookeeper" : splitZooKeeperAddress(options.zooKeeperServerAddress())); + String zooKeeperAddress = splitZooKeeperAddress(options.zooKeeperServerAddress()); sb.append("<tr><td><nobr>Zookeeper server address</nobr></td><td align=\"right\">").append(zooKeeperAddress).append("</td></tr>"); sb.append("<tr><td><nobr>Zookeeper session timeout</nobr></td><td align=\"right\">").append(RealTimer.printDuration(options.zooKeeperSessionTimeout())).append("</td></tr>"); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java index d4eea261767..55e256cf89c 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java @@ -32,15 +32,17 @@ public class ClusterFeedBlockTest extends FleetControllerTest { private FleetController ctrl; private DummyCommunicator communicator; - private void initialize(FleetControllerOptions options) throws Exception { + private void initialize(FleetControllerOptions.Builder builder) throws Exception { List<Node> nodes = new ArrayList<>(); - for (int i = 0; i < options.nodes().size(); ++i) { + for (int i = 0; i < builder.nodes().size(); ++i) { nodes.add(new Node(NodeType.STORAGE, i)); nodes.add(new Node(NodeType.DISTRIBUTOR, i)); } - var context = new TestFleetControllerContext(options); communicator = new DummyCommunicator(nodes, timer); + setUpZooKeeperServer(builder); + options = builder.build(); + var context = new TestFleetControllerContext(options); boolean start = false; ctrl = createFleetController(timer, options, context, communicator, communicator, null, start); @@ -57,16 +59,16 @@ public class ClusterFeedBlockTest extends FleetControllerTest { ctrl.tick(); } - private static FleetControllerOptions 
createOptions(Map<String, Double> feedBlockLimits, double clusterFeedBlockNoiseLevel) { + private static FleetControllerOptions.Builder createOptions(Map<String, Double> feedBlockLimits, double clusterFeedBlockNoiseLevel) { return defaultOptions() .setStorageDistribution(DistributionBuilder.forFlatCluster(NODE_COUNT)) .setNodes(new HashSet<>(DistributionBuilder.buildConfiguredNodes(NODE_COUNT))) .setClusterFeedBlockEnabled(true) .setClusterFeedBlockLimit(feedBlockLimits) - .setClusterFeedBlockNoiseLevel(clusterFeedBlockNoiseLevel).build(); + .setClusterFeedBlockNoiseLevel(clusterFeedBlockNoiseLevel); } - private static FleetControllerOptions createOptions(Map<String, Double> feedBlockLimits) { + private static FleetControllerOptions.Builder createOptions(Map<String, Double> feedBlockLimits) { return createOptions(feedBlockLimits, 0.0); } @@ -109,7 +111,7 @@ public class ClusterFeedBlockTest extends FleetControllerTest { assertTrue(ctrl.getClusterStateBundle().clusterFeedIsBlocked()); // Increase cheese allowance. Should now automatically unblock since reported usage is lower. 
- ctrl.updateOptions(createOptions(mapOf(usage("cheese", 0.9), usage("wine", 0.4)))); + ctrl.updateOptions(createOptions(mapOf(usage("cheese", 0.9), usage("wine", 0.4))).build()); ctrl.tick(); // Options propagation ctrl.tick(); // State recomputation assertFalse(ctrl.getClusterStateBundle().clusterFeedIsBlocked()); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java index 30c90ee0664..b5aebadd82b 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java @@ -870,7 +870,9 @@ public class ClusterStateGeneratorTest { .setMinNodeRatioPerGroup(0.6) .setDistributionBits(7) .setMaxTransitionTime(NodeType.DISTRIBUTOR, 1000) - .setMaxTransitionTime(NodeType.STORAGE, 2000).build(); + .setMaxTransitionTime(NodeType.STORAGE, 2000) + .setZooKeeperServerAddress("localhost:2181") + .build(); final ClusterStateGenerator.Params params = ClusterStateGenerator.Params.fromOptions(options); assertThat(params.maxPrematureCrashes, equalTo(options.maxPrematureCrashes())); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java index 95b9d13cad5..11bdb6ec1c8 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBitCountTest.java @@ -22,7 +22,7 @@ public class DistributionBitCountTest extends FleetControllerTest { for (int i = 0 ; i < 10; i++) { configuredNodes.add(new ConfiguredNode(i, false)); } - var builder = 
defaultOptions("mycluster", configuredNodes); + var builder = defaultOptions(configuredNodes); builder.setDistributionBits(17); Timer timer = new RealTimer(); setUpFleetController(timer, builder); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java index 238a64459ca..fb59df7e433 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java @@ -25,6 +25,8 @@ import com.yahoo.vespa.clustercontroller.core.testutils.WaitTask; import com.yahoo.vespa.clustercontroller.core.testutils.Waiter; import com.yahoo.vespa.clustercontroller.utils.util.NoMetricReporter; import org.junit.jupiter.api.AfterEach; + +import java.io.IOException; import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; @@ -72,31 +74,24 @@ public abstract class FleetControllerTest implements Waiter { } protected static FleetControllerOptions.Builder defaultOptions() { - return defaultOptions(DEFAULT_NODE_COUNT); - } - - protected static FleetControllerOptions.Builder defaultOptions(int nodeCount) { - return defaultOptions("mycluster", IntStream.range(0, nodeCount) - .mapToObj(i -> new ConfiguredNode(i, false)) - .collect(Collectors.toSet())); + return defaultOptions(IntStream.range(0, DEFAULT_NODE_COUNT) + .mapToObj(i -> new ConfiguredNode(i, false)) + .collect(Collectors.toSet())); } - protected static FleetControllerOptions.Builder defaultOptions(String clusterName, Collection<ConfiguredNode> nodes) { - var builder = new FleetControllerOptions.Builder(clusterName, nodes); + protected static FleetControllerOptions.Builder defaultOptions(Collection<ConfiguredNode> nodes) { + var builder = new FleetControllerOptions.Builder("mycluster", nodes); 
builder.enableTwoPhaseClusterStateActivation(true); // Enable by default, tests can explicitly disable. builder.setStorageDistribution(DistributionBuilder.forFlatCluster(builder.nodes().size())); + builder.setZooKeeperServerAddress("localhost:2181"); return builder; } - private void setUpSystem(FleetControllerOptions.Builder builder) throws Exception { - log.log(Level.FINE, "Setting up system"); - if (builder.zooKeeperServerAddress() != null) { - zooKeeperServer = new ZooKeeperTestServer(); - // Need to set zookeeper address again, as port number is not known until ZooKeeperTestServer has been created - builder.setZooKeeperServerAddress(zooKeeperServer.getAddress()); - log.log(Level.FINE, "Set up new zookeeper server at " + zooKeeperServer.getAddress()); - } - builder.setSlobrokConnectionSpecs(getSlobrokConnectionSpecs(slobrok)); + protected void setUpZooKeeperServer(FleetControllerOptions.Builder builder) throws IOException { + zooKeeperServer = new ZooKeeperTestServer(); + // Need to set zookeeper address again, as port number is not known until ZooKeeperTestServer has been created + builder.setZooKeeperServerAddress(zooKeeperServer.getAddress()); + log.log(Level.FINE, "Set up new zookeeper server at " + zooKeeperServer.getAddress()); } FleetController createFleetController(Timer timer, FleetControllerOptions options) { @@ -144,7 +139,8 @@ public abstract class FleetControllerTest implements Waiter { } protected FleetControllerOptions setUpFleetController(Timer timer, FleetControllerOptions.Builder builder) throws Exception { - setUpSystem(builder); + setUpZooKeeperServer(builder); + builder.setSlobrokConnectionSpecs(getSlobrokConnectionSpecs(slobrok)); options = builder.build(); startFleetController(timer); return options; diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java index 
77c89d77ba5..93a96be71a0 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/MasterElectionTest.java @@ -6,7 +6,6 @@ import com.yahoo.jrt.Spec; import com.yahoo.jrt.Supervisor; import com.yahoo.jrt.Target; import com.yahoo.jrt.Transport; -import com.yahoo.jrt.slobrok.server.Slobrok; import com.yahoo.vdslib.state.ClusterState; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; @@ -53,7 +52,6 @@ public class MasterElectionTest extends FleetControllerTest { if (zooKeeperServer == null) { zooKeeperServer = new ZooKeeperTestServer(); } - slobrok = new Slobrok(); builder.setZooKeeperSessionTimeout(defaultZkSessionTimeoutInMillis()) .setZooKeeperServerAddress(zooKeeperServer.getAddress()) .setSlobrokConnectionSpecs(getSlobrokConnectionSpecs(slobrok)) diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java deleted file mode 100644 index 3d3a38aacd4..00000000000 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NoZooKeeperTest.java +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-package com.yahoo.vespa.clustercontroller.core; - -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class NoZooKeeperTest extends FleetControllerTest { - - @Test - void testWantedStatesInZooKeeper() throws Exception { - // Null is the default for zooKeeperServerAddress - FleetControllerOptions.Builder builder = defaultOptions(); - Timer timer = new FakeTimer(); - setUpFleetController(timer, builder); - setUpVdsNodes(timer); - waitForStableSystem(); - - assertTrue(nodes.get(0).isDistributor()); - nodes.get(0).disconnect(); - waitForState("version:\\d+ distributor:10 .0.s:d storage:10"); - - nodes.get(0).connect(); - waitForState("version:\\d+ distributor:10 storage:10"); - } -} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java index 2c77767d6b4..e432efc1447 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java @@ -30,7 +30,7 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest } private FleetControllerOptions.Builder optionsForConfiguredNodes(Set<ConfiguredNode> configuredNodes) { - return defaultOptions("mycluster", configuredNodes) + return defaultOptions(configuredNodes) .setMaxSlobrokDisconnectGracePeriod(60 * 1000) .setNodeStateRequestTimeoutMS(10000 * 60 * 1000) .setMaxTransitionTime(NodeType.DISTRIBUTOR, 0) diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java index 02e3a4a4125..e75ade7309c 100644 --- 
a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/RpcServerTest.java @@ -131,7 +131,7 @@ public class RpcServerTest extends FleetControllerTest { Set<ConfiguredNode> configuredNodes = new TreeSet<>(); for (int i = 0; i < 10; i++) configuredNodes.add(new ConfiguredNode(i, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes); + FleetControllerOptions.Builder builder = defaultOptions(configuredNodes); builder.setMinRatioOfStorageNodesUp(0); builder.setMaxInitProgressTime(30000); builder.setStableStateTimePeriod(60000); @@ -224,7 +224,7 @@ public class RpcServerTest extends FleetControllerTest { for (int i = 0; i < 4; i++) configuredNodes.add(new ConfiguredNode(i, false)); configuredNodes.add(new ConfiguredNode(4, true)); // Last node is configured retired - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) + FleetControllerOptions.Builder builder = defaultOptions(configuredNodes) .setMinRatioOfStorageNodesUp(0) .setMaxInitProgressTime(30000) .setStableStateTimePeriod(60000); @@ -257,7 +257,7 @@ public class RpcServerTest extends FleetControllerTest { List<ConfiguredNode> configuredNodes = new ArrayList<>(); for (int i = 0; i < 5; i++) configuredNodes.add(new ConfiguredNode(i, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) + FleetControllerOptions.Builder builder = defaultOptions(configuredNodes) .setMaxInitProgressTime(30000) .setStableStateTimePeriod(60000); setUpFleetController(timer, builder); @@ -281,10 +281,8 @@ public class RpcServerTest extends FleetControllerTest { configuredNodes.add(new ConfiguredNode(i, true)); configuredNodes.add(new ConfiguredNode(5, false)); configuredNodes.add(new ConfiguredNode(6, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) - 
.setSlobrokConnectionSpecs(this.options.slobrokConnectionSpecs()) - .setMaxInitProgressTime(30000) - .setStableStateTimePeriod(60000); + var builder = FleetControllerOptions.Builder.copy(fleetController().getOptions()) + .setNodes(configuredNodes); fleetController().updateOptions(builder.build()); waitForState("version:\\d+ distributor:7 storage:7 .0.s:m .1.s:m .2.s:r .3.s:r .4.s:r"); } @@ -311,10 +309,8 @@ public class RpcServerTest extends FleetControllerTest { Set<ConfiguredNode> configuredNodes = new TreeSet<>(); for (int i = 0; i < 7; i++) configuredNodes.add(new ConfiguredNode(i, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) - .setSlobrokConnectionSpecs(this.options.slobrokConnectionSpecs()) - .setMaxInitProgressTime(30000) - .setStableStateTimePeriod(60000); + var builder = FleetControllerOptions.Builder.copy(fleetController().getOptions()) + .setNodes(configuredNodes); fleetController().updateOptions(builder.build()); waitForState("version:\\d+ distributor:7 storage:7 .0.s:m .1.s:m"); } @@ -336,7 +332,7 @@ public class RpcServerTest extends FleetControllerTest { List<ConfiguredNode> configuredNodes = new ArrayList<>(); for (int i = 0; i < 5; i++) configuredNodes.add(new ConfiguredNode(i, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) + FleetControllerOptions.Builder builder = defaultOptions(configuredNodes) .setMaxInitProgressTime(30000) .setStableStateTimePeriod(60000); options = builder.build(); @@ -349,10 +345,8 @@ public class RpcServerTest extends FleetControllerTest { Set<ConfiguredNode> configuredNodes = new TreeSet<>(); for (int i = 0; i < 5; i++) configuredNodes.add(new ConfiguredNode(i, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) - .setSlobrokConnectionSpecs(options.slobrokConnectionSpecs()) - .setMaxInitProgressTime(30000) - .setStableStateTimePeriod(60000); + var builder = 
FleetControllerOptions.Builder.copy(fleetController().getOptions()) + .setNodes(configuredNodes); fleetController().updateOptions(builder.build()); waitForState("version:\\d+ distributor:5 storage:5"); } @@ -364,10 +358,8 @@ public class RpcServerTest extends FleetControllerTest { configuredNodes.add(new ConfiguredNode(i, true)); configuredNodes.add(new ConfiguredNode(5, false)); configuredNodes.add(new ConfiguredNode(6, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) - .setSlobrokConnectionSpecs(options.slobrokConnectionSpecs()) - .setMaxInitProgressTime(30000) - .setStableStateTimePeriod(60000); + var builder = FleetControllerOptions.Builder.copy(fleetController().getOptions()) + .setNodes(configuredNodes); fleetController().updateOptions(builder.build()); waitForState("version:\\d+ distributor:7 storage:7 .0.s:r .1.s:r .2.s:r .3.s:r .4.s:r"); } @@ -378,10 +370,8 @@ public class RpcServerTest extends FleetControllerTest { configuredNodes.add(new ConfiguredNode(i, true)); configuredNodes.add(new ConfiguredNode(5, false)); configuredNodes.add(new ConfiguredNode(6, false)); - FleetControllerOptions.Builder builder = defaultOptions("mycluster", configuredNodes) - .setSlobrokConnectionSpecs(options.slobrokConnectionSpecs()) - .setMaxInitProgressTime(30000) - .setStableStateTimePeriod(60000); + var builder = FleetControllerOptions.Builder.copy(fleetController().getOptions()) + .setNodes(configuredNodes); fleetController().updateOptions(builder.build()); waitForState("version:\\d+ distributor:7 storage:7 .0.s:r .1.s:r .2.s:r .3.s:r .4.s:r"); } @@ -411,7 +401,7 @@ public class RpcServerTest extends FleetControllerTest { void testSetNodeState() throws Exception { Set<Integer> nodeIndexes = new TreeSet<>(List.of(4, 6, 9, 10, 14, 16, 21, 22, 23, 25)); Set<ConfiguredNode> configuredNodes = nodeIndexes.stream().map(i -> new ConfiguredNode(i, false)).collect(Collectors.toSet()); - FleetControllerOptions.Builder options = 
defaultOptions("mycluster", configuredNodes); + FleetControllerOptions.Builder options = defaultOptions(configuredNodes); //options.setStorageDistribution(new Distribution(getDistConfig(nodeIndexes))); setUpFleetController(timer, options); setUpVdsNodes(timer, false, nodeIndexes); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index c0e116ef5fe..f2261794b75 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -31,16 +31,18 @@ public class StateChangeTest extends FleetControllerTest { private FleetController ctrl; private DummyCommunicator communicator; - private void initialize(FleetControllerOptions options) throws Exception { + private void initialize(FleetControllerOptions.Builder builder) throws Exception { List<Node> nodes = new ArrayList<>(); - for (int i = 0; i < options.nodes().size(); ++i) { + for (int i = 0; i < builder.nodes().size(); ++i) { nodes.add(new Node(NodeType.STORAGE, i)); nodes.add(new Node(NodeType.DISTRIBUTOR, i)); } - var context = new TestFleetControllerContext(options); + setUpZooKeeperServer(builder); communicator = new DummyCommunicator(nodes, timer); boolean start = false; + FleetControllerOptions options = builder.build(); + var context = new TestFleetControllerContext(options); ctrl = createFleetController(timer, options, context, communicator, communicator, null, start); ctrl.tick(); @@ -72,7 +74,7 @@ public class StateChangeTest extends FleetControllerTest { FleetControllerOptions.Builder options = defaultOptions(); options.setMaxInitProgressTime(50000); - initialize(options.build()); + initialize(options); // Should now pick up previous node states ctrl.tick(); @@ -95,7 +97,7 @@ public class StateChangeTest 
extends FleetControllerTest { // Regular init progress does not update the cluster state until the node is done initializing (or goes down, // whichever comes first). - assertEquals("version:6 distributor:10 .0.s:i .0.i:0.0 .1.s:i .1.i:0.0 .2.s:i .2.i:0.0 .3.s:i .3.i:0.0 " + + assertEquals("version:5 distributor:10 .0.s:i .0.i:0.0 .1.s:i .1.i:0.0 .2.s:i .2.i:0.0 .3.s:i .3.i:0.0 " + ".4.s:i .4.i:0.0 .5.s:i .5.i:0.0 .6.s:i .6.i:0.0 .7.s:i .7.i:0.0 .8.s:i .8.i:0.0 " + ".9.s:i .9.i:0.0 storage:10 .0.s:i .0.i:0.1 .1.s:i .1.i:0.1 .2.s:i .2.i:0.1 .3.s:i .3.i:0.1 " + ".4.s:i .4.i:0.1 .5.s:i .5.i:0.1 .6.s:i .6.i:0.1 .7.s:i .7.i:0.1 .8.s:i .8.i:0.1 .9.s:i .9.i:0.1", @@ -118,12 +120,12 @@ public class StateChangeTest extends FleetControllerTest { timer.advanceTime(options.maxInitProgressTime() / 20); ctrl.tick(); - assertEquals("version:8 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:7 distributor:10 storage:10", ctrl.getSystemState().toString()); verifyNodeEvents(new Node(NodeType.DISTRIBUTOR, 0), """ Event: distributor.0: Now reporting state U - Event: distributor.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: distributor.0: Altered node state in cluster state from 'D' to 'U' Event: distributor.0: Now reporting state I, i 0.00 Event: distributor.0: Altered node state in cluster state from 'U' to 'I, i 0.00' Event: distributor.0: Now reporting state U @@ -133,7 +135,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.STORAGE, 0), """ Event: storage.0: Now reporting state U - Event: storage.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' 
to 'U' + Event: storage.0: Altered node state in cluster state from 'D' to 'U' Event: storage.0: Now reporting state I, i 0.00 (ls) Event: storage.0: Altered node state in cluster state from 'U' to 'D' Event: storage.0: Now reporting state I, i 0.100 (read) @@ -153,7 +155,7 @@ public class StateChangeTest extends FleetControllerTest { // Two-phase cluster state activation changes this quite a bit, so disable it. At least for now. .enableTwoPhaseClusterStateActivation(false); - initialize(builder.build()); + initialize(builder); ctrl.tick(); @@ -164,7 +166,7 @@ public class StateChangeTest extends FleetControllerTest { String desc = ctrl.getReportedNodeState(new Node(NodeType.DISTRIBUTOR, 0)).getDescription(); assertTrue(desc.contains("Closed at other end"), desc); - assertEquals("version:4 distributor:10 .0.s:d storage:10", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 .0.s:d storage:10", ctrl.getSystemState().toString()); timer.advanceTime(1000); @@ -175,7 +177,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:5 distributor:10 .0.t:12345678 storage:10 .0.s:m", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 .0.t:12345678 storage:10 .0.s:m", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.DISTRIBUTOR, 0)).hasDescription()); desc = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).getDescription(); @@ -185,7 +187,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:6 distributor:10 .0.t:12345678 storage:10 .0.s:d", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 .0.t:12345678 storage:10 .0.s:d", ctrl.getSystemState().toString()); desc = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).getDescription(); assertTrue(desc.contains("Closed at other end"), desc); @@ -198,14 +200,14 @@ public class StateChangeTest extends 
FleetControllerTest { ctrl.tick(); - assertEquals("version:7 distributor:10 storage:10 .0.t:12345679", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .0.t:12345679", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).hasDescription()); verifyNodeEvents(new Node(NodeType.DISTRIBUTOR, 0), """ Event: distributor.0: Now reporting state U - Event: distributor.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: distributor.0: Altered node state in cluster state from 'D' to 'U' Event: distributor.0: Failed to get node state: D: Closed at other end Event: distributor.0: Stopped or possibly crashed after 0 ms, which is before stable state time period. Premature crash count is now 1. Event: distributor.0: Altered node state in cluster state from 'U' to 'D: Closed at other end' @@ -217,7 +219,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.STORAGE, 0), """ Event: storage.0: Now reporting state U - Event: storage.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.0: Altered node state in cluster state from 'D' to 'U' Event: storage.0: Failed to get node state: D: Closed at other end Event: storage.0: Stopped or possibly crashed after 1000 ms, which is before stable state time period. Premature crash count is now 1. 
Event: storage.0: Altered node state in cluster state from 'U' to 'M: Closed at other end' @@ -243,7 +245,7 @@ public class StateChangeTest extends FleetControllerTest { .setNodeStateRequestTimeoutMS(60 * 60 * 1000) .setMaxSlobrokDisconnectGracePeriod(100000); - initialize(builder.build()); + initialize(builder); ctrl.tick(); @@ -264,7 +266,7 @@ public class StateChangeTest extends FleetControllerTest { tick(1000); - assertEquals("version:5 distributor:10 storage:10 .0.s:m", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .0.s:m", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.DISTRIBUTOR, 0)).hasDescription()); desc = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).getDescription(); @@ -273,7 +275,7 @@ public class StateChangeTest extends FleetControllerTest { tick(builder.maxTransitionTime().get(NodeType.STORAGE) + 1); - assertEquals("version:6 distributor:10 storage:10 .0.s:d", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .0.s:d", ctrl.getSystemState().toString()); desc = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).getDescription(); assertTrue(desc.contains("Received signal 15 (SIGTERM - Termination signal)") || desc.contains("controlled shutdown"), desc); @@ -282,7 +284,7 @@ public class StateChangeTest extends FleetControllerTest { tick(1000); - assertEquals("version:7 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 0)).hasDescription()); assertEquals(0, ctrl.getCluster().getNodeInfo(new Node(NodeType.DISTRIBUTOR, 0)).getPrematureCrashCount()); @@ -291,7 +293,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.DISTRIBUTOR, 0), """ Event: distributor.0: Now reporting state U - Event: distributor.0: Altered node 
state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: distributor.0: Altered node state in cluster state from 'D' to 'U' Event: distributor.0: Failed to get node state: D: controlled shutdown Event: distributor.0: Altered node state in cluster state from 'U' to 'D: controlled shutdown' Event: distributor.0: Now reporting state U @@ -301,7 +303,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.STORAGE, 0), """ Event: storage.0: Now reporting state U - Event: storage.0: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.0: Altered node state in cluster state from 'D' to 'U' Event: storage.0: Failed to get node state: D: controlled shutdown Event: storage.0: Altered node state in cluster state from 'U' to 'M: controlled shutdown' Event: storage.0: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down. @@ -317,7 +319,7 @@ public class StateChangeTest extends FleetControllerTest { FleetControllerOptions.Builder builder = defaultOptions() .setMaxSlobrokDisconnectGracePeriod(60 * 1000); - initialize(builder.build()); + initialize(builder); ctrl.tick(); @@ -333,7 +335,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); ctrl.tick(); - assertEquals("version:3 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:2 distributor:10 storage:10", ctrl.getSystemState().toString()); nodes = new ArrayList<>(); for (int i = 0; i < 10; ++i) { @@ -345,12 +347,12 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:3 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:2 distributor:10 storage:10", ctrl.getSystemState().toString()); verifyNodeEvents(new Node(NodeType.STORAGE, 0), """ Event: storage.0: Now reporting state U - Event: storage.0: Altered node state in cluster state from 'D: Node not seen in 
slobrok.' to 'U' + Event: storage.0: Altered node state in cluster state from 'D' to 'U' Event: storage.0: Node is no longer in slobrok, but we still have a pending state request. """); } @@ -360,13 +362,13 @@ public class StateChangeTest extends FleetControllerTest { FleetControllerOptions.Builder builder = defaultOptions() .setMaxSlobrokDisconnectGracePeriod(60 * 1000); - initialize(builder.build()); + initialize(builder); communicator.setNodeState(new Node(NodeType.STORAGE, 6), State.DOWN, "Connection error: Closed at other end"); ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); NodeState ns = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 6)); assertTrue(ns.getDescription().contains("Connection error: Closed at other end"), ns.toString()); @@ -378,14 +380,14 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); // Still maintenance since .i progress 0.0 is really down. 
- assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.6f), ""); ctrl.tick(); // Now it's OK - assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.6", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:i .6.i:0.6", ctrl.getSystemState().toString()); tick(1000); @@ -393,13 +395,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 6)).hasDescription()); verifyNodeEvents(new Node(NodeType.STORAGE, 6), """ Event: storage.6: Now reporting state U - Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.6: Altered node state in cluster state from 'D' to 'U' Event: storage.6: Failed to get node state: D: Connection error: Closed at other end Event: storage.6: Stopped or possibly crashed after 0 ms, which is before stable state time period. Premature crash count is now 1. 
Event: storage.6: Altered node state in cluster state from 'U' to 'M: Connection error: Closed at other end' @@ -419,16 +421,16 @@ public class StateChangeTest extends FleetControllerTest { nodes.add(new ConfiguredNode(i, retired)); } - FleetControllerOptions.Builder builder = defaultOptions("mycluster", nodes) + FleetControllerOptions.Builder builder = defaultOptions(nodes) .setMaxSlobrokDisconnectGracePeriod(60 * 1000); - initialize(builder.build()); + initialize(builder); communicator.setNodeState(new Node(NodeType.STORAGE, 6), State.DOWN, "Connection error: Closed at other end"); ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); NodeState ns = ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 6)); assertTrue(ns.getDescription().contains("Connection error: Closed at other end"), ns.toString()); @@ -440,14 +442,14 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); // Still maintenance since .i progress 0.0 is really down. - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.6f), ""); ctrl.tick(); // Still maintenance since configured. 
- assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); tick(1000); @@ -455,13 +457,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:r", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:r", ctrl.getSystemState().toString()); assert(!ctrl.getReportedNodeState(new Node(NodeType.STORAGE, 6)).hasDescription()); verifyNodeEvents(new Node(NodeType.STORAGE, 6), """ Event: storage.6: Now reporting state U - Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'R' + Event: storage.6: Altered node state in cluster state from 'D' to 'R' Event: storage.6: Failed to get node state: D: Connection error: Closed at other end Event: storage.6: Stopped or possibly crashed after 0 ms, which is before stable state time period. Premature crash count is now 1. 
Event: storage.6: Altered node state in cluster state from 'R' to 'M: Connection error: Closed at other end' @@ -480,21 +482,21 @@ public class StateChangeTest extends FleetControllerTest { nodes.add(new ConfiguredNode(i, retired)); } - FleetControllerOptions.Builder builder = defaultOptions("mycluster", nodes) + FleetControllerOptions.Builder builder = defaultOptions(nodes) .setMaxSlobrokDisconnectGracePeriod(60 * 1000); - initialize(builder.build()); + initialize(builder); communicator.setNodeState(new Node(NodeType.STORAGE, 6), State.DOWN, "Connection error: Closed at other end"); ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(100000); ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); } // Test that a node that has been down for a long time (above steady state period), actually alters cluster state to @@ -509,7 +511,7 @@ public class StateChangeTest extends FleetControllerTest { .setNodeStateRequestTimeoutMS(1000000) .setMaxSlobrokDisconnectGracePeriod(1000000); - initialize(builder.build()); + initialize(builder); timer.advanceTime(100000); // Node has been in steady state up ctrl.tick(); @@ -518,40 +520,40 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(100000); // Node has been in steady state down ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); 
communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.001f), ""); ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.1f), ""); ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.UP), ""); ctrl.tick(); - assertEquals("version:7 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10", ctrl.getSystemState().toString()); verifyNodeEvents(new Node(NodeType.STORAGE, 6), """ Event: storage.6: Now reporting state U - Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.6: Altered node state in cluster state from 'D' to 'U' Event: storage.6: Failed to get node state: D: Connection error: Closed at other end Event: storage.6: Altered node state in cluster state from 'U' to 'M: Connection error: Closed at other end' Event: storage.6: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down. 
@@ -573,7 +575,7 @@ public class StateChangeTest extends FleetControllerTest { .setStableStateTimePeriod(1000000) .setMaxSlobrokDisconnectGracePeriod(10000000); - initialize(builder.build()); + initialize(builder); timer.advanceTime(1000000); // Node has been in steady state up @@ -583,26 +585,26 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(1000000); // Node has been in steady state down ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.1f), ""); ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.1", ctrl.getSystemState().toString()); timer.advanceTime(builder.maxInitProgressTime() + 1); ctrl.tick(); // We should now get the node marked down. - assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -621,7 +623,7 @@ public class StateChangeTest extends FleetControllerTest { tick(1000); // Still down since it seemingly crashed during last init. 
- assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); ctrl.tick(); @@ -629,12 +631,12 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:8 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:7 distributor:10 storage:10", ctrl.getSystemState().toString()); verifyNodeEvents(new Node(NodeType.STORAGE, 6), """ Event: storage.6: Now reporting state U - Event: storage.6: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.6: Altered node state in cluster state from 'D' to 'U' Event: storage.6: Failed to get node state: D: Connection error: Closed at other end Event: storage.6: Altered node state in cluster state from 'U' to 'M: Connection error: Closed at other end' Event: storage.6: Exceeded implicit maintenance mode grace period of 5000 milliseconds. Marking node down. 
@@ -662,7 +664,7 @@ public class StateChangeTest extends FleetControllerTest { // Set long so we don't time out RPC requests and mark nodes down due to advancing time to get in steady state builder.setNodeStateRequestTimeoutMS((int) builder.stableStateTimePeriod() * 2); - initialize(builder.build()); + initialize(builder); timer.advanceTime(1000000); // Node has been in steady state up @@ -672,19 +674,19 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(1000000); // Node has been in steady state down ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), new NodeState(NodeType.STORAGE, State.INITIALIZING).setInitProgress(0.3f), ""); ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10 .6.s:i .6.i:0.3", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.3", ctrl.getSystemState().toString()); ctrl.tick(); @@ -692,7 +694,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); } @Test @@ -705,7 +707,7 @@ public class StateChangeTest extends FleetControllerTest { // Set very high so the advanceTime don't start sending state replies right before we disconnect. 
.setNodeStateRequestTimeoutMS(365 * 24 * 60 * 1000); - initialize(builder.build()); + initialize(builder); timer.advanceTime(1000000); // Node has been in steady state up @@ -715,13 +717,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(1000000); // Node has been in steady state down ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); ctrl.tick(); @@ -729,7 +731,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:6 distributor:10 storage:10 .6.s:i .6.i:0.3", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 storage:10 .6.s:i .6.i:0.3", ctrl.getSystemState().toString()); ctrl.tick(); @@ -737,7 +739,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -745,13 +747,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.STORAGE, 6), State.UP, ""); ctrl.tick(); - assertEquals("version:8 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:7 distributor:10 storage:10", ctrl.getSystemState().toString()); } @Test @@ -764,7 +766,7 @@ public class StateChangeTest extends FleetControllerTest { 
.setStableStateTimePeriod(1000000) .setMaxSlobrokDisconnectGracePeriod(10000000); - initialize(builder.build()); + initialize(builder); timer.advanceTime(1000000); // Node has been in steady state up @@ -774,13 +776,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 storage:10 .6.s:m", ctrl.getSystemState().toString()); timer.advanceTime(1000000); // Node has been in steady state down ctrl.tick(); - assertEquals("version:5 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); for (int j = 0; j <= builder.maxPrematureCrashes(); ++j) { ctrl.tick(); @@ -804,7 +806,7 @@ public class StateChangeTest extends FleetControllerTest { tick(1000); } - assertEquals("version:7 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 distributor:10 storage:10 .6.s:d", ctrl.getSystemState().toString()); } @Test @@ -818,13 +820,13 @@ public class StateChangeTest extends FleetControllerTest { .setMinRatioOfDistributorNodesUp(0.0) .setMinRatioOfStorageNodesUp(0.0); - initialize(builder.build()); + initialize(builder); timer.advanceTime(1000000); // Node has been in steady state up ctrl.tick(); - assertEquals("version:3 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:2 distributor:10 storage:10", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.DISTRIBUTOR, 0), State.DOWN, "Connection error: Closed at other end"); communicator.setNodeState(new Node(NodeType.DISTRIBUTOR, 1), State.DOWN, "Connection error: Closed at other end"); @@ -836,13 +838,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d 
.1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.DISTRIBUTOR, 4), State.DOWN, "Connection error: Closed at other end"); ctrl.tick(); - assertEquals("version:5 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d .4.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d .4.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -850,7 +852,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:6 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -858,7 +860,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:7 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d .2.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d .2.s:d", ctrl.getSystemState().toString()); } @Test @@ -872,13 +874,13 @@ public class StateChangeTest extends FleetControllerTest { options.setMinRatioOfDistributorNodesUp(0.6); options.setMinRatioOfStorageNodesUp(0.8); - initialize(options.build()); + initialize(options); timer.advanceTime(1000000); // Node has been in steady state up ctrl.tick(); - assertEquals("version:3 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:2 distributor:10 storage:10", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.DISTRIBUTOR, 0), State.DOWN, "Connection error: Closed at other end"); communicator.setNodeState(new 
Node(NodeType.DISTRIBUTOR, 1), State.DOWN, "Connection error: Closed at other end"); @@ -890,13 +892,13 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:4 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:3 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); communicator.setNodeState(new Node(NodeType.DISTRIBUTOR, 4), State.DOWN, "Connection error: Closed at other end"); ctrl.tick(); - assertEquals("version:5 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d .4.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:4 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d .4.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -904,7 +906,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:6 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); + assertEquals("version:5 distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d", ctrl.getSystemState().toString()); tick(1000); @@ -912,7 +914,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:7 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d .2.s:d", ctrl.getSystemState().toString()); + assertEquals("version:6 cluster:d distributor:10 .0.s:d .1.s:d .2.s:d .3.s:d storage:10 .0.s:d .1.s:d .2.s:d", ctrl.getSystemState().toString()); } /** @@ -1044,7 +1046,7 @@ public class StateChangeTest extends FleetControllerTest { FleetControllerOptions.Builder options = defaultOptions(); options.setDistributionBits(17); - initialize(options.build()); + initialize(options); timer.advanceTime(1000000); // Node has been in steady state up @@ -1054,7 +1056,7 @@ public class StateChangeTest extends 
FleetControllerTest { ctrl.tick(); - assertEquals("version:4 bits:15 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:3 bits:15 distributor:10 storage:10", ctrl.getSystemState().toString()); tick(1000); @@ -1062,7 +1064,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); - assertEquals("version:5 bits:13 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:4 bits:13 distributor:10 storage:10", ctrl.getSystemState().toString()); tick(1000); setMinUsedBitsForAllNodes(16); @@ -1070,13 +1072,13 @@ public class StateChangeTest extends FleetControllerTest { // Don't increase dist bits until we've reached at least the wanted // level, in order to avoid multiple full redistributions of data. - assertEquals("version:5 bits:13 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:4 bits:13 distributor:10 storage:10", ctrl.getSystemState().toString()); tick(1000); setMinUsedBitsForAllNodes(19); ctrl.tick(); - assertEquals("version:6 bits:17 distributor:10 storage:10", ctrl.getSystemState().toString()); + assertEquals("version:5 bits:17 distributor:10 storage:10", ctrl.getSystemState().toString()); } private void setMinUsedBitsForAllNodes(int bits) { @@ -1140,16 +1142,16 @@ public class StateChangeTest extends FleetControllerTest { options.setMaxTransitionTime(NodeType.STORAGE, 0); options.setMinStorageNodesUp(10); options.setMinDistributorNodesUp(10); - initialize(options.build()); + initialize(options); ctrl.tick(); - assertThat(ctrl.consolidatedClusterState().toString(), equalTo("version:3 distributor:10 storage:10")); + assertThat(ctrl.consolidatedClusterState().toString(), equalTo("version:2 distributor:10 storage:10")); communicator.setNodeState(new Node(NodeType.STORAGE, 2), State.DOWN, "foo"); ctrl.tick(); assertThat(ctrl.consolidatedClusterState().toString(), - equalTo("version:4 cluster:d distributor:10 storage:10 .2.s:d")); + 
equalTo("version:3 cluster:d distributor:10 storage:10 .2.s:d")); // After this point, any further node changes while the cluster is still down won't be published. // This is because cluster state similarity checks are short-circuited if both are Down, as no other parts @@ -1162,7 +1164,7 @@ public class StateChangeTest extends FleetControllerTest { // NOTE: _same_ version, different node state content. Overall cluster down-state is still the same. assertThat(ctrl.consolidatedClusterState().toString(), - equalTo("version:4 cluster:d distributor:10 storage:10 .2.s:d .5.s:d")); + equalTo("version:3 cluster:d distributor:10 storage:10 .2.s:d .5.s:d")); } // Related to the above test, watchTimer invocations must receive the _current_ state and not the @@ -1174,7 +1176,7 @@ public class StateChangeTest extends FleetControllerTest { options.setMaxTransitionTime(NodeType.STORAGE, 1000); options.setMinStorageNodesUp(10); options.setMinDistributorNodesUp(10); - initialize(options.build()); + initialize(options); ctrl.tick(); communicator.setNodeState(new Node(NodeType.STORAGE, 2), State.DOWN, "foo"); @@ -1182,7 +1184,7 @@ public class StateChangeTest extends FleetControllerTest { ctrl.tick(); communicator.setNodeState(new Node(NodeType.STORAGE, 3), State.DOWN, "foo"); ctrl.tick(); - assertThat(ctrl.consolidatedClusterState().toString(), equalTo("version:4 cluster:d distributor:10 storage:10 .2.s:m .3.s:m")); + assertThat(ctrl.consolidatedClusterState().toString(), equalTo("version:3 cluster:d distributor:10 storage:10 .2.s:m .3.s:m")); // Subsequent timer tick should _not_ trigger additional events. Providing published state // only would result in "Marking node down" events for node 2 emitted per tick. @@ -1194,7 +1196,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.STORAGE, 2), """ Event: storage.2: Now reporting state U - Event: storage.2: Altered node state in cluster state from 'D: Node not seen in slobrok.' 
to 'U' + Event: storage.2: Altered node state in cluster state from 'D' to 'U' Event: storage.2: Failed to get node state: D: foo Event: storage.2: Stopped or possibly crashed after 500 ms, which is before stable state time period. Premature crash count is now 1. Event: storage.2: Altered node state in cluster state from 'U' to 'M: foo' @@ -1208,7 +1210,7 @@ public class StateChangeTest extends FleetControllerTest { @Test void do_not_emit_multiple_events_when_node_state_does_not_match_versioned_state() throws Exception { FleetControllerOptions.Builder options = defaultOptions(); - initialize(options.build()); + initialize(options); ctrl.tick(); communicator.setNodeState( @@ -1240,7 +1242,7 @@ public class StateChangeTest extends FleetControllerTest { verifyNodeEvents(new Node(NodeType.STORAGE, 2), """ Event: storage.2: Now reporting state U - Event: storage.2: Altered node state in cluster state from 'D: Node not seen in slobrok.' to 'U' + Event: storage.2: Altered node state in cluster state from 'D' to 'U' Event: storage.2: Now reporting state I, i 0.100 (read) Event: storage.2: Altered node state in cluster state from 'U' to 'I, i 0.100 (read)' Event: storage.2: Altered min distribution bit count from 16 to 17 @@ -1307,7 +1309,7 @@ public class StateChangeTest extends FleetControllerTest { // TODO ideally we'd break this out so it doesn't depend on fields in the parent test instance, but // fleet controller tests have a _lot_ of state, so risk of duplicating a lot of that... 
class RemoteTaskFixture { - RemoteTaskFixture(FleetControllerOptions options) throws Exception { + RemoteTaskFixture(FleetControllerOptions.Builder options) throws Exception { initialize(options); ctrl.tick(); } @@ -1392,12 +1394,12 @@ public class StateChangeTest extends FleetControllerTest { return options; } - private RemoteTaskFixture createFixtureWith(FleetControllerOptions options) throws Exception { + private RemoteTaskFixture createFixtureWith(FleetControllerOptions.Builder options) throws Exception { return new RemoteTaskFixture(options); } private RemoteTaskFixture createDefaultFixture() throws Exception { - return new RemoteTaskFixture(defaultOptions().build()); + return new RemoteTaskFixture(defaultOptions()); } @Test @@ -1430,7 +1432,7 @@ public class StateChangeTest extends FleetControllerTest { @Test void no_op_synchronous_remote_task_can_complete_immediately_if_current_state_already_acked() throws Exception { - RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime().build()); + RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime()); fixture.markStorageNodeDown(0); MockTask task = fixture.scheduleNoOpVersionDependentTask(); // Tries to set node 0 into Down; already in that state @@ -1443,7 +1445,7 @@ public class StateChangeTest extends FleetControllerTest { @Test void no_op_synchronous_remote_task_waits_until_current_state_is_acked() throws Exception { - RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime().build()); + RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime()); communicator.setShouldDeferDistributorClusterStateAcks(true); fixture.markStorageNodeDown(0); @@ -1467,7 +1469,7 @@ public class StateChangeTest extends FleetControllerTest { // the cluster down-state to have been published. 
@Test void immediately_complete_sync_remote_task_when_cluster_is_down() throws Exception { - RemoteTaskFixture fixture = createFixtureWith(optionsAllowingZeroNodesDown().build()); + RemoteTaskFixture fixture = createFixtureWith(optionsAllowingZeroNodesDown()); // Controller options require 10/10 nodes up, so take one down to trigger a cluster Down edge. fixture.markStorageNodeDown(1); MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); @@ -1501,7 +1503,7 @@ public class StateChangeTest extends FleetControllerTest { void synchronous_task_immediately_failed_when_leadership_lost() throws Exception { FleetControllerOptions.Builder options = optionsWithZeroTransitionTime(); options.setCount(3); - RemoteTaskFixture fixture = createFixtureWith(options.build()); + RemoteTaskFixture fixture = createFixtureWith(options); fixture.winLeadership(); markAllNodesAsUp(options.build()); @@ -1526,7 +1528,7 @@ public class StateChangeTest extends FleetControllerTest { void cluster_state_ack_is_not_dependent_on_state_send_grace_period() throws Exception { FleetControllerOptions.Builder options = defaultOptions(); options.setMinTimeBetweenNewSystemStates(10_000); - RemoteTaskFixture fixture = createFixtureWith(options.build()); + RemoteTaskFixture fixture = createFixtureWith(options); // Have to increment timer here to be able to send state generated by the scheduled task timer.advanceTime(10_000); @@ -1546,11 +1548,10 @@ public class StateChangeTest extends FleetControllerTest { void synchronous_task_immediately_answered_when_not_leader() throws Exception { FleetControllerOptions.Builder builder = optionsWithZeroTransitionTime(); builder.setCount(3); - var options = builder.build(); - RemoteTaskFixture fixture = createFixtureWith(options); + RemoteTaskFixture fixture = createFixtureWith(builder); fixture.loseLeadership(); - markAllNodesAsUp(options); + markAllNodesAsUp(ctrl.getOptions()); MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); @@ 
-1562,7 +1563,7 @@ public class StateChangeTest extends FleetControllerTest { void task_not_completed_within_deadline_is_failed_with_deadline_exceeded_error() throws Exception { FleetControllerOptions.Builder builder = defaultOptions(); builder.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60)); - RemoteTaskFixture fixture = createFixtureWith(builder.build()); + RemoteTaskFixture fixture = createFixtureWith(builder); MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); communicator.setShouldDeferDistributorClusterStateAcks(true); @@ -1588,7 +1589,7 @@ public class StateChangeTest extends FleetControllerTest { options.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60)); options.enableTwoPhaseClusterStateActivation(deferredActivation); options.setMaxDivergentNodesPrintedInTaskErrorMessages(10); - RemoteTaskFixture fixture = createFixtureWith(options.build()); + RemoteTaskFixture fixture = createFixtureWith(options); MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); communicator.setShouldDeferDistributorClusterStateAcks(true); @@ -1610,14 +1611,14 @@ public class StateChangeTest extends FleetControllerTest { @Test void task_not_completed_within_deadline_lists_nodes_not_converged_in_error_message() throws Exception { doTestTaskDeadlineExceeded(false, "the following nodes have not converged to " + - "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " + + "at least version 3: distributor.0, distributor.1, distributor.2, distributor.3, " + "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9"); } @Test void task_not_completed_within_deadline_with_deferred_activation_checks_activation_version() throws Exception { doTestTaskDeadlineExceeded(true, "the following nodes have not converged to " + - "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " + + "at least version 3: distributor.0, distributor.1, distributor.2, 
distributor.3, " + "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9 " + "(... and 10 more)"); } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java index b533168e61a..1018515cbfa 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/rpc/RPCCommunicatorTest.java @@ -76,10 +76,11 @@ public class RPCCommunicatorTest { @Test void testGenerateNodeStateRequestTimeoutMsWithUpdates() { final RPCCommunicator communicator = new RPCCommunicator(RPCCommunicator.createRealSupervisor(), null /* Timer */, INDEX, 1, 1, 100, 0); - FleetControllerOptions.Builder builder = new FleetControllerOptions.Builder(null /*clustername*/, Set.of(new ConfiguredNode(0, false))); - builder.setNodeStateRequestTimeoutEarliestPercentage(100); - builder.setNodeStateRequestTimeoutLatestPercentage(100); - builder.setNodeStateRequestTimeoutMS(NODE_STATE_REQUEST_TIMEOUT_INTERVAL_MAX_MS); + FleetControllerOptions.Builder builder = new FleetControllerOptions.Builder(null /*clustername*/, Set.of(new ConfiguredNode(0, false))) + .setNodeStateRequestTimeoutEarliestPercentage(100) + .setNodeStateRequestTimeoutLatestPercentage(100) + .setNodeStateRequestTimeoutMS(NODE_STATE_REQUEST_TIMEOUT_INTERVAL_MAX_MS) + .setZooKeeperServerAddress("localhost:2181"); communicator.propagateOptions(builder.build()); long timeOutMs = communicator.generateNodeStateRequestTimeout().toMillis(); assertEquals(timeOutMs, NODE_STATE_REQUEST_TIMEOUT_INTERVAL_MAX_MS); |