-rw-r--r--  clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java | 1
-rw-r--r--  clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java | 7
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java | 2
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java | 11
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java | 10
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java | 117
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java | 32
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java | 202
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java | 5
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java | 136
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java | 107
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java | 106
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java | 100
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java | 370
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java | 188
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java | 39
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java | 94
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java | 4
-rw-r--r--  config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java | 18
-rw-r--r--  config-model/src/main/resources/schema/content.rnc | 7
-rw-r--r--  config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java | 34
-rw-r--r--  configdefinitions/src/vespa/fleetcontroller.def | 8
22 files changed, 1411 insertions(+), 187 deletions(-)
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
index 07b0a68520b..6b02debc182 100644
--- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
+++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
@@ -69,6 +69,7 @@ public class ClusterControllerClusterConfigurer {
options.minTimeBetweenNewSystemStates = config.min_time_between_new_systemstates();
options.maxSlobrokDisconnectGracePeriod = (int) (config.max_slobrok_disconnect_grace_period() * 1000);
options.distributionBits = config.ideal_distribution_bits();
+ options.minNodeRatioPerGroup = config.min_node_ratio_per_group();
}
private void configure(SlobroksConfig config) {
diff --git a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
index 6cf4f962c9f..79da2574dbc 100644
--- a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
+++ b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
@@ -22,7 +22,11 @@ public class ClusterControllerClusterConfigurerTest extends TestCase {
group.nodes.add(node);
distributionConfig.group.add(group);
FleetcontrollerConfig.Builder fleetcontrollerConfig = new FleetcontrollerConfig.Builder();
- fleetcontrollerConfig.cluster_name("storage").index(0).zookeeper_server("zoo");
+ fleetcontrollerConfig
+ .cluster_name("storage")
+ .index(0)
+ .zookeeper_server("zoo")
+ .min_node_ratio_per_group(0.123);
SlobroksConfig.Builder slobroksConfig = new SlobroksConfig.Builder();
SlobroksConfig.Slobrok.Builder slobrok = new SlobroksConfig.Slobrok.Builder();
slobrok.connectionspec("foo");
@@ -47,6 +51,7 @@ public class ClusterControllerClusterConfigurerTest extends TestCase {
metric
);
assertTrue(configurer.getOptions() != null);
+ assertEquals(0.123, configurer.getOptions().minNodeRatioPerGroup, 0.01);
// OK with no zookeeper if one node
zookeepersConfig.zookeeperserverlist("");
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 67681f87d92..98648451e1d 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -191,7 +191,7 @@ public class ContentCluster {
* @param newState state wanted to be set @return NodeUpgradePrechecker.Response
*/
public NodeStateChangeChecker.Result calculateEffectOfNewState(
- Node node, int clusterState, SetUnitStateRequest.Condition condition, NodeState oldState, NodeState newState) {
+ Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition, NodeState oldState, NodeState newState) {
NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker(
minStorageNodesUp,
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 572e24bcb35..ceeeddf49fa 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -416,8 +416,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
systemStateGenerator.setStableStateTimePeriod(options.stableStateTimePeriod);
systemStateGenerator.setMinNodesUp(options.minDistributorNodesUp, options.minStorageNodesUp,
options.minRatioOfDistributorNodesUp, options.minRatioOfStorageNodesUp);
+ systemStateGenerator.setMinNodeRatioPerGroup(options.minNodeRatioPerGroup);
systemStateGenerator.setMaxSlobrokDisconnectGracePeriod(options.maxSlobrokDisconnectGracePeriod);
systemStateGenerator.setDistributionBits(options.distributionBits);
+ systemStateGenerator.setDistribution(options.storageDistribution);
masterElectionHandler.setFleetControllerCount(options.fleetControllerCount);
masterElectionHandler.setMasterZooKeeperCooldownPeriod(options.masterZooKeeperCooldownPeriod);
@@ -426,9 +428,9 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
try{
rpcServer.setSlobrokConnectionSpecs(options.slobrokConnectionSpecs, options.rpcPort);
} catch (ListenFailedException e) {
- log.log(LogLevel.WARNING, "Failed to bind RPC server to port " + options.rpcPort +". This may be natural if cluster have altered the services running on this node: " + e.getMessage());
+ log.log(LogLevel.WARNING, "Failed to bind RPC server to port " + options.rpcPort +". This may be natural if cluster has altered the services running on this node: " + e.getMessage());
} catch (Exception e) {
- log.log(LogLevel.WARNING, "Failed to initailize RPC server socket: " + e.getMessage());
+ log.log(LogLevel.WARNING, "Failed to initialize RPC server socket: " + e.getMessage());
}
}
@@ -436,7 +438,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
try{
statusPageServer.setPort(options.httpPort);
} catch (Exception e) {
- log.log(LogLevel.WARNING, "Failed to initialize status server socket. This may be natural if cluster have altered the services running on this node: " + e.getMessage());
+ log.log(LogLevel.WARNING, "Failed to initialize status server socket. This may be natural if cluster has altered the services running on this node: " + e.getMessage());
}
}
@@ -503,7 +505,6 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
didWork |= systemStateBroadcaster.processResponses();
if (masterElectionHandler.isMaster()) {
didWork |= broadcastClusterStateToEligibleNodes();
-
}
didWork |= processAnyPendingStatusPageRequest();
@@ -637,7 +638,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
// Send getNodeState requests to zero or more nodes.
didWork |= stateGatherer.sendMessages(cluster, communicator, this);
didWork |= systemStateGenerator.watchTimers(cluster, this);
- didWork |= systemStateGenerator.notifyIfNewSystemState(this);
+ didWork |= systemStateGenerator.notifyIfNewSystemState(cluster, this);
if ( ! isStateGatherer) {
if ( ! isMaster) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index f18fa6d3a9b..a2449449a05 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -69,6 +69,16 @@ public class FleetControllerOptions implements Cloneable {
public double minRatioOfStorageNodesUp = 0.50;
/**
+ * Minimum ratio of nodes in an "available" state (up, initializing or maintenance)
+ * that shall be present in a group for the group itself to be considered available.
+ * If the ratio of available nodes drops below this limit, the group's nodes will be
+ * implicitly taken down.
+ *
+ * A value of 0.0 implies that the group auto-takedown feature is effectively disabled.
+ */
+ public double minNodeRatioPerGroup = 0.0;
+
+ /**
* Milliseconds to sleep after doing a work cycle where we did no work. Some events do not interrupt the sleeping,
* such as slobrok changes, so shouldn't set this too high.
*/
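A quick worked example of the threshold arithmetic behind the new option (a stand-alone sketch, not part of the patch; it mirrors computeGroupAvailability in the GroupAvailabilityCalculator added below). With three nodes per group, losing one node leaves 2/3 ≈ 0.667 of the group available, which trips a 0.67 threshold but not a 0.65 one:

    // Illustrative only; class name and values are made up for the example.
    public class MinNodeRatioExample {
        public static void main(String[] args) {
            final int nodesInGroup = 3;
            final int availableNodes = 2; // one node just went down
            final double availability = availableNodes / (double) nodesInGroup; // 0.666...
            System.out.println(availability < 0.67); // true  -> group implicitly taken down
            System.out.println(availability < 0.65); // false -> group left up
        }
    }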
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
new file mode 100644
index 00000000000..74b15b61ac3
--- /dev/null
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
@@ -0,0 +1,117 @@
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.distribution.ConfiguredNode;
+import com.yahoo.vdslib.distribution.Distribution;
+import com.yahoo.vdslib.distribution.Group;
+import com.yahoo.vdslib.distribution.GroupVisitor;
+import com.yahoo.vdslib.state.ClusterState;
+import com.yahoo.vdslib.state.Node;
+import com.yahoo.vdslib.state.NodeState;
+import com.yahoo.vdslib.state.NodeType;
+import com.yahoo.vdslib.state.State;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Stream;
+
+class GroupAvailabilityCalculator {
+ private final Distribution distribution;
+ private final double minNodeRatioPerGroup;
+
+ private GroupAvailabilityCalculator(Distribution distribution,
+ double minNodeRatioPerGroup)
+ {
+ this.distribution = distribution;
+ this.minNodeRatioPerGroup = minNodeRatioPerGroup;
+ }
+
+ public static class Builder {
+ private Distribution distribution;
+ private double minNodeRatioPerGroup = 1.0;
+
+ Builder withDistribution(Distribution distribution) {
+ this.distribution = distribution;
+ return this;
+ }
+ Builder withMinNodeRatioPerGroup(double minRatio) {
+ this.minNodeRatioPerGroup = minRatio;
+ return this;
+ }
+ GroupAvailabilityCalculator build() {
+ return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup);
+ }
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ private class InsufficientAvailabilityGroupVisitor implements GroupVisitor {
+ private final Set<Integer> implicitlyDown = new HashSet<>();
+ private final ClusterState clusterState;
+
+ public InsufficientAvailabilityGroupVisitor(ClusterState clusterState) {
+ this.clusterState = clusterState;
+ }
+
+ private boolean nodeIsAvailableInState(final int index, final String states) {
+ return clusterState.getNodeState(new Node(NodeType.STORAGE, index)).getState().oneOf(states);
+ }
+
+ private Stream<ConfiguredNode> availableNodesIn(Group g) {
+ // We consider nodes in states (u)p, (i)nitializing, (m)aintenance as being
+ // available from the perspective of taking entire groups down (even though
+ // maintenance mode is a half-truth in this regard).
+ return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "uim"));
+ }
+
+ private Stream<ConfiguredNode> candidateNodesForSettingDown(Group g) {
+ // We don't implicitly set (m)aintenance nodes down, as these are usually set
+ // in maintenance for a good reason (e.g. orchestration or manual reboot).
+ // Similarly, we don't take down (r)etired nodes as these may contain data
+ // that the rest of the cluster needs.
+ return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "ui"));
+ }
+
+ private double computeGroupAvailability(Group g) {
+ // TODO also look at distributors
+ final long availableNodes = availableNodesIn(g).count();
+ // Model should make it impossible to deploy with zero nodes in a group,
+ // so no div by zero risk.
+ return availableNodes / (double)g.getNodes().size();
+ }
+
+ private void markAllAvailableGroupNodeIndicesAsDown(Group group) {
+ candidateNodesForSettingDown(group).forEach(n -> implicitlyDown.add(n.index()));
+ }
+
+ @Override
+ public boolean visitGroup(Group group) {
+ if (group.isLeafGroup()) {
+ if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
+ markAllAvailableGroupNodeIndicesAsDown(group);
+ }
+ }
+ return true;
+ }
+
+ Set<Integer> implicitlyDownNodeIndices() {
+ return implicitlyDown;
+ }
+ }
+
+ private static boolean isFlatCluster(Group root) {
+ return root.isLeafGroup();
+ }
+
+ public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
+ if (isFlatCluster(distribution.getRootGroup())) {
+ // Implicit group takedown only applies to hierarchic cluster setups.
+ return new HashSet<>();
+ }
+ InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(state);
+ distribution.visitGroups(visitor);
+ return visitor.implicitlyDownNodeIndices();
+ }
+
+}
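A minimal usage sketch for the new calculator (assumed to live in the same package, since the class and its builder are package-private; it uses the DistributionBuilder test helper added later in this diff, and ClusterState's string constructor, which can throw, hence the throws clause):

    // Three groups of two nodes; at least 51% of each group must be available.
    static Set<Integer> demo() throws Exception {
        GroupAvailabilityCalculator calc = GroupAvailabilityCalculator.builder()
                .withDistribution(DistributionBuilder.forHierarchicCluster(
                        DistributionBuilder.withGroups(3).eachWithNodeCount(2)))
                .withMinNodeRatioPerGroup(0.51)
                .build();
        // Node 5 is down, so its group is at 1/2 < 0.51 availability; its still-up
        // partner (node 4) is returned as a node that should be implicitly set down.
        return calc.nodesThatShouldBeDown(new ClusterState("distributor:6 storage:6 .5.s:d"));
    }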
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index f312194c15d..12018ac4b6f 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -1,6 +1,7 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
+import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
@@ -79,7 +80,7 @@ public class NodeStateChangeChecker {
}
public Result evaluateTransition(
- Node node, int clusterStateVersion, SetUnitStateRequest.Condition condition,
+ Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition,
NodeState oldState, NodeState newState) {
if (condition == SetUnitStateRequest.Condition.FORCE) {
return Result.allowSettingOfWantedState();
@@ -107,7 +108,7 @@ public class NodeStateChangeChecker {
case UP:
return canSetStateUp(node, oldState.getState());
case MAINTENANCE:
- return canSetStateMaintenance(node, clusterStateVersion);
+ return canSetStateMaintenance(node, clusterState);
default:
return Result.createDisallowed("Safe only supports state UP and MAINTENANCE, you tried: " + newState);
}
@@ -127,17 +128,17 @@ public class NodeStateChangeChecker {
return Result.allowSettingOfWantedState();
}
- private Result canSetStateMaintenance(Node node, int clusterStateVersion) {
+ private Result canSetStateMaintenance(Node node, ClusterState clusterState) {
NodeInfo nodeInfo = clusterInfo.getNodeInfo(node);
if (nodeInfo == null) {
return Result.createDisallowed("Unknown node " + node);
}
- NodeState reportedState = nodeInfo.getReportedState();
- if (reportedState.getState() == State.DOWN) {
+ NodeState currentState = clusterState.getNodeState(node);
+ if (currentState.getState() == State.DOWN) {
return Result.allowSettingOfWantedState();
}
- Result checkDistributorsResult = checkDistributors(node, clusterStateVersion);
+ Result checkDistributorsResult = checkDistributors(node, clusterState.getVersion());
if (!checkDistributorsResult.settingWantedStateIsAllowed()) {
return checkDistributorsResult;
}
@@ -151,7 +152,7 @@ public class NodeStateChangeChecker {
return Result.createDisallowed("There are only " + clusterInfo.getStorageNodeInfo().size() +
" storage nodes up, while config requires at least " + minStorageNodesUp);
}
- Result fractionCheck = isFractionHighEnough();
+ Result fractionCheck = isFractionHighEnough(clusterState);
if (!fractionCheck.settingWantedStateIsAllowed()) {
return fractionCheck;
}
@@ -168,16 +169,23 @@ public class NodeStateChangeChecker {
return Result.allowSettingOfWantedState();
}
- private Result isFractionHighEnough() {
+ private int contentNodesWithAvailableNodeState(ClusterState clusterState) {
+ final int nodeCount = clusterState.getNodeCount(NodeType.STORAGE);
int upNodesCount = 0;
- int nodesCount = 0;
- for (StorageNodeInfo storageNodeInfo : clusterInfo.getStorageNodeInfo()) {
- nodesCount++;
- State state = storageNodeInfo.getReportedState().getState();
+ for (int i = 0; i < nodeCount; ++i) {
+ final Node node = new Node(NodeType.STORAGE, i);
+ final State state = clusterState.getNodeState(node).getState();
if (state == State.UP || state == State.RETIRED || state == State.INITIALIZING) {
upNodesCount++;
}
}
+ return upNodesCount;
+ }
+
+ private Result isFractionHighEnough(ClusterState clusterState) {
+ final int nodesCount = clusterInfo.getStorageNodeInfo().size();
+ final int upNodesCount = contentNodesWithAvailableNodeState(clusterState);
+
if (nodesCount == 0) {
return Result.createDisallowed("No storage nodes in cluster state, not safe to restart.");
}
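The checker now derives availability from the cluster state instead of from nodes' reported states. The counting loop can be illustrated stand-alone (assumed example state string; same vdslib imports as the file above, and ClusterState parsing can throw):

    ClusterState state = new ClusterState("distributor:4 storage:4 .1.s:d .2.s:r");
    int available = 0;
    for (int i = 0; i < state.getNodeCount(NodeType.STORAGE); ++i) {
        final State s = state.getNodeState(new Node(NodeType.STORAGE, i)).getState();
        if (s == State.UP || s == State.RETIRED || s == State.INITIALIZING) {
            ++available; // nodes 0 and 3 are up, node 2 is retired -> available ends at 3
        }
    }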
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
index 5cf88b68f29..3dd1216d5d0 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.jrt.Spec;
import com.yahoo.log.LogLevel;
import com.yahoo.vdslib.distribution.ConfiguredNode;
+import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.*;
import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
@@ -18,7 +19,8 @@ import java.util.stream.Collectors;
/**
* This class gets node state updates and uses them to decide the cluster state.
*/
- // TODO: Remove all current state from this and make it rely on state from ClusterInfo instead
+// TODO: Remove all current state from this and make it rely on state from ClusterInfo instead
+// TODO: Do this ASAP! SystemStateGenerator should ideally behave as a pure function!
public class SystemStateGenerator {
private static Logger log = Logger.getLogger(SystemStateGenerator.class.getName());
@@ -27,6 +29,7 @@ public class SystemStateGenerator {
private final EventLogInterface eventLog;
private ClusterStateView currentClusterStateView;
private ClusterStateView nextClusterStateView;
+ private Distribution distribution;
private boolean nextStateViewChanged = false;
private boolean isMaster = false;
@@ -41,6 +44,7 @@ public class SystemStateGenerator {
private int minStorageNodesUp = 1;
private double minRatioOfDistributorNodesUp = 0.50;
private double minRatioOfStorageNodesUp = 0.50;
+ private double minNodeRatioPerGroup = 0.0;
private int maxSlobrokDisconnectGracePeriod = 1000;
private int idealDistributionBits = 16;
private static final boolean disableUnstableNodes = true;
@@ -116,17 +120,18 @@ public class SystemStateGenerator {
minStorageNodesUp = minStorNodes;
minRatioOfDistributorNodesUp = minDistRatio;
minRatioOfStorageNodesUp = minStorRatio;
- nextStateViewChanged = true; // ... maybe
+ nextStateViewChanged = true;
+ }
+
+ public void setMinNodeRatioPerGroup(double upRatio) {
+ this.minNodeRatioPerGroup = upRatio;
+ nextStateViewChanged = true;
}
/** Sets the nodes of this and attempts to keep the node state in sync */
public void setNodes(ClusterInfo newClusterInfo) {
this.nodes = new HashSet<>(newClusterInfo.getConfiguredNodes().values());
- // Nodes that are removed from config will be automatically marked as DOWN
- // in the cluster state by createNextVersionOfClusterStateView, ensuring
- // that these are not carried over into new cluster states.
-
for (ConfiguredNode node : this.nodes) {
NodeInfo newNodeInfo = newClusterInfo.getStorageNodeInfo(node.index());
NodeState currentState = currentClusterStateView.getClusterState().getNodeState(new Node(NodeType.STORAGE, node.index()));
@@ -134,6 +139,23 @@ public class SystemStateGenerator {
proposeNewNodeState(newNodeInfo, new NodeState(NodeType.STORAGE, node.retired() ? State.RETIRED : State.UP));
}
}
+
+ // Ensure that any nodes that have been removed from the config are also
+ // promptly removed from the next (and subsequent) generated cluster states.
+ pruneAllNodesNotContainedInConfig();
+
+ nextStateViewChanged = true;
+ }
+
+ private void pruneAllNodesNotContainedInConfig() {
+ Set<Integer> configuredIndices = this.nodes.stream().map(ConfiguredNode::index).collect(Collectors.toSet());
+ final ClusterState candidateNextState = nextClusterStateView.getClusterState();
+ pruneNodesNotContainedInConfig(candidateNextState, configuredIndices, NodeType.DISTRIBUTOR);
+ pruneNodesNotContainedInConfig(candidateNextState, configuredIndices, NodeType.STORAGE);
+ }
+
+ public void setDistribution(Distribution distribution) {
+ this.distribution = distribution;
nextStateViewChanged = true;
}
@@ -196,14 +218,13 @@ public class SystemStateGenerator {
return state;
}
- private Event getDownDueToTooFewNodesEvent() {
- Event clusterEvent = null;
+ private Optional<Event> getDownDueToTooFewNodesEvent(ClusterState nextClusterState) {
int upStorageCount = 0, upDistributorCount = 0;
int dcount = nodes.size();
int scount = nodes.size();
for (NodeType type : NodeType.getTypes()) {
for (ConfiguredNode node : nodes) {
- NodeState ns = nextClusterStateView.getClusterState().getNodeState(new Node(type, node.index()));
+ NodeState ns = nextClusterState.getNodeState(new Node(type, node.index()));
if (ns.getState() == State.UP || ns.getState() == State.RETIRED || ns.getState() == State.INITIALIZING) {
if (type.equals(NodeType.STORAGE))
++upStorageCount;
@@ -215,46 +236,149 @@ public class SystemStateGenerator {
long timeNow = timer.getCurrentTimeInMillis();
if (upStorageCount < minStorageNodesUp) {
- clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
+ return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
"Less than " + minStorageNodesUp + " storage nodes available (" + upStorageCount + "). Setting cluster state down.",
- timeNow);
+ timeNow));
}
if (upDistributorCount < minDistributorNodesUp) {
- clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
+ return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
"Less than " + minDistributorNodesUp + " distributor nodes available (" + upDistributorCount + "). Setting cluster state down.",
- timeNow);
+ timeNow));
}
if (minRatioOfStorageNodesUp * scount > upStorageCount) {
- clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
+ return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
"Less than " + (100 * minRatioOfStorageNodesUp) + " % of storage nodes are available ("
+ upStorageCount + "/" + scount + "). Setting cluster state down.",
- timeNow);
+ timeNow));
}
if (minRatioOfDistributorNodesUp * dcount > upDistributorCount) {
- clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
+ return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE,
"Less than " + (100 * minRatioOfDistributorNodesUp) + " % of distributor nodes are available ("
+ upDistributorCount + "/" + dcount + "). Setting cluster state down.",
- timeNow);
+ timeNow));
+ }
+ return Optional.empty();
+ }
+
+ private static Node storageNode(int index) {
+ return new Node(NodeType.STORAGE, index);
+ }
+
+ private void performImplicitStorageNodeStateTransitions(ClusterState candidateState, ContentCluster cluster) {
+ if (distribution == null) {
+ return; // FIXME due to tests that don't bother setting distr config! Never happens in prod.
+ }
+ // First clear the states of any nodes that according to reported/wanted state alone
+ // should have their states cleared. We might still take these down again based on the
+ // decisions of the group availability calculator, but this way we ensure that groups
+ // that no longer should be down will have their nodes implicitly made available again.
+ // TODO this will be void once SystemStateGenerator has been rewritten to be stateless.
+ final Set<Integer> clearedNodes = clearDownStateForStorageNodesThatCanBeUp(candidateState, cluster);
+
+ final GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder()
+ .withMinNodeRatioPerGroup(minNodeRatioPerGroup)
+ .withDistribution(distribution)
+ .build();
+ final Set<Integer> nodesToTakeDown = calc.nodesThatShouldBeDown(candidateState);
+ markNodesAsDownDueToGroupUnavailability(cluster, candidateState, nodesToTakeDown, clearedNodes);
+
+ clearedNodes.removeAll(nodesToTakeDown);
+ logEventsForNodesThatWereTakenUp(clearedNodes, cluster);
+ }
+
+ private void logEventsForNodesThatWereTakenUp(Set<Integer> newlyUpNodes, ContentCluster cluster) {
+ newlyUpNodes.forEach(i -> {
+ final NodeInfo info = cluster.getNodeInfo(storageNode(i)); // Should always be non-null here.
+ // TODO the fact that this only happens for group up events is implementation specific;
+ // we should generalize this if we get other such events.
+ eventLog.addNodeOnlyEvent(new NodeEvent(info,
+ "Group availability restored; taking node back up",
+ NodeEvent.Type.CURRENT, timer.getCurrentTimeInMillis()), LogLevel.INFO);
+ });
+ }
+
+ private void markNodesAsDownDueToGroupUnavailability(ContentCluster cluster,
+ ClusterState candidateState,
+ Set<Integer> nodesToTakeDown,
+ Set<Integer> clearedNodes)
+ {
+ for (Integer idx : nodesToTakeDown) {
+ final Node node = storageNode(idx);
+ NodeState newState = new NodeState(NodeType.STORAGE, State.DOWN);
+ newState.setDescription("group node availability below configured threshold");
+ candidateState.setNodeState(node, newState);
+
+ logNodeGroupDownEdgeEventOnce(clearedNodes, node, cluster);
+ }
+ }
+
+ private void logNodeGroupDownEdgeEventOnce(Set<Integer> clearedNodes, Node node, ContentCluster cluster) {
+ final NodeInfo nodeInfo = cluster.getNodeInfo(node);
+ // If clearedNodes contains the index it means we're just re-downing a node
+ // that was previously down. If this is the case, we'd cause a duplicate
+ // event if we logged it now as well.
+ if (nodeInfo != null && !clearedNodes.contains(node.getIndex())) {
+ eventLog.addNodeOnlyEvent(new NodeEvent(nodeInfo,
+ "Setting node down as the total availability of its group is " +
+ "below the configured threshold",
+ NodeEvent.Type.CURRENT, timer.getCurrentTimeInMillis()), LogLevel.INFO);
}
- return clusterEvent;
}
- private ClusterStateView createNextVersionOfClusterStateView(Event clusterEvent) {
+ private NodeState baselineNodeState(NodeInfo info) {
+ NodeState reported = info.getReportedState();
+ NodeState wanted = info.getWantedState();
+
+ final NodeState baseline = reported.clone();
+ if (wanted.getState() != State.UP) {
+ baseline.setDescription(wanted.getDescription());
+ if (reported.above(wanted)) {
+ baseline.setState(wanted.getState());
+ }
+ }
+ return baseline;
+ }
+
+ // Returns set of nodes whose state was cleared
+ private Set<Integer> clearDownStateForStorageNodesThatCanBeUp(
+ ClusterState candidateState, ContentCluster cluster)
+ {
+ final int nodeCount = candidateState.getNodeCount(NodeType.STORAGE);
+ final Set<Integer> clearedNodes = new HashSet<>();
+ for (int i = 0; i < nodeCount; ++i) {
+ final Node node = storageNode(i);
+ final NodeInfo info = cluster.getNodeInfo(node);
+ final NodeState currentState = candidateState.getNodeState(node);
+ if (currentState.getState() == State.DOWN) {
+ if (mayClearNodeDownState(info)) {
+ candidateState.setNodeState(node, baselineNodeState(info));
+ clearedNodes.add(i);
+ }
+ }
+ }
+ return clearedNodes;
+ }
+
+ private boolean mayClearNodeDownState(NodeInfo info) {
+ if (info == null) {
+ // Nothing known about node in cluster info; we definitely don't want it
+ // to be taken up at this point.
+ return false;
+ }
+ return (info.getReportedState().getState() != State.DOWN
+ && info.getWantedState().getState().oneOf("ur"));
+ }
+
+ private ClusterStateView createNextVersionOfClusterStateView(ContentCluster cluster) {
// If you change this method, see *) in notifyIfNewSystemState
ClusterStateView candidateClusterStateView = nextClusterStateView.cloneForNewState();
ClusterState candidateClusterState = candidateClusterStateView.getClusterState();
- candidateClusterState.setClusterState(clusterEvent == null ? State.UP : State.DOWN);
-
int currentDistributionBits = calculateMinDistributionBitCount();
if (currentDistributionBits != nextClusterStateView.getClusterState().getDistributionBitCount()) {
candidateClusterState.setDistributionBits(currentDistributionBits);
}
-
- Set<Integer> configuredIndices = this.nodes.stream().map(ConfiguredNode::index).collect(Collectors.toSet());
-
- pruneNodesNotContainedInConfig(candidateClusterState, configuredIndices, NodeType.DISTRIBUTOR);
- pruneNodesNotContainedInConfig(candidateClusterState, configuredIndices, NodeType.STORAGE);
+ performImplicitStorageNodeStateTransitions(candidateClusterState, cluster);
return candidateClusterStateView;
}
@@ -310,12 +434,29 @@ public class SystemStateGenerator {
}
}
- public boolean notifyIfNewSystemState(SystemStateListener stateListener) {
+ private void mergeIntoNextClusterState(ClusterState sourceState) {
+ final ClusterState nextState = nextClusterStateView.getClusterState();
+ final int nodeCount = sourceState.getNodeCount(NodeType.STORAGE);
+ for (int i = 0; i < nodeCount; ++i) {
+ final Node node = storageNode(i);
+ final NodeState stateInSource = sourceState.getNodeState(node);
+ final NodeState stateInTarget = nextState.getNodeState(node);
+ if (stateInSource.getState() != stateInTarget.getState()) {
+ nextState.setNodeState(node, stateInSource);
+ }
+ }
+ }
+
+ public boolean notifyIfNewSystemState(ContentCluster cluster, SystemStateListener stateListener) {
if ( ! nextStateViewChanged) return false;
- Event clusterEvent = getDownDueToTooFewNodesEvent();
- ClusterStateView newClusterStateView = createNextVersionOfClusterStateView(clusterEvent);
+ ClusterStateView newClusterStateView = createNextVersionOfClusterStateView(cluster);
+
ClusterState newClusterState = newClusterStateView.getClusterState();
+ // Creating the next version of the state may implicitly take down nodes, so our checks
+ // for taking the entire cluster down must happen _after_ this
+ Optional<Event> clusterDown = getDownDueToTooFewNodesEvent(newClusterState);
+ newClusterState.setClusterState(clusterDown.isPresent() ? State.DOWN : State.UP);
if (newClusterState.similarTo(currentClusterStateView.getClusterState())) {
log.log(LogLevel.DEBUG,
@@ -329,7 +470,7 @@ public class SystemStateGenerator {
newClusterState.setVersion(currentClusterStateView.getClusterState().getVersion() + 1);
recordNewClusterStateHasBeenChosen(currentClusterStateView.getClusterState(),
- newClusterStateView.getClusterState(), clusterEvent);
+ newClusterStateView.getClusterState(), clusterDown.orElse(null));
// *) Ensure next state is still up to date.
// This should make nextClusterStateView a deep-copy of currentClusterStateView.
@@ -338,6 +479,7 @@ public class SystemStateGenerator {
// This seems like a hack...
nextClusterStateView.getClusterState().setDistributionBits(newClusterState.getDistributionBitCount());
nextClusterStateView.getClusterState().setClusterState(newClusterState.getClusterState());
+ mergeIntoNextClusterState(newClusterState);
currentClusterStateView = newClusterStateView;
nextStateViewChanged = false;
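Note the ordering in notifyIfNewSystemState: implicit group transitions are applied while building the candidate state, and only afterwards is the cluster-wide down check evaluated against it. The final merge step can be shown in isolation (a sketch with assumed state strings; like mergeIntoNextClusterState above, it copies differing storage node states from the newly published state into the retained next-state view):

    ClusterState published = new ClusterState("distributor:3 storage:3 .1.s:d");
    ClusterState next = new ClusterState("distributor:3 storage:3");
    for (int i = 0; i < published.getNodeCount(NodeType.STORAGE); ++i) {
        final Node node = new Node(NodeType.STORAGE, i);
        if (published.getNodeState(node).getState() != next.getNodeState(node).getState()) {
            next.setNodeState(node, published.getNodeState(node));
        }
    }
    // next now also reports storage node 1 as down.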
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
index d6dd6faa60d..1b97123d636 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
@@ -80,12 +80,11 @@ public class SetNodeStateRequest extends Request<SetResponse> {
NodeState wantedState = nodeInfo.getUserWantedState();
NodeState newWantedState = getRequestedNodeState(newStates, node);
- int version = currentClusterState.getVersion();
NodeStateChangeChecker.Result result = cluster.calculateEffectOfNewState(
- node, version, condition, wantedState, newWantedState);
+ node, currentClusterState, condition, wantedState, newWantedState);
log.log(LogLevel.DEBUG, "node=" + node +
- " version=" + version +
+ " current-cluster-state=" + currentClusterState + // Includes version in output format
" condition=" + condition +
" wanted-state=" + wantedState +
" new-wanted-state=" + newWantedState +
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java
new file mode 100644
index 00000000000..aca26000931
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.distribution.ConfiguredNode;
+import com.yahoo.vdslib.distribution.Distribution;
+import com.yahoo.vdslib.state.Node;
+import com.yahoo.vdslib.state.NodeState;
+import com.yahoo.vdslib.state.NodeType;
+import com.yahoo.vdslib.state.State;
+import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler;
+import com.yahoo.vespa.clustercontroller.core.mocks.TestEventLog;
+import com.yahoo.vespa.clustercontroller.utils.util.NoMetricReporter;
+import com.yahoo.vespa.config.content.StorDistributionConfig;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.mockito.Mockito.mock;
+
+class ClusterFixture {
+ public final ContentCluster cluster;
+ public final Distribution distribution;
+ public final FakeTimer timer;
+ public final EventLogInterface eventLog;
+ public final SystemStateGenerator generator;
+
+ public ClusterFixture(ContentCluster cluster, Distribution distribution) {
+ this.cluster = cluster;
+ this.distribution = distribution;
+ this.timer = new FakeTimer();
+ this.eventLog = mock(EventLogInterface.class);
+ this.generator = createGeneratorForFixtureCluster();
+ }
+
+ public SystemStateGenerator createGeneratorForFixtureCluster() {
+ final int controllerIndex = 0;
+ MetricUpdater metricUpdater = new MetricUpdater(new NoMetricReporter(), controllerIndex);
+ SystemStateGenerator generator = new SystemStateGenerator(timer, eventLog, metricUpdater);
+ generator.setNodes(cluster.clusterInfo());
+ generator.setDistribution(distribution);
+ return generator;
+ }
+
+ public void bringEntireClusterUp() {
+ cluster.clusterInfo().getConfiguredNodes().forEach((idx, node) -> {
+ reportStorageNodeState(idx, State.UP);
+ reportDistributorNodeState(idx, State.UP);
+ });
+ }
+
+ public void reportStorageNodeState(final int index, State state) {
+ final Node node = new Node(NodeType.STORAGE, index);
+ final NodeState nodeState = new NodeState(NodeType.STORAGE, state);
+ nodeState.setDescription("mockdesc");
+ NodeStateOrHostInfoChangeHandler handler = mock(NodeStateOrHostInfoChangeHandler.class);
+ NodeInfo nodeInfo = cluster.getNodeInfo(node);
+
+ generator.handleNewReportedNodeState(nodeInfo, nodeState, handler);
+ nodeInfo.setReportedState(nodeState, timer.getCurrentTimeInMillis());
+ }
+
+ public void reportStorageNodeState(final int index, NodeState nodeState) {
+ final Node node = new Node(NodeType.STORAGE, index);
+ final NodeInfo nodeInfo = cluster.getNodeInfo(node);
+ final long mockTime = 1234;
+ NodeStateOrHostInfoChangeHandler changeListener = mock(NodeStateOrHostInfoChangeHandler.class);
+ generator.handleNewReportedNodeState(nodeInfo, nodeState, changeListener);
+ nodeInfo.setReportedState(nodeState, mockTime);
+ }
+
+ public void reportDistributorNodeState(final int index, State state) {
+ final Node node = new Node(NodeType.DISTRIBUTOR, index);
+ final NodeState nodeState = new NodeState(NodeType.DISTRIBUTOR, state);
+ NodeStateOrHostInfoChangeHandler handler = mock(NodeStateOrHostInfoChangeHandler.class);
+ NodeInfo nodeInfo = cluster.getNodeInfo(node);
+
+ generator.handleNewReportedNodeState(nodeInfo, nodeState, handler);
+ nodeInfo.setReportedState(nodeState, timer.getCurrentTimeInMillis());
+ }
+
+ public void proposeStorageNodeWantedState(final int index, State state) {
+ final Node node = new Node(NodeType.STORAGE, index);
+ final NodeState nodeState = new NodeState(NodeType.STORAGE, state);
+ nodeState.setDescription("mockdesc");
+ NodeInfo nodeInfo = cluster.getNodeInfo(node);
+ nodeInfo.setWantedState(nodeState);
+
+ generator.proposeNewNodeState(nodeInfo, nodeState);
+
+ }
+
+ public void disableAutoClusterTakedown() {
+ generator.setMinNodesUp(0, 0, 0.0, 0.0);
+ }
+
+ public void disableTransientMaintenanceModeOnDown() {
+ Map<NodeType, Integer> maxTransitionTime = new TreeMap<>();
+ maxTransitionTime.put(NodeType.DISTRIBUTOR, 0);
+ maxTransitionTime.put(NodeType.STORAGE, 0);
+ generator.setMaxTransitionTime(maxTransitionTime);
+ }
+
+ public void enableTransientMaintenanceModeOnDown(final int transitionTime) {
+ Map<NodeType, Integer> maxTransitionTime = new TreeMap<>();
+ maxTransitionTime.put(NodeType.DISTRIBUTOR, transitionTime);
+ maxTransitionTime.put(NodeType.STORAGE, transitionTime);
+ generator.setMaxTransitionTime(maxTransitionTime);
+ }
+
+ public String generatedClusterState() {
+ return generator.getClusterState().toString();
+ }
+
+ public String verboseGeneratedClusterState() { return generator.getClusterState().toString(true); }
+
+ public static ClusterFixture forFlatCluster(int nodeCount) {
+ Collection<ConfiguredNode> nodes = DistributionBuilder.buildConfiguredNodes(nodeCount);
+
+ Distribution distribution = DistributionBuilder.forFlatCluster(nodeCount);
+ ContentCluster cluster = new ContentCluster("foo", nodes, distribution, 0, 0.0);
+
+ return new ClusterFixture(cluster, distribution);
+ }
+
+ public static ClusterFixture forHierarchicCluster(DistributionBuilder.GroupBuilder root) {
+ List<ConfiguredNode> nodes = DistributionBuilder.buildConfiguredNodes(root.totalNodeCount());
+ Distribution distribution = DistributionBuilder.forHierarchicCluster(root);
+ ContentCluster cluster = new ContentCluster("foo", nodes, distribution, 0, 0.0);
+
+ return new ClusterFixture(cluster, distribution);
+ }
+}
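How the fixture is meant to be driven, distilled from the GroupAutoTakedown tests below (illustrative values; assumes the same package, plus Mockito's mock(), which the fixture itself already uses):

    ClusterFixture fixture = ClusterFixture.forHierarchicCluster(
            DistributionBuilder.withGroups(3).eachWithNodeCount(2));
    fixture.generator.setMinNodeRatioPerGroup(0.51);
    fixture.disableTransientMaintenanceModeOnDown();
    fixture.disableAutoClusterTakedown();
    fixture.bringEntireClusterUp();
    fixture.reportStorageNodeState(5, State.DOWN);
    fixture.generator.notifyIfNewSystemState(fixture.cluster, mock(SystemStateListener.class));
    String state = fixture.generatedClusterState(); // node 5's group is implicitly taken down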
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java
new file mode 100644
index 00000000000..ebd87d08403
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java
@@ -0,0 +1,107 @@
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.distribution.ConfiguredNode;
+import com.yahoo.vdslib.distribution.Distribution;
+import com.yahoo.vespa.config.content.StorDistributionConfig;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class DistributionBuilder {
+ // TODO support nested groups
+ public static class GroupBuilder {
+ final int groupCount;
+ public List<Integer> groupsWithNodeCount;
+
+ public GroupBuilder(int groupCount) {
+ this.groupCount = groupCount;
+ }
+
+ public GroupBuilder(int... nodeCounts) {
+ this.groupCount = nodeCounts.length;
+ this.groupsWithNodeCount = IntStream.of(nodeCounts).boxed()
+ .collect(Collectors.toList());
+ }
+
+ public GroupBuilder eachWithNodeCount(int nodeCount) {
+ groupsWithNodeCount = IntStream.range(0, groupCount)
+ .map(i -> nodeCount).boxed()
+ .collect(Collectors.toList());
+ return this;
+ }
+
+ public int totalNodeCount() {
+ return groupsWithNodeCount.stream().reduce(0, Integer::sum);
+ }
+
+ public String groupDistributionSpec() {
+ return IntStream.range(0, groupCount).mapToObj(i -> "1")
+ .collect(Collectors.joining("|")) + "|*";
+ }
+ }
+
+ public static GroupBuilder withGroups(int groups) {
+ return new GroupBuilder(groups);
+ }
+
+ public static GroupBuilder withGroupNodes(int... nodeCounts) {
+ return new GroupBuilder(nodeCounts);
+ }
+
+ public static List<ConfiguredNode> buildConfiguredNodes(int nodeCount) {
+ return IntStream.range(0, nodeCount)
+ .mapToObj(i -> new ConfiguredNode(i, false))
+ .collect(Collectors.toList());
+ }
+
+ private static StorDistributionConfig.Group.Nodes.Builder configuredNode(ConfiguredNode node) {
+ StorDistributionConfig.Group.Nodes.Builder builder = new StorDistributionConfig.Group.Nodes.Builder();
+ builder.index(node.index());
+ return builder;
+ }
+
+ private static StorDistributionConfig.Group.Builder configuredGroup(
+ String name, int index, Collection<ConfiguredNode> nodes) {
+ StorDistributionConfig.Group.Builder builder = new StorDistributionConfig.Group.Builder();
+ builder.name(name);
+ builder.index(Integer.toString(index));
+ nodes.forEach(n -> builder.nodes(configuredNode(n)));
+ return builder;
+ }
+
+ public static Distribution forFlatCluster(int nodeCount) {
+ Collection<ConfiguredNode> nodes = buildConfiguredNodes(nodeCount);
+
+ StorDistributionConfig.Builder configBuilder = new StorDistributionConfig.Builder();
+ configBuilder.redundancy(2);
+ configBuilder.group(configuredGroup("bar", 0, nodes));
+
+ return new Distribution(new StorDistributionConfig(configBuilder));
+ }
+
+ public static Distribution forHierarchicCluster(GroupBuilder root) {
+ List<ConfiguredNode> nodes = buildConfiguredNodes(root.totalNodeCount());
+
+ StorDistributionConfig.Builder configBuilder = new StorDistributionConfig.Builder();
+ configBuilder.redundancy(2);
+
+ StorDistributionConfig.Group.Builder rootBuilder = new StorDistributionConfig.Group.Builder();
+ rootBuilder.name("invalid");
+ rootBuilder.index("invalid");
+ rootBuilder.partitions(root.groupDistributionSpec());
+ configBuilder.group(rootBuilder);
+
+ int offset = 0;
+ for (int group = 0; group < root.groupsWithNodeCount.size(); ++group) {
+ int nodeCount = root.groupsWithNodeCount.get(group);
+ StorDistributionConfig.Group.Builder groupBuilder
+ = configuredGroup("group_" + (group + 1), group + 1, nodes.subList(offset, offset + nodeCount));
+ configBuilder.group(groupBuilder);
+ offset += nodeCount;
+ }
+
+ return new Distribution(new StorDistributionConfig(configBuilder));
+ }
+}
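For reference, the builder's two entry points (illustrative sizes; note that withGroups(3).eachWithNodeCount(3) produces the flat partition spec "1|1|1|*" via groupDistributionSpec):

    Distribution flat = DistributionBuilder.forFlatCluster(5);
    Distribution grouped = DistributionBuilder.forHierarchicCluster(
            DistributionBuilder.withGroupNodes(3, 3, 2)); // three groups of 3, 3 and 2 nodes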
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
index 86248d2e1e3..f4b3e648f63 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java
@@ -10,6 +10,7 @@ import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
+import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.rpc.RPCCommunicator;
@@ -232,90 +233,43 @@ public abstract class FleetControllerTest implements Waiter {
return nodes;
}
- public interface NodeModifier {
- void modify(NodeInfo node);
+ protected static Set<Integer> asIntSet(Integer... idx) {
+ return Arrays.asList(idx).stream().collect(Collectors.toSet());
}
- NodeModifier makeDefaultTestNodeModifier() {
- return new NodeModifier() {
- @Override
- public void modify(NodeInfo node) {
- if (node.isDistributor()) {
- if (node.getNodeIndex() == 13) {
- node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes + 2);
- }
- return;
- }
- double latency = 75;
- long count = 1000;
- if (node.getNodeIndex() == 4) {
- latency = 300;
- count = 500;
- } else if (node.getNodeIndex() == 7) {
- latency = 120;
- count = 800;
- } else if (node.getNodeIndex() == 21) {
- latency = 2000;
- count = 600;
- } else if (node.getNodeIndex() == 25) {
- node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes + 1);
- } else if (node.getNodeIndex() == 26) {
- node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes);
- }
- String hostInfoString = generateHostInfo(latency, count);
- node.setHostInfo(HostInfo.createHostInfo(hostInfoString));
- }
- };
+ protected static Set<ConfiguredNode> asConfiguredNodes(Set<Integer> indices) {
+ return indices.stream().map(idx -> new ConfiguredNode(idx, false)).collect(Collectors.toSet());
}
- NodeModifier makeStdDevTestNodeModifier() {
- return new NodeModifier() {
- double[] latencies = new double[] { 30, 300, 60, 270 };
- int counter = 0;
-
+ protected void waitForStateExcludingNodeSubset(String expectedState, Set<Integer> excludedNodes) throws Exception {
+ // Due to the implementation details of the test base, this.waitForState() will always
+ // wait until all nodes added in the test have received the latest cluster state. Since we
+ // want to entirely ignore the excluded nodes, they won't get a cluster state at all and
+ // the test would fail unless otherwise handled. We thus use a custom waiter which filters
+ // out nodes with the excluded indices (storage and distributor nodes with the same index
+ // are treated as different nodes in this context).
+ Waiter subsetWaiter = new Waiter.Impl(new DataRetriever() {
@Override
- public void modify(NodeInfo node) {
- if (node.isDistributor()) {
- return;
- }
- String hostInfo = generateHostInfo(latencies[counter++ % latencies.length], 1500);
- node.setHostInfo(HostInfo.createHostInfo(hostInfo));
+ public Object getMonitor() { return timer; }
+ @Override
+ public FleetController getFleetController() { return fleetController; }
+ @Override
+ public List<DummyVdsNode> getDummyNodes() {
+ return nodes.stream()
+ .filter(n -> !excludedNodes.contains(n.getNode().getIndex()))
+ .collect(Collectors.toList());
}
- };
- }
-
- protected void setUpSlowDiskCluster(NodeModifier callback) throws Exception {
- int nodeCount = 31;
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
- // TODO: multiple groups!
- options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, nodeCount)));
- setUpFleetController(true, options);
- setUpVdsNodes(true, new DummyVdsNodeOptions(), false, nodeCount);
- waitForStableSystem(nodeCount);
- // Set one node as not being up. It should not contribute to the overall
- // latency or operation metrics, nor should its disks be included.
- nodes.get(2*13).disconnectAsShutdown();
- nodes.get(2*21+1).disconnectAsShutdown();
- waiter.waitForState("version:\\d+ distributor:31 .13.s:d storage:31 .21.s:m");
-
- for (NodeInfo node : fleetController.getCluster().getNodeInfo()) {
- callback.modify(node);
- }
+ @Override
+ public int getTimeoutMS() { return timeoutMS; }
+ });
+ subsetWaiter.waitForState(expectedState);
}
- protected void setUpSimpleCluster(int nodeCount) throws Exception {
- FleetControllerOptions options = new FleetControllerOptions("mycluster");
- // TODO: multiple groups!
- options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, nodeCount)));
- setUpFleetController(true, options);
- setUpVdsNodes(true, new DummyVdsNodeOptions(), false, nodeCount);
- waitForStableSystem(nodeCount);
- waiter.waitForState("version:\\d+ distributor:" + nodeCount + " storage:" + nodeCount);
-
- NodeModifier callback = makeDefaultTestNodeModifier();
- for (NodeInfo node : fleetController.getCluster().getNodeInfo()) {
- callback.modify(node);
- }
+ protected static Map<NodeType, Integer> transitionTimes(int milliseconds) {
+ Map<NodeType, Integer> maxTransitionTime = new TreeMap<>();
+ maxTransitionTime.put(NodeType.DISTRIBUTOR, milliseconds);
+ maxTransitionTime.put(NodeType.STORAGE, milliseconds);
+ return maxTransitionTime;
}
protected void tearDownSystem() throws Exception {
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java
new file mode 100644
index 00000000000..64bea57c6ad
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java
@@ -0,0 +1,100 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.state.NodeType;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertFalse;
+
+public class GroupAutoTakedownLiveConfigTest extends FleetControllerTest {
+
+ private long mockConfigGeneration = 1;
+
+
+ private static FleetControllerOptions createOptions(
+ DistributionBuilder.GroupBuilder groupBuilder, double minNodeRatio)
+ {
+ FleetControllerOptions options = new FleetControllerOptions("mycluster");
+ options.setStorageDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder));
+ options.nodes = DistributionBuilder.buildConfiguredNodes(groupBuilder.totalNodeCount())
+ .stream().collect(Collectors.toSet());
+ options.minNodeRatioPerGroup = minNodeRatio;
+ options.maxTransitionTime = transitionTimes(0);
+ return options;
+ }
+
+ private void updateConfigLive(FleetControllerOptions newOptions) {
+ ++mockConfigGeneration;
+ this.fleetController.updateOptions(newOptions, mockConfigGeneration);
+ }
+
+ private void reconfigureWithMinNodeRatio(double minNodeRatio) {
+ FleetControllerOptions newOptions = this.options.clone();
+ newOptions.minNodeRatioPerGroup = minNodeRatio;
+ updateConfigLive(newOptions);
+ }
+
+ private void reconfigureWithDistribution(DistributionBuilder.GroupBuilder groupBuilder) {
+ FleetControllerOptions newOptions = this.options.clone();
+ newOptions.nodes = DistributionBuilder.buildConfiguredNodes(groupBuilder.totalNodeCount())
+ .stream().collect(Collectors.toSet());
+ newOptions.storageDistribution = DistributionBuilder.forHierarchicCluster(groupBuilder);
+ updateConfigLive(newOptions);
+ }
+
+ private void setUp3x3ClusterWithMinNodeRatio(double minNodeRatio) throws Exception {
+ FleetControllerOptions options = createOptions(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3),
+ minNodeRatio);
+ setUpFleetController(true, options);
+ setUpVdsNodes(true, new DummyVdsNodeOptions(), false, 9);
+ waitForState("version:\\d+ distributor:9 storage:9");
+ }
+
+ private void takeDownContentNode(int index) {
+ // nodes list contains both distributors and storage nodes, with distributors
+ // in even positions and storage nodes in odd positions.
+ final int arrayIndex = index*2 + 1;
+ assertFalse(nodes.get(arrayIndex).isDistributor());
+ nodes.get(arrayIndex).disconnect();
+ }
+
+ @Test
+ public void bootstrap_min_ratio_option_is_propagated_to_group_availability_logic() throws Exception {
+ setUp3x3ClusterWithMinNodeRatio(0.67);
+ takeDownContentNode(0);
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .0.s:d .1.s:d .2.s:d", asIntSet(0));
+ }
+
+ @Test
+ public void min_ratio_live_reconfig_immediately_takes_effect() throws Exception {
+ // Initially, arbitrarily many nodes may be down in a group.
+ setUp3x3ClusterWithMinNodeRatio(0.0);
+ takeDownContentNode(3);
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d", asIntSet(3));
+
+ reconfigureWithMinNodeRatio(0.67);
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d .4.s:d .5.s:d", asIntSet(3));
+
+ reconfigureWithMinNodeRatio(0.0);
+ // Aaaand back up again!
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d", asIntSet(3));
+ }
+
+ @Test
+ public void live_distribution_config_changes_trigger_cluster_state_change() throws Exception {
+ setUp3x3ClusterWithMinNodeRatio(0.65);
+ takeDownContentNode(6);
+
+ // Not enough nodes down to trigger group take-down yet
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .6.s:d", asIntSet(6));
+ // Removing a node from the same group as node 6 will dip it under the configured threshold,
+ // taking down the entire group. In this case we configure out node 8.
+ reconfigureWithDistribution(DistributionBuilder.withGroupNodes(3, 3, 2));
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:8 storage:6", asIntSet(6, 8));
+ }
+}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java
new file mode 100644
index 00000000000..93e34b1f772
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java
@@ -0,0 +1,370 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.distribution.ConfiguredNode;
+import com.yahoo.vdslib.state.DiskState;
+import com.yahoo.vdslib.state.Node;
+import com.yahoo.vdslib.state.NodeState;
+import com.yahoo.vdslib.state.NodeType;
+import com.yahoo.vdslib.state.State;
+import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler;
+import com.yahoo.vespa.clustercontroller.core.listeners.SystemStateListener;
+import org.junit.Test;
+import org.mockito.ArgumentMatcher;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.hamcrest.core.AllOf.allOf;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.argThat;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+
+public class GroupAutoTakedownTest {
+
+ private static ClusterFixture createFixtureForAllUpFlatCluster(int nodeCount, double minNodeRatioPerGroup) {
+ ClusterFixture fixture = ClusterFixture.forFlatCluster(nodeCount);
+ setSharedFixtureOptions(fixture, minNodeRatioPerGroup);
+ return fixture;
+ }
+
+ private static ClusterFixture createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.GroupBuilder root, double minNodeRatioPerGroup)
+ {
+ ClusterFixture fixture = ClusterFixture.forHierarchicCluster(root);
+ setSharedFixtureOptions(fixture, minNodeRatioPerGroup);
+ return fixture;
+ }
+
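+ // Shared fixture options disable transient maintenance mode and automatic
+ // cluster takedown so the tests below exercise only the group availability logic.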
+ private static void setSharedFixtureOptions(ClusterFixture fixture, double minNodeRatioPerGroup) {
+ fixture.generator.setMinNodeRatioPerGroup(minNodeRatioPerGroup);
+ fixture.disableTransientMaintenanceModeOnDown();
+ fixture.disableAutoClusterTakedown();
+ fixture.bringEntireClusterUp();
+ }
+
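+ // Reports a storage node state transition, asserts that it produces a new
+ // generated cluster state, and returns that state's string representation.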
+ private String stateAfterStorageTransition(ClusterFixture fixture, final int index, final State state) {
+ transitionStoreNodeToState(fixture, index, state);
+ return fixture.generatedClusterState();
+ }
+
+ private String verboseStateAfterStorageTransition(ClusterFixture fixture, final int index, final State state) {
+ transitionStoreNodeToState(fixture, index, state);
+ return fixture.verboseGeneratedClusterState();
+ }
+
+ private void transitionStoreNodeToState(ClusterFixture fixture, int index, State state) {
+ fixture.reportStorageNodeState(index, state);
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+ }
+
+ /**
+ * Use a per-group availability requirement ratio of 99%. Ensure that taking down a single
+ * node out of 5 in a flat cluster does not take down the remaining nodes, i.e. the config
+ * should not apply to a flat structure.
+ */
+ @Test
+ public void config_does_not_apply_to_flat_hierarchy_clusters() {
+ ClusterFixture fixture = createFixtureForAllUpFlatCluster(5, 0.99);
+
+ SystemStateListener listener = mock(SystemStateListener.class);
+ // First invocation; generates initial state and clears "new state" flag
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+ assertEquals("version:1 distributor:5 storage:5", fixture.generatedClusterState());
+
+ assertEquals("version:2 distributor:5 storage:5 .1.s:d",
+ stateAfterStorageTransition(fixture, 1, State.DOWN));
+ }
+
+ @Test
+ public void group_node_down_edge_implicitly_marks_down_rest_of_nodes_in_group() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+ assertEquals("version:1 distributor:6 storage:6", fixture.generatedClusterState());
+
+ // Same group as node 4
+ assertEquals("version:2 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ // Same group as node 1
+ assertEquals("version:3 distributor:6 storage:4 .0.s:d .1.s:d",
+ stateAfterStorageTransition(fixture, 0, State.DOWN));
+ }
+
+ @Test
+ public void restored_group_node_availability_takes_group_back_up_automatically() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ // Group #2 -> down
+ assertEquals("version:1 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+
+ // Group #2 -> back up again
+ assertEquals("version:2 distributor:6 storage:6",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+ }
+
+ @Test
+ public void no_op_for_downed_nodes_in_already_downed_group() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ assertEquals("version:1 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+
+ // Nodes 4 and 5 are in the same group; reporting node 4 down should not cause a
+ // new state since it is already implicitly down.
+ fixture.reportStorageNodeState(4, State.DOWN);
+
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertFalse(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+
+ assertEquals("version:1 distributor:6 storage:4", fixture.generatedClusterState());
+ }
+
+ @Test
+ public void verbose_node_state_description_updated_for_implicitly_downed_nodes() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.75);
+
+ // Nodes 6 and 7 are taken down implicitly and should have a message reflecting this.
+ // Node 8 is taken down by the fixture and gets a fixture-assigned message that
+ // we should _not_ lose/overwrite.
+ assertEquals("version:1 distributor:9 storage:9 .6.s:d " +
+ ".6.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " +
+ ".7.s:d " +
+ ".7.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " +
+ ".8.s:d .8.m:mockdesc",
+ verboseStateAfterStorageTransition(fixture, 8, State.DOWN));
+ }
+
+ @Test
+ public void legacy_cluster_wide_availability_ratio_is_computed_after_group_takedowns() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+ fixture.generator.setMinNodesUp(5, 5, 0.51, 0.51);
+
+ // Taking down a node in a group forces the entire group down, which leaves us with
+ // only 4 content nodes (vs. minimum of 5 as specified above). The entire cluster
+ // should be marked as down in this case.
+ assertEquals("version:1 cluster:d distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ }
+
+ @Test
+ public void maintenance_wanted_state_not_overwritten() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99);
+
+ NodeInfo nodeInfo = fixture.cluster.getNodeInfo(new Node(NodeType.STORAGE, 5));
+ fixture.generator.proposeNewNodeState(nodeInfo, new NodeState(NodeType.STORAGE, State.MAINTENANCE));
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+
+ // Maintenance not counted as down, so group still up
+ assertEquals("version:1 distributor:9 storage:9 .5.s:m", fixture.generatedClusterState());
+
+ // Group goes down, but maintenance node should still be in maintenance
+ assertEquals("version:2 distributor:9 storage:9 .3.s:d .4.s:d .5.s:m",
+ stateAfterStorageTransition(fixture, 4, State.DOWN));
+ }
+
+ @Test
+ public void transient_maintenance_mode_on_down_edge_does_not_take_down_group() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99);
+ fixture.enableTransientMaintenanceModeOnDown(1000);
+
+ // Our timers are mocked, so taking down node 4 will deterministically transition it
+ // to transient maintenance mode. The group should not be taken down here.
+ assertEquals("version:1 distributor:9 storage:9 .4.s:m",
+ stateAfterStorageTransition(fixture, 4, State.DOWN));
+
+ // However, once grace period expires the group should be taken down.
+ fixture.timer.advanceTime(1001);
+ NodeStateOrHostInfoChangeHandler changeListener = mock(NodeStateOrHostInfoChangeHandler.class);
+ fixture.generator.watchTimers(fixture.cluster, changeListener);
+ SystemStateListener stateListener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener));
+
+ assertEquals("version:2 distributor:9 storage:9 .3.s:d .4.s:d .5.s:d", fixture.generatedClusterState());
+ }
+
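+ // Custom Mockito matchers for verifying that implicit group takedown/takeup
+ // emits node-specific events with the expected description and target node.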
+ private static class NodeEventWithDescription extends ArgumentMatcher<NodeEvent> {
+ private final String expected;
+
+ NodeEventWithDescription(String expected) {
+ this.expected = expected;
+ }
+
+ @Override
+ public boolean matches(Object o) {
+ return expected.equals(((NodeEvent)o).getDescription());
+ }
+ }
+
+ private static NodeEventWithDescription nodeEventWithDescription(String description) {
+ return new NodeEventWithDescription(description);
+ }
+
+ private static class EventForNode extends ArgumentMatcher<NodeEvent> {
+ private final Node expected;
+
+ EventForNode(Node expected) {
+ this.expected = expected;
+ }
+
+ @Override
+ public boolean matches(Object o) {
+ return ((NodeEvent)o).getNode().getNode().equals(expected);
+ }
+ }
+
+ private static EventForNode eventForNode(Node expected) {
+ return new EventForNode(expected);
+ }
+
+ private static Node contentNode(int index) {
+ return new Node(NodeType.STORAGE, index);
+ }
+
+ @Test
+ public void taking_down_node_adds_node_specific_event() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ assertEquals("version:1 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+
+ verify(fixture.eventLog).addNodeOnlyEvent(argThat(allOf(
+ nodeEventWithDescription("Setting node down as the total availability of its group is " +
+ "below the configured threshold"),
+ eventForNode(contentNode(4)))), any());
+ }
+
+ @Test
+ public void bringing_node_back_up_adds_node_specific_event() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ assertEquals("version:1 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ assertEquals("version:2 distributor:6 storage:6",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+
+ verify(fixture.eventLog).addNodeOnlyEvent(argThat(allOf(
+ nodeEventWithDescription("Group availability restored; taking node back up"),
+ eventForNode(contentNode(4)))), any());
+ }
+
+ @Test
+ public void wanted_state_retired_on_implicitly_down_node_transitions_it_to_retired_mode_immediately() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99);
+
+ assertEquals("version:1 distributor:9 storage:6",
+ stateAfterStorageTransition(fixture, 6, State.DOWN));
+ // Node 7 is implicitly down. Mark wanted state as retired. It should now be Retired
+ // but not Down.
+ fixture.proposeStorageNodeWantedState(7, State.RETIRED);
+
+ SystemStateListener stateListener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener));
+ assertEquals("version:2 distributor:9 storage:8 .6.s:d .7.s:r", fixture.generatedClusterState());
+ }
+
+ @Test
+ public void downed_config_retired_node_transitions_back_to_retired_on_up_edge() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.49);
+
+ assertEquals("version:1 distributor:6 storage:6 .4.s:d",
+ stateAfterStorageTransition(fixture, 4, State.DOWN));
+ assertEquals("version:2 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+
+ // Node 5 gets config-retired under our feet.
+ Set<ConfiguredNode> nodes = new HashSet<>(fixture.cluster.clusterInfo().getConfiguredNodes().values());
+ nodes.remove(new ConfiguredNode(5, false));
+ nodes.add(new ConfiguredNode(5, true));
+ // TODO this should ideally also set the retired flag in the distribution
+ // config, but only the ConfiguredNodes are actually looked at currently.
+ fixture.cluster.setNodes(nodes);
+ fixture.generator.setNodes(fixture.cluster.clusterInfo());
+
+ assertEquals("version:3 distributor:6 storage:6 .4.s:d .5.s:r",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+ }
+
+ @Test
+ public void init_progress_is_preserved_across_group_down_up_edge() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ final NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING);
+ newState.setInitProgress(0.5);
+
+ fixture.reportStorageNodeState(4, newState);
+ SystemStateListener stateListener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener));
+
+ assertEquals("version:1 distributor:6 storage:6 .4.s:i .4.i:0.5", fixture.generatedClusterState());
+
+ assertEquals("version:2 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ assertEquals("version:3 distributor:6 storage:6 .4.s:i .4.i:0.5",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+ }
+
+ @Test
+ public void disk_states_are_preserved_across_group_down_up_edge() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ final NodeState newState = new NodeState(NodeType.STORAGE, State.UP);
+ newState.setDiskCount(7);
+ newState.setDiskState(5, new DiskState(State.DOWN));
+
+ fixture.reportStorageNodeState(4, newState);
+ SystemStateListener stateListener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener));
+
+ assertEquals("version:1 distributor:6 storage:6 .4.d:7 .4.d.5.s:d", fixture.generatedClusterState());
+
+ assertEquals("version:2 distributor:6 storage:4",
+ stateAfterStorageTransition(fixture, 5, State.DOWN));
+ assertEquals("version:3 distributor:6 storage:6 .4.d:7 .4.d.5.s:d",
+ stateAfterStorageTransition(fixture, 5, State.UP));
+ }
+
+ @Test
+ public void down_wanted_state_is_preserved_across_group_down_up_edge() {
+ ClusterFixture fixture = createFixtureForAllUpHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.60);
+
+ NodeInfo nodeInfo = fixture.cluster.getNodeInfo(new Node(NodeType.STORAGE, 5));
+ nodeInfo.setWantedState(new NodeState(NodeType.STORAGE, State.DOWN).setDescription("borkbork"));
+ fixture.generator.proposeNewNodeState(nodeInfo, nodeInfo.getWantedState());
+ SystemStateListener listener = mock(SystemStateListener.class);
+ assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener));
+
+ assertEquals("version:1 distributor:9 storage:9 .5.s:d .5.m:borkbork", fixture.verboseGeneratedClusterState());
+
+ assertEquals("version:2 distributor:9 storage:9 " +
+ ".3.s:d .3.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " +
+ ".4.s:d .4.m:mockdesc .5.s:d .5.m:borkbork",
+ verboseStateAfterStorageTransition(fixture, 4, State.DOWN));
+ assertEquals("version:3 distributor:9 storage:9 .5.s:d .5.m:borkbork",
+ verboseStateAfterStorageTransition(fixture, 4, State.UP));
+ }
+
+}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
new file mode 100644
index 00000000000..7d44afda68f
--- /dev/null
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
@@ -0,0 +1,188 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.core;
+
+import com.yahoo.vdslib.state.ClusterState;
+import org.junit.Test;
+
+import java.text.ParseException;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.junit.Assert.assertThat;
+
+public class GroupAvailabilityCalculatorTest {
+
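+ // Cluster state strings below use the standard Vespa syntax; e.g. ".4.s:d"
+ // means storage node 4 is Down, with "r"/"i"/"m" denoting Retired,
+ // Initializing and Maintenance respectively.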
+ private static ClusterState clusterState(String state) {
+ try {
+ return new ClusterState(state);
+ } catch (ParseException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static GroupAvailabilityCalculator calcForFlatCluster(
+ final int nodeCount,
+ final double minNodeRatioPerGroup)
+ {
+ return GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forFlatCluster(nodeCount))
+ .withMinNodeRatioPerGroup(minNodeRatioPerGroup)
+ .build();
+ }
+
+ private static GroupAvailabilityCalculator calcForHierarchicCluster(
+ DistributionBuilder.GroupBuilder rootGroup,
+ final double minNodeRatioPerGroup)
+ {
+ return GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forHierarchicCluster(rootGroup))
+ .withMinNodeRatioPerGroup(minNodeRatioPerGroup)
+ .build();
+ }
+
+ private static Set<Integer> indices(Integer... idx) {
+ Set<Integer> indices = new HashSet<>();
+ Collections.addAll(indices, idx);
+ return indices;
+ }
+
+ private static Set<Integer> emptySet() { return indices(); }
+
+ @Test
+ public void flat_cluster_does_not_implicitly_take_down_nodes() {
+ GroupAvailabilityCalculator calc = calcForFlatCluster(5, 0.99);
+
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:5 storage:5 .1.s:d .2.s:d")), equalTo(emptySet()));
+ }
+
+ @Test
+ public void group_node_down_edge_implicitly_marks_down_rest_of_nodes_in_group() {
+ // 3 groups of 2 nodes, take down node #4 (1st node in last group). Since we require
+ // at least 51% of group capacity to be available, implicitly take down the last group
+ // entirely.
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51);
+
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:6 .4.s:d")), equalTo(indices(5)));
+ }
+
+ // Setting 50% as the minimum ratio in a group with 2 nodes should let the group
+ // stay up if one node goes down.
+ @Test
+ public void min_ratio_per_group_is_closed_interval() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.50);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:6 .4.s:d")), equalTo(emptySet()));
+ }
+
+ @Test
+ public void retired_node_is_counted_as_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:6 .1.s:r")), equalTo(indices(0)));
+ }
+
+ @Test
+ public void initializing_node_not_counted_as_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:6 .4.s:i")), equalTo(emptySet()));
+ }
+
+ @Test
+ public void maintenance_node_not_counted_as_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:6 .4.s:m")), equalTo(emptySet()));
+ }
+
+ @Test
+ public void existing_maintenance_node_not_implicitly_downed_when_group_taken_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:9 storage:9 .4.s:m .5.s:d")), equalTo(indices(3))); // _not_ {3, 4}
+ }
+
+ @Test
+ public void existing_retired_node_not_implicitly_downed_when_group_taken_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:9 storage:9 .4.s:r .5.s:d")), equalTo(indices(3))); // _not_ {3, 4}
+ }
+
+ @Test
+ public void down_to_down_edge_keeps_group_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(2).eachWithNodeCount(4), 0.76);
+
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8 .1.s:d")), equalTo(indices(0, 2, 3)));
+
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8 .1.s:d .2.s:d")), equalTo(indices(0, 3)));
+ }
+
+ // Cluster state representations "prune" downed nodes at the end of the state,
+ // causing "storage:6 .5.s:d" to be reduced to "storage:5". This still implies a
+ // node is down according to the distribution config and must be handled as such.
+ @Test
+ public void implicitly_downed_node_at_state_end_is_counted_as_explicitly_down() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:6 storage:5")), equalTo(indices(4)));
+ }
+
+ @Test
+ public void non_uniform_group_sizes_are_supported() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroupNodes(1, 2, 3, 4), 0.67);
+
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:10")), equalTo(emptySet()));
+ // Group 0 has only 1 node and should not cause any other nodes to be taken down
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:10 .0.s:d")), equalTo(emptySet()));
+ // Too little availability in group 1
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:10 .1.s:d")), equalTo(indices(2)));
+ // Too little availability in group 2
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:10 .3.s:d")), equalTo(indices(4, 5)));
+ // Group 4 has 75% availability (>= 67%), so no auto take-down there
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:10 .7.s:d")), equalTo(emptySet()));
+ // Drop group 4 availability to 50%; it should now be taken down entirely
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:10 storage:9 .7.s:d")), equalTo(indices(6, 8)));
+ }
+
+ @Test
+ public void min_ratio_of_zero_never_takes_down_groups_implicitly() {
+ GroupAvailabilityCalculator calc = calcForHierarchicCluster(
+ DistributionBuilder.withGroups(2).eachWithNodeCount(4), 0.0);
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8")), equalTo(emptySet()));
+ // 1 down in each group
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8 .0.s:d .4.s:d")), equalTo(emptySet()));
+ // 2 down in each group
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8 .0.s:d .1.s:d .4.s:d .5.s:d")), equalTo(emptySet()));
+ // 3 down in each group
+ assertThat(calc.nodesThatShouldBeDown(clusterState(
+ "distributor:8 storage:8 .0.s:d .1.s:d .2.s:d .4.s:d .5.s:d .6.s:d")), equalTo(emptySet()));
+ }
+
+}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
index 10305de116a..ef4c80a2256 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java
@@ -18,38 +18,6 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest
private final Set<Integer> nodeIndices = asIntSet(0, 1, 2, 3);
private final int foreignNode = 6;
- private void waitForStateExcludingNodeSubset(String expectedState, Set<Integer> excludedNodes) throws Exception {
- // Due to the implementation details of the test base, this.waitForState() will always
- // wait until all nodes added in the test have received the latest cluster state. Since we
- // want to entirely ignore node #6, it won't get a cluster state at all and the test will
- // fail unless otherwise handled. We thus use a custom waiter which filters out nodes with
- // the sneaky index (storage and distributors with same index are treated as different nodes
- // in this context).
- Waiter subsetWaiter = new Waiter.Impl(new DataRetriever() {
- @Override
- public Object getMonitor() { return timer; }
- @Override
- public FleetController getFleetController() { return fleetController; }
- @Override
- public List<DummyVdsNode> getDummyNodes() {
- return nodes.stream()
- .filter(n -> !excludedNodes.contains(n.getNode().getIndex()))
- .collect(Collectors.toList());
- }
- @Override
- public int getTimeoutMS() { return timeoutMS; }
- });
- subsetWaiter.waitForState(expectedState);
- }
-
- private static Set<Integer> asIntSet(Integer... idx) {
- return Arrays.asList(idx).stream().collect(Collectors.toSet());
- }
-
- private static Set<ConfiguredNode> asConfiguredNodes(Set<Integer> indices) {
- return indices.stream().map(idx -> new ConfiguredNode(idx, false)).collect(Collectors.toSet());
- }
-
private void setUpClusterWithForeignNode(Set<Integer> validIndices, final int foreignNodeIndex) throws Exception {
final Set<ConfiguredNode> configuredNodes = asConfiguredNodes(validIndices);
FleetControllerOptions options = optionsForConfiguredNodes(configuredNodes);
@@ -63,6 +31,7 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest
FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes);
options.maxSlobrokDisconnectGracePeriod = 60 * 1000;
options.nodeStateRequestTimeoutMS = 10000 * 60 * 1000;
+ options.maxTransitionTime = transitionTimes(0);
return options;
}
@@ -108,10 +77,14 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest
assertTrue(configuredNodes.remove(new ConfiguredNode(0, true)));
fleetController.updateOptions(options, 0);
- // The previously retired node should now be marked as done, as it no longer
+ // The previously retired node should now be marked as down, as it no longer
// exists from the point of view of the content cluster. We have to use a subset
// state waiter, as the controller will not send the new state to node 0.
waitForStateExcludingNodeSubset("version:\\d+ distributor:4 .0.s:d storage:4 .0.s:d", asIntSet(0));
+
+ // Ensure it remains down for subsequent cluster state versions as well.
+ nodes.get(3).disconnect();
+ waitForStateExcludingNodeSubset("version:\\d+ distributor:4 .0.s:d storage:4 .0.s:d .1.s:d", asIntSet(0, 1));
}
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index cb5cee70486..5b53e524102 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -4,15 +4,16 @@ package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.vdslib.distribution.ConfiguredNode;
import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.distribution.Group;
+import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
-import org.junit.Before;
import org.junit.Test;
+import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -29,7 +30,7 @@ public class NodeStateChangeCheckerTest {
private static final int minStorageNodesUp = 3;
private static final int requiredRedundancy = 4;
- private static final int currentClusterState = 2;
+ private static final int currentClusterStateVersion = 2;
private static final double minRatioOfStorageNodesUp = 0.9;
private static final Node nodeDistributor = new Node(NodeType.DISTRIBUTOR, 1);
@@ -42,6 +43,18 @@ public class NodeStateChangeCheckerTest {
return new NodeState(NodeType.STORAGE, state).setDescription(description);
}
+ private static ClusterState clusterState(String state) {
+ try {
+ return new ClusterState(state);
+ } catch (ParseException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static ClusterState defaultAllUpClusterState() {
+ return clusterState(String.format("version:%d distributor:4 storage:4", currentClusterStateVersion));
+ }
+
private NodeStateChangeChecker createChangeChecker(ContentCluster cluster) {
return new NodeStateChangeChecker(minStorageNodesUp, minRatioOfStorageNodesUp, requiredRedundancy, cluster.clusterInfo());
}
@@ -93,12 +106,22 @@ public class NodeStateChangeCheckerTest {
"}\n";
}
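+ // Marks every configured distributor and storage node as reporting Up (with
+ // valid distributor host info), so tests can construct scenarios where the
+ // reported state is Up but the generated cluster state is Down.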
+ private void markAllNodesAsReportingStateUp(ContentCluster cluster) {
+ final ClusterInfo clusterInfo = cluster.clusterInfo();
+ final int configuredNodeCount = cluster.clusterInfo().getConfiguredNodes().size();
+ for (int i = 0; i < configuredNodeCount; i++) {
+ clusterInfo.getDistributorNodeInfo(i).setReportedState(new NodeState(NodeType.DISTRIBUTOR, State.UP), 0);
+ clusterInfo.getDistributorNodeInfo(i).setHostInfo(HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6)));
+ clusterInfo.getStorageNodeInfo(i).setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0);
+ }
+ }
+
@Test
public void testCanUpgradeForce() {
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(createNodes(1)));
NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING);
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeDistributor, currentClusterState, SetUnitStateRequest.Condition.FORCE,
+ nodeDistributor, defaultAllUpClusterState(), SetUnitStateRequest.Condition.FORCE,
upNodeState, newState);
assertTrue(result.settingWantedStateIsAllowed());
assertTrue(!result.wantedStateAlreadySet());
@@ -108,7 +131,7 @@ public class NodeStateChangeCheckerTest {
public void testSafeSetStateDistributors() {
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(createNodes(1)));
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeDistributor, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeDistributor, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
upNodeState, maintenanceNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -122,7 +145,7 @@ public class NodeStateChangeCheckerTest {
NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker(
5 /* min storage nodes */, minRatioOfStorageNodesUp, requiredRedundancy, cluster.clusterInfo());
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
upNodeState, maintenanceNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -143,12 +166,32 @@ public class NodeStateChangeCheckerTest {
// Not setting nodes up -> all are down
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
maintenanceNodeState, upNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
+ // A node may be reported as Up but have a generated state of Down if it is among
+ // the nodes taken down implicitly because its group has too low node availability.
+ @Test
+ public void testSetUpSucceedsIfReportedIsUpButGeneratedIsDown() {
+ ContentCluster cluster = createCluster(createNodes(4));
+ NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
+
+ markAllNodesAsReportingStateUp(cluster);
+
+ ClusterState stateWithNodeDown = clusterState(String.format(
+ "version:%d distributor:4 storage:4 .%d.s:d",
+ currentClusterStateVersion, nodeStorage.getIndex()));
+
+ NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
+ nodeStorage, stateWithNodeDown, SetUnitStateRequest.Condition.SAFE,
+ maintenanceNodeState, upNodeState);
+ assertTrue(result.settingWantedStateIsAllowed());
+ assertFalse(result.wantedStateAlreadySet());
+ }
+
@Test
public void testCannotSetUpIfUnknownOldStateAndReportedIsDown() {
ContentCluster cluster = createCluster(createNodes(4));
@@ -156,7 +199,7 @@ public class NodeStateChangeCheckerTest {
// Not setting nodes up -> all are down
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
new NodeState(NodeType.STORAGE, State.DOWN), upNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -170,7 +213,7 @@ public class NodeStateChangeCheckerTest {
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6)));
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
upNodeState, maintenanceNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -185,7 +228,7 @@ public class NodeStateChangeCheckerTest {
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6)));
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- new Node(NodeType.STORAGE, 3), currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ new Node(NodeType.STORAGE, 3), defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
upNodeState, maintenanceNodeState);
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -209,7 +252,7 @@ public class NodeStateChangeCheckerTest {
setAllNodesUp(cluster, HostInfo.createHostInfo(hostInfo));
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- new Node(NodeType.STORAGE, 1), currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ new Node(NodeType.STORAGE, 1), defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
upNodeState, maintenanceNodeState);
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
@@ -222,7 +265,7 @@ public class NodeStateChangeCheckerTest {
cluster.clusterInfo().getStorageNodeInfo(1).setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0);
NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState);
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
assertThat(result.getReason(), is("Distributor node (0) has not reported any cluster state version yet."));
@@ -235,7 +278,7 @@ public class NodeStateChangeCheckerTest {
NodeState currentNodeState = createNodeState(state, oldDescription);
NodeState newNodeState = createNodeState(state, newDescription);
return nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE,
+ nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE,
currentNodeState, newNodeState);
}
@@ -280,13 +323,16 @@ public class NodeStateChangeCheckerTest {
cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(NodeType.STORAGE, state), 0);
}
+ ClusterState clusterState = defaultAllUpClusterState();
+
if (storageNodeIndex >= 0) { // TODO: Move this into the calling test
NodeState downNodeState = new NodeState(NodeType.STORAGE, State.DOWN);
cluster.clusterInfo().getStorageNodeInfo(storageNodeIndex).setReportedState(downNodeState, 4 /* time */);
+ clusterState.setNodeState(new Node(NodeType.STORAGE, storageNodeIndex), downNodeState);
}
return nodeStateChangeChecker.evaluateTransition(
- nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState);
+ nodeStorage, clusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState);
}
private void setAllNodesUp(ContentCluster cluster, HostInfo distributorHostInfo) {
@@ -339,6 +385,28 @@ public class NodeStateChangeCheckerTest {
assertThat(result.getReason(), containsString("Not enough storage nodes running"));
}
+ @Test
+ public void testNodeRatioRequirementConsidersGeneratedNodeStates() {
+ ContentCluster cluster = createCluster(createNodes(4));
+ NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
+
+ markAllNodesAsReportingStateUp(cluster);
+
+ // Both minRatioOfStorageNodesUp and minStorageNodesUp imply that a single node being
+ // in state Down should halt the upgrade. This must also take into account the generated
+ // state, not just the reported state. In this case, all nodes are reported as being Up
+ // but one node has a generated state of Down.
+ ClusterState stateWithNodeDown = clusterState(String.format(
+ "version:%d distributor:4 storage:4 .3.s:d",
+ currentClusterStateVersion));
+
+ NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition(
+ nodeStorage, stateWithNodeDown, SetUnitStateRequest.Condition.SAFE,
+ upNodeState, maintenanceNodeState);
+ assertFalse(result.settingWantedStateIsAllowed());
+ assertFalse(result.wantedStateAlreadySet());
+ }
+
private List<ConfiguredNode> createNodes(int count) {
List<ConfiguredNode> nodes = new ArrayList<>();
for (int i = 0; i < count; i++)
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java
index ab6185d2b56..35118933b42 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java
@@ -99,7 +99,7 @@ public class SystemStateGeneratorTest extends TestCase {
}
private void assertNewClusterStateReceived() {
- assertTrue(generator.notifyIfNewSystemState(systemStateListener));
+ assertTrue(generator.notifyIfNewSystemState(cluster, systemStateListener));
assertTrue(systemStateListener.toString(), systemStateListener.states.size() == 1);
systemStateListener.states.clear();
}
@@ -147,7 +147,7 @@ public class SystemStateGeneratorTest extends TestCase {
private void verifyClusterStateChanged(Node node, State state) {
log.info("Verifying cluster state has been updated for " + node + " to " + state);
- assertTrue(generator.notifyIfNewSystemState(systemStateListener));
+ assertTrue(generator.notifyIfNewSystemState(cluster, systemStateListener));
assertEquals(1, systemStateListener.states.size());
assertEquals(state, systemStateListener.states.get(0).getNodeState(node).getState());
systemStateListener.states.clear();
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java b/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java
index fd1bc8a3362..4a0ce99914a 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java
@@ -31,8 +31,12 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
ModelElement tuning = null;
ModelElement clusterTuning = clusterElement.getChild("tuning");
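+ // Tuning values are optional; they stay null when the <tuning> element is
+ // absent, in which case the config builder falls back to its defaults.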
+ Integer bucketSplittingMinimumBits = null;
+ Double minNodeRatioPerGroup = null;
if (clusterTuning != null) {
tuning = clusterTuning.getChild("cluster-controller");
+ minNodeRatioPerGroup = clusterTuning.childAsDouble("min-node-ratio-per-group");
+ bucketSplittingMinimumBits = clusterTuning.childAsInteger("bucket-splitting.minimum-bits");
}
if (tuning != null) {
@@ -43,10 +47,11 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
tuning.childAsDuration("stable-state-period"),
tuning.childAsDouble("min-distributor-up-ratio"),
tuning.childAsDouble("min-storage-up-ratio"),
- clusterElement.childAsInteger("tuning.bucket-splitting.minimum-bits"));
+ bucketSplittingMinimumBits,
+ minNodeRatioPerGroup);
} else {
return new ClusterControllerConfig(ancestor, clusterName, null, null, null, null, null, null,
- clusterElement.childAsInteger("tuning.bucket-splitting.minimum-bits"));
+ bucketSplittingMinimumBits, minNodeRatioPerGroup);
}
}
}
@@ -59,7 +64,9 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
Double minDistributorUpRatio;
Double minStorageUpRatio;
Integer minSplitBits;
+ private Double minNodeRatioPerGroup;
+ // TODO refactor; too many args
private ClusterControllerConfig(AbstractConfigProducer parent,
String clusterName,
Duration initProgressTime,
@@ -68,7 +75,8 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
Duration stableStateTimePeriod,
Double minDistributorUpRatio,
Double minStorageUpRatio,
- Integer minSplitBits) {
+ Integer minSplitBits,
+ Double minNodeRatioPerGroup) {
super(parent, "fleetcontroller");
this.clusterName = clusterName;
@@ -79,6 +87,7 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
this.minDistributorUpRatio = minDistributorUpRatio;
this.minStorageUpRatio = minStorageUpRatio;
this.minSplitBits = minSplitBits;
+ this.minNodeRatioPerGroup = minNodeRatioPerGroup;
}
@Override
@@ -117,5 +126,8 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F
if (minSplitBits != null) {
builder.ideal_distribution_bits(minSplitBits);
}
+ if (minNodeRatioPerGroup != null) {
+ builder.min_node_ratio_per_group(minNodeRatioPerGroup);
+ }
}
}
diff --git a/config-model/src/main/resources/schema/content.rnc b/config-model/src/main/resources/schema/content.rnc
index 36a8dd276ef..1c20acdc0bd 100644
--- a/config-model/src/main/resources/schema/content.rnc
+++ b/config-model/src/main/resources/schema/content.rnc
@@ -60,6 +60,10 @@ PersistenceThreads = element persistence-threads {
Thread+
}
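+# Configured in services.xml under the content cluster's <tuning> element, e.g.
+# <min-node-ratio-per-group>0.75</min-node-ratio-per-group>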
+MinNodeRatioPerGroup = element min-node-ratio-per-group {
+ xsd:double { minInclusive = "0" maxInclusive = "1" }
+}
+
ClusterControllerTuning = element cluster-controller {
element init-progress-time { xsd:string { pattern = "([0-9\.]+)\s*([a-z]+)?" } }? &
element transition-time { xsd:string { pattern = "([0-9\.]+)\s*([a-z]+)?" } }? &
@@ -85,7 +89,8 @@ ClusterTuning = element tuning {
VisitorTuning? &
ClusterControllerTuning? &
Maintenance? &
- PersistenceThreads?
+ PersistenceThreads? &
+ MinNodeRatioPerGroup?
}
Content = element content {
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java b/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java
index 204491b0724..98cc4e5cc47 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java
@@ -10,13 +10,6 @@ import org.w3c.dom.Document;
import static org.junit.Assert.assertEquals;
-/**
- * Created with IntelliJ IDEA.
- * User: thomasg
- * Date: 5/10/12
- * Time: 2:29 PM
- * To change this template use File | Settings | File Templates.
- */
public class FleetControllerClusterTest {
ClusterControllerConfig parse(String xml) {
Document doc = XML.getDocument(xml);
@@ -69,4 +62,31 @@ public class FleetControllerClusterTest {
FleetcontrollerConfig config = new FleetcontrollerConfig(builder);
assertEquals(13, config.init_progress_time());
}
+
+ @Test
+ public void min_node_ratio_per_group_tuning_config_is_propagated() {
+ FleetcontrollerConfig.Builder builder = new FleetcontrollerConfig.Builder();
+ parse("<cluster id=\"storage\">\n" +
+ " <documents/>\n" +
+ " <tuning>\n" +
+ " <min-node-ratio-per-group>0.75</min-node-ratio-per-group>\n" +
+ " </tuning>\n" +
+ "</cluster>").
+ getConfig(builder);
+
+ FleetcontrollerConfig config = new FleetcontrollerConfig(builder);
+ assertEquals(0.75, config.min_node_ratio_per_group(), 0.01);
+ }
+
+ @Test
+ public void min_node_ratio_per_group_is_implicitly_zero_when_omitted() {
+ FleetcontrollerConfig.Builder builder = new FleetcontrollerConfig.Builder();
+ parse("<cluster id=\"storage\">\n" +
+ " <documents/>\n" +
+ "</cluster>").
+ getConfig(builder);
+
+ FleetcontrollerConfig config = new FleetcontrollerConfig(builder);
+ assertEquals(0.0, config.min_node_ratio_per_group(), 0.01);
+ }
}
diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def
index 919e9fc2209..0862beac64a 100644
--- a/configdefinitions/src/vespa/fleetcontroller.def
+++ b/configdefinitions/src/vespa/fleetcontroller.def
@@ -138,3 +138,11 @@ show_local_systemstates_in_event_log bool default=true
## The ideal number of distribution bits this system should have
ideal_distribution_bits int default=16
+
+## Minimum ratio of nodes that have to be available (i.e. not Down) in any
+## hierarchic content cluster group. If the ratio of available nodes in a
+## group drops below this threshold, the remaining nodes in the group will
+## automatically be marked as Down. Group nodes will automatically be taken
+## back up as soon as node availability has been restored above the given
+## threshold.
+## Default is 0, i.e. the functionality is for all intents and purposes disabled.
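+## Example: with a ratio of 0.67 in a group of 3 nodes, one node going down
+## leaves 2/3 (~0.667) of the group available, which is below the threshold,
+## so the remaining two nodes are implicitly marked Down as well. Availability
+## exactly equal to the configured ratio keeps the group up (closed interval).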
+min_node_ratio_per_group double default=0.0