diff options
22 files changed, 1411 insertions, 187 deletions
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java index 07b0a68520b..6b02debc182 100644 --- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java +++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java @@ -69,6 +69,7 @@ public class ClusterControllerClusterConfigurer { options.minTimeBetweenNewSystemStates = config.min_time_between_new_systemstates(); options.maxSlobrokDisconnectGracePeriod = (int) (config.max_slobrok_disconnect_grace_period() * 1000); options.distributionBits = config.ideal_distribution_bits(); + options.minNodeRatioPerGroup = config.min_node_ratio_per_group(); } private void configure(SlobroksConfig config) { diff --git a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java index 6cf4f962c9f..79da2574dbc 100644 --- a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java +++ b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java @@ -22,7 +22,11 @@ public class ClusterControllerClusterConfigurerTest extends TestCase { group.nodes.add(node); distributionConfig.group.add(group); FleetcontrollerConfig.Builder fleetcontrollerConfig = new FleetcontrollerConfig.Builder(); - fleetcontrollerConfig.cluster_name("storage").index(0).zookeeper_server("zoo"); + fleetcontrollerConfig 
+ .cluster_name("storage") + .index(0) + .zookeeper_server("zoo") + .min_node_ratio_per_group(0.123); SlobroksConfig.Builder slobroksConfig = new SlobroksConfig.Builder(); SlobroksConfig.Slobrok.Builder slobrok = new SlobroksConfig.Slobrok.Builder(); slobrok.connectionspec("foo"); @@ -47,6 +51,7 @@ public class ClusterControllerClusterConfigurerTest extends TestCase { metric ); assertTrue(configurer.getOptions() != null); + assertEquals(0.123, configurer.getOptions().minNodeRatioPerGroup, 0.01); // Oki with no zookeeper if one node zookeepersConfig.zookeeperserverlist(""); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java index 67681f87d92..98648451e1d 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java @@ -191,7 +191,7 @@ public class ContentCluster { * @param newState state wanted to be set @return NodeUpgradePrechecker.Response */ public NodeStateChangeChecker.Result calculateEffectOfNewState( - Node node, int clusterState, SetUnitStateRequest.Condition condition, NodeState oldState, NodeState newState) { + Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition, NodeState oldState, NodeState newState) { NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker( minStorageNodesUp, diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 572e24bcb35..ceeeddf49fa 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ 
b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -416,8 +416,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd systemStateGenerator.setStableStateTimePeriod(options.stableStateTimePeriod); systemStateGenerator.setMinNodesUp(options.minDistributorNodesUp, options.minStorageNodesUp, options.minRatioOfDistributorNodesUp, options.minRatioOfStorageNodesUp); + systemStateGenerator.setMinNodeRatioPerGroup(options.minNodeRatioPerGroup); systemStateGenerator.setMaxSlobrokDisconnectGracePeriod(options.maxSlobrokDisconnectGracePeriod); systemStateGenerator.setDistributionBits(options.distributionBits); + systemStateGenerator.setDistribution(options.storageDistribution); masterElectionHandler.setFleetControllerCount(options.fleetControllerCount); masterElectionHandler.setMasterZooKeeperCooldownPeriod(options.masterZooKeeperCooldownPeriod); @@ -426,9 +428,9 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd try{ rpcServer.setSlobrokConnectionSpecs(options.slobrokConnectionSpecs, options.rpcPort); } catch (ListenFailedException e) { - log.log(LogLevel.WARNING, "Failed to bind RPC server to port " + options.rpcPort +". This may be natural if cluster have altered the services running on this node: " + e.getMessage()); + log.log(LogLevel.WARNING, "Failed to bind RPC server to port " + options.rpcPort +". This may be natural if cluster has altered the services running on this node: " + e.getMessage()); } catch (Exception e) { - log.log(LogLevel.WARNING, "Failed to initailize RPC server socket: " + e.getMessage()); + log.log(LogLevel.WARNING, "Failed to initialize RPC server socket: " + e.getMessage()); } } @@ -436,7 +438,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd try{ statusPageServer.setPort(options.httpPort); } catch (Exception e) { - log.log(LogLevel.WARNING, "Failed to initialize status server socket. 
This may be natural if cluster have altered the services running on this node: " + e.getMessage()); + log.log(LogLevel.WARNING, "Failed to initialize status server socket. This may be natural if cluster has altered the services running on this node: " + e.getMessage()); } } @@ -503,7 +505,6 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd didWork |= systemStateBroadcaster.processResponses(); if (masterElectionHandler.isMaster()) { didWork |= broadcastClusterStateToEligibleNodes(); - } didWork |= processAnyPendingStatusPageRequest(); @@ -637,7 +638,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd // Send getNodeState requests to zero or more nodes. didWork |= stateGatherer.sendMessages(cluster, communicator, this); didWork |= systemStateGenerator.watchTimers(cluster, this); - didWork |= systemStateGenerator.notifyIfNewSystemState(this); + didWork |= systemStateGenerator.notifyIfNewSystemState(cluster, this); if ( ! isStateGatherer) { if ( ! isMaster) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index f18fa6d3a9b..a2449449a05 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -69,6 +69,16 @@ public class FleetControllerOptions implements Cloneable { public double minRatioOfStorageNodesUp = 0.50; /** + * Minimum ratio of nodes in an "available" state (up, initializing or maintenance) + * that shall be present in a group for the group itself to be considered available. + * If the ratio of available nodes drops under this limit, the group's nodes will be + * implicitly taken down. 
+ * + * A value of 0.0 implies group auto-takedown feature is effectively disabled. + */ + public double minNodeRatioPerGroup = 0.0; + + /** * Milliseconds to sleep after doing a work cycle where we did no work. Some events do not interrupt the sleeping, * such as slobrok changes, so shouldn't set this too high. */ diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java new file mode 100644 index 00000000000..74b15b61ac3 --- /dev/null +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java @@ -0,0 +1,117 @@ +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.distribution.ConfiguredNode; +import com.yahoo.vdslib.distribution.Distribution; +import com.yahoo.vdslib.distribution.Group; +import com.yahoo.vdslib.distribution.GroupVisitor; +import com.yahoo.vdslib.state.ClusterState; +import com.yahoo.vdslib.state.Node; +import com.yahoo.vdslib.state.NodeState; +import com.yahoo.vdslib.state.NodeType; +import com.yahoo.vdslib.state.State; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Stream; + +class GroupAvailabilityCalculator { + private final Distribution distribution; + private final double minNodeRatioPerGroup; + + private GroupAvailabilityCalculator(Distribution distribution, + double minNodeRatioPerGroup) + { + this.distribution = distribution; + this.minNodeRatioPerGroup = minNodeRatioPerGroup; + } + + public static class Builder { + private Distribution distribution; + private double minNodeRatioPerGroup = 1.0; + + Builder withDistribution(Distribution distribution) { + this.distribution = distribution; + return this; + } + Builder withMinNodeRatioPerGroup(double minRatio) { + this.minNodeRatioPerGroup = minRatio; + return this; + } + GroupAvailabilityCalculator build() { + 
return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup); + } + } + + public static Builder builder() { + return new Builder(); + } + + private class InsufficientAvailabilityGroupVisitor implements GroupVisitor { + private final Set<Integer> implicitlyDown = new HashSet<>(); + private final ClusterState clusterState; + + public InsufficientAvailabilityGroupVisitor(ClusterState clusterState) { + this.clusterState = clusterState; + } + + private boolean nodeIsAvailableInState(final int index, final String states) { + return clusterState.getNodeState(new Node(NodeType.STORAGE, index)).getState().oneOf(states); + } + + private Stream<ConfiguredNode> availableNodesIn(Group g) { + // We consider nodes in states (u)p, (i)nitializing, (m)aintenance as being + // available from the perspective of taking entire groups down (even though + // maintenance mode is a half-truth in this regard). + return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "uim")); + } + + private Stream<ConfiguredNode> candidateNodesForSettingDown(Group g) { + // We don't implicitly set (m)aintenance nodes down, as these are usually set + // in maintenance for a good reason (e.g. orchestration or manual reboot). + // Similarly, we don't take down (r)etired nodes as these may contain data + // that the rest of the cluster needs. + return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "ui")); + } + + private double computeGroupAvailability(Group g) { + // TODO also look at distributors + final long availableNodes = availableNodesIn(g).count(); + // Model should make it impossible to deploy with zero nodes in a group, + // so no div by zero risk. 
+ return availableNodes / (double)g.getNodes().size(); + } + + private void markAllAvailableGroupNodeIndicesAsDown(Group group) { + candidateNodesForSettingDown(group).forEach(n -> implicitlyDown.add(n.index())); + } + + @Override + public boolean visitGroup(Group group) { + if (group.isLeafGroup()) { + if (computeGroupAvailability(group) < minNodeRatioPerGroup) { + markAllAvailableGroupNodeIndicesAsDown(group); + } + } + return true; + } + + Set<Integer> implicitlyDownNodeIndices() { + return implicitlyDown; + } + } + + private static boolean isFlatCluster(Group root) { + return root.isLeafGroup(); + } + + public Set<Integer> nodesThatShouldBeDown(ClusterState state) { + if (isFlatCluster(distribution.getRootGroup())) { + // Implicit group takedown only applies to hierarchic cluster setups. + return new HashSet<>(); + } + InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(state); + distribution.visitGroups(visitor); + return visitor.implicitlyDownNodeIndices(); + } + +} diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index f312194c15d..12018ac4b6f 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -1,6 +1,7 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.clustercontroller.core; +import com.yahoo.vdslib.state.ClusterState; import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; @@ -79,7 +80,7 @@ public class NodeStateChangeChecker { } public Result evaluateTransition( - Node node, int clusterStateVersion, SetUnitStateRequest.Condition condition, + Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition, NodeState oldState, NodeState newState) { if (condition == SetUnitStateRequest.Condition.FORCE) { return Result.allowSettingOfWantedState(); @@ -107,7 +108,7 @@ public class NodeStateChangeChecker { case UP: return canSetStateUp(node, oldState.getState()); case MAINTENANCE: - return canSetStateMaintenance(node, clusterStateVersion); + return canSetStateMaintenance(node, clusterState); default: return Result.createDisallowed("Safe only supports state UP and MAINTENANCE, you tried: " + newState); } @@ -127,17 +128,17 @@ public class NodeStateChangeChecker { return Result.allowSettingOfWantedState(); } - private Result canSetStateMaintenance(Node node, int clusterStateVersion) { + private Result canSetStateMaintenance(Node node, ClusterState clusterState) { NodeInfo nodeInfo = clusterInfo.getNodeInfo(node); if (nodeInfo == null) { return Result.createDisallowed("Unknown node " + node); } - NodeState reportedState = nodeInfo.getReportedState(); - if (reportedState.getState() == State.DOWN) { + NodeState currentState = clusterState.getNodeState(node); + if (currentState.getState() == State.DOWN) { return Result.allowSettingOfWantedState(); } - Result checkDistributorsResult = checkDistributors(node, clusterStateVersion); + Result checkDistributorsResult = checkDistributors(node, clusterState.getVersion()); if (!checkDistributorsResult.settingWantedStateIsAllowed()) { return checkDistributorsResult; } @@ -151,7 +152,7 @@ public class NodeStateChangeChecker { return Result.createDisallowed("There are only " + 
clusterInfo.getStorageNodeInfo().size() + " storage nodes up, while config requires at least " + minStorageNodesUp); } - Result fractionCheck = isFractionHighEnough(); + Result fractionCheck = isFractionHighEnough(clusterState); if (!fractionCheck.settingWantedStateIsAllowed()) { return fractionCheck; } @@ -168,16 +169,23 @@ public class NodeStateChangeChecker { return Result.allowSettingOfWantedState(); } - private Result isFractionHighEnough() { + private int contentNodesWithAvailableNodeState(ClusterState clusterState) { + final int nodeCount = clusterState.getNodeCount(NodeType.STORAGE); int upNodesCount = 0; - int nodesCount = 0; - for (StorageNodeInfo storageNodeInfo : clusterInfo.getStorageNodeInfo()) { - nodesCount++; - State state = storageNodeInfo.getReportedState().getState(); + for (int i = 0; i < nodeCount; ++i) { + final Node node = new Node(NodeType.STORAGE, i); + final State state = clusterState.getNodeState(node).getState(); if (state == State.UP || state == State.RETIRED || state == State.INITIALIZING) { upNodesCount++; } } + return upNodesCount; + } + + private Result isFractionHighEnough(ClusterState clusterState) { + final int nodesCount = clusterInfo.getStorageNodeInfo().size(); + final int upNodesCount = contentNodesWithAvailableNodeState(clusterState); + if (nodesCount == 0) { return Result.createDisallowed("No storage nodes in cluster state, not safe to restart."); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java index 5cf88b68f29..3dd1216d5d0 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.clustercontroller.core; import com.yahoo.jrt.Spec; import 
com.yahoo.log.LogLevel; import com.yahoo.vdslib.distribution.ConfiguredNode; +import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.state.*; import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; @@ -18,7 +19,8 @@ import java.util.stream.Collectors; /** * This class get node state updates and uses them to decide the cluster state. */ - // TODO: Remove all current state from this and make it rely on state from ClusterInfo instead +// TODO: Remove all current state from this and make it rely on state from ClusterInfo instead +// TODO: Do this ASAP! SystemStateGenerator should ideally behave as a pure function! public class SystemStateGenerator { private static Logger log = Logger.getLogger(SystemStateGenerator.class.getName()); @@ -27,6 +29,7 @@ public class SystemStateGenerator { private final EventLogInterface eventLog; private ClusterStateView currentClusterStateView; private ClusterStateView nextClusterStateView; + private Distribution distribution; private boolean nextStateViewChanged = false; private boolean isMaster = false; @@ -41,6 +44,7 @@ public class SystemStateGenerator { private int minStorageNodesUp = 1; private double minRatioOfDistributorNodesUp = 0.50; private double minRatioOfStorageNodesUp = 0.50; + private double minNodeRatioPerGroup = 0.0; private int maxSlobrokDisconnectGracePeriod = 1000; private int idealDistributionBits = 16; private static final boolean disableUnstableNodes = true; @@ -116,17 +120,18 @@ public class SystemStateGenerator { minStorageNodesUp = minStorNodes; minRatioOfDistributorNodesUp = minDistRatio; minRatioOfStorageNodesUp = minStorRatio; - nextStateViewChanged = true; // ... 
maybe + nextStateViewChanged = true; + } + + public void setMinNodeRatioPerGroup(double upRatio) { + this.minNodeRatioPerGroup = upRatio; + nextStateViewChanged = true; } /** Sets the nodes of this and attempts to keep the node state in sync */ public void setNodes(ClusterInfo newClusterInfo) { this.nodes = new HashSet<>(newClusterInfo.getConfiguredNodes().values()); - // Nodes that are removed from config will be automatically marked as DOWN - // in the cluster state by createNextVersionOfClusterStateView, ensuring - // that these are not carried over into new cluster states. - for (ConfiguredNode node : this.nodes) { NodeInfo newNodeInfo = newClusterInfo.getStorageNodeInfo(node.index()); NodeState currentState = currentClusterStateView.getClusterState().getNodeState(new Node(NodeType.STORAGE, node.index())); @@ -134,6 +139,23 @@ public class SystemStateGenerator { proposeNewNodeState(newNodeInfo, new NodeState(NodeType.STORAGE, node.retired() ? State.RETIRED : State.UP)); } } + + // Ensure that any nodes that have been removed from the config are also + // promptly removed from the next (and subsequent) generated cluster states. 
+ pruneAllNodesNotContainedInConfig(); + + nextStateViewChanged = true; + } + + private void pruneAllNodesNotContainedInConfig() { + Set<Integer> configuredIndices = this.nodes.stream().map(ConfiguredNode::index).collect(Collectors.toSet()); + final ClusterState candidateNextState = nextClusterStateView.getClusterState(); + pruneNodesNotContainedInConfig(candidateNextState, configuredIndices, NodeType.DISTRIBUTOR); + pruneNodesNotContainedInConfig(candidateNextState, configuredIndices, NodeType.STORAGE); + } + + public void setDistribution(Distribution distribution) { + this.distribution = distribution; nextStateViewChanged = true; } @@ -196,14 +218,13 @@ public class SystemStateGenerator { return state; } - private Event getDownDueToTooFewNodesEvent() { - Event clusterEvent = null; + private Optional<Event> getDownDueToTooFewNodesEvent(ClusterState nextClusterState) { int upStorageCount = 0, upDistributorCount = 0; int dcount = nodes.size(); int scount = nodes.size(); for (NodeType type : NodeType.getTypes()) { for (ConfiguredNode node : nodes) { - NodeState ns = nextClusterStateView.getClusterState().getNodeState(new Node(type, node.index())); + NodeState ns = nextClusterState.getNodeState(new Node(type, node.index())); if (ns.getState() == State.UP || ns.getState() == State.RETIRED || ns.getState() == State.INITIALIZING) { if (type.equals(NodeType.STORAGE)) ++upStorageCount; @@ -215,46 +236,149 @@ public class SystemStateGenerator { long timeNow = timer.getCurrentTimeInMillis(); if (upStorageCount < minStorageNodesUp) { - clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, + return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, "Less than " + minStorageNodesUp + " storage nodes available (" + upStorageCount + "). 
Setting cluster state down.", - timeNow); + timeNow)); } if (upDistributorCount < minDistributorNodesUp) { - clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, + return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, "Less than " + minDistributorNodesUp + " distributor nodes available (" + upDistributorCount + "). Setting cluster state down.", - timeNow); + timeNow)); } if (minRatioOfStorageNodesUp * scount > upStorageCount) { - clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, + return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, "Less than " + (100 * minRatioOfStorageNodesUp) + " % of storage nodes are available (" + upStorageCount + "/" + scount + "). Setting cluster state down.", - timeNow); + timeNow)); } if (minRatioOfDistributorNodesUp * dcount > upDistributorCount) { - clusterEvent = new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, + return Optional.of(new ClusterEvent(ClusterEvent.Type.SYSTEMSTATE, "Less than " + (100 * minRatioOfDistributorNodesUp) + " % of distributor nodes are available (" + upDistributorCount + "/" + dcount + "). Setting cluster state down.", - timeNow); + timeNow)); + } + return Optional.empty(); + } + + private static Node storageNode(int index) { + return new Node(NodeType.STORAGE, index); + } + + private void performImplicitStorageNodeStateTransitions(ClusterState candidateState, ContentCluster cluster) { + if (distribution == null) { + return; // FIXME due to tests that don't bother setting distr config! Never happens in prod. + } + // First clear the states of any nodes that according to reported/wanted state alone + // should have their states cleared. We might still take these down again based on the + // decisions of the group availability calculator, but this way we ensure that groups + // that no longer should be down will have their nodes implicitly made available again. + // TODO this will be void once SystemStateGenerator has been rewritten to be stateless. 
+ final Set<Integer> clearedNodes = clearDownStateForStorageNodesThatCanBeUp(candidateState, cluster); + + final GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder() + .withMinNodeRatioPerGroup(minNodeRatioPerGroup) + .withDistribution(distribution) + .build(); + final Set<Integer> nodesToTakeDown = calc.nodesThatShouldBeDown(candidateState); + markNodesAsDownDueToGroupUnavailability(cluster, candidateState, nodesToTakeDown, clearedNodes); + + clearedNodes.removeAll(nodesToTakeDown); + logEventsForNodesThatWereTakenUp(clearedNodes, cluster); + } + + private void logEventsForNodesThatWereTakenUp(Set<Integer> newlyUpNodes, ContentCluster cluster) { + newlyUpNodes.forEach(i -> { + final NodeInfo info = cluster.getNodeInfo(storageNode(i)); // Should always be non-null here. + // TODO the fact that this only happens for group up events is implementation specific + // should generalize this if we get other such events. + eventLog.addNodeOnlyEvent(new NodeEvent(info, + "Group availability restored; taking node back up", + NodeEvent.Type.CURRENT, timer.getCurrentTimeInMillis()), LogLevel.INFO); + }); + } + + private void markNodesAsDownDueToGroupUnavailability(ContentCluster cluster, + ClusterState candidateState, + Set<Integer> nodesToTakeDown, + Set<Integer> clearedNodes) + { + for (Integer idx : nodesToTakeDown) { + final Node node = storageNode(idx); + NodeState newState = new NodeState(NodeType.STORAGE, State.DOWN); + newState.setDescription("group node availability below configured threshold"); + candidateState.setNodeState(node, newState); + + logNodeGroupDownEdgeEventOnce(clearedNodes, node, cluster); + } + } + + private void logNodeGroupDownEdgeEventOnce(Set<Integer> clearedNodes, Node node, ContentCluster cluster) { + final NodeInfo nodeInfo = cluster.getNodeInfo(node); + // If clearedNodes contains the index it means we're just re-downing a node + // that was previously down. 
If this is the case, we'd cause a duplicate + // event if we logged it now as well. + if (nodeInfo != null && !clearedNodes.contains(node.getIndex())) { + eventLog.addNodeOnlyEvent(new NodeEvent(nodeInfo, + "Setting node down as the total availability of its group is " + + "below the configured threshold", + NodeEvent.Type.CURRENT, timer.getCurrentTimeInMillis()), LogLevel.INFO); } - return clusterEvent; } - private ClusterStateView createNextVersionOfClusterStateView(Event clusterEvent) { + private NodeState baselineNodeState(NodeInfo info) { + NodeState reported = info.getReportedState(); + NodeState wanted = info.getWantedState(); + + final NodeState baseline = reported.clone(); + if (wanted.getState() != State.UP) { + baseline.setDescription(wanted.getDescription()); + if (reported.above(wanted)) { + baseline.setState(wanted.getState()); + } + } + return baseline; + } + + // Returns set of nodes whose state was cleared + private Set<Integer> clearDownStateForStorageNodesThatCanBeUp( + ClusterState candidateState, ContentCluster cluster) + { + final int nodeCount = candidateState.getNodeCount(NodeType.STORAGE); + final Set<Integer> clearedNodes = new HashSet<>(); + for (int i = 0; i < nodeCount; ++i) { + final Node node = storageNode(i); + final NodeInfo info = cluster.getNodeInfo(node); + final NodeState currentState = candidateState.getNodeState(node); + if (currentState.getState() == State.DOWN) { + if (mayClearNodeDownState(info)) { + candidateState.setNodeState(node, baselineNodeState(info)); + clearedNodes.add(i); + } + } + } + return clearedNodes; + } + + private boolean mayClearNodeDownState(NodeInfo info) { + if (info == null) { + // Nothing known about node in cluster info; we definitely don't want it + // to be taken up at this point. 
+ return false; + } + return (info.getReportedState().getState() != State.DOWN + && info.getWantedState().getState().oneOf("ur")); + } + + private ClusterStateView createNextVersionOfClusterStateView(ContentCluster cluster) { // If you change this method, see *) in notifyIfNewSystemState ClusterStateView candidateClusterStateView = nextClusterStateView.cloneForNewState(); ClusterState candidateClusterState = candidateClusterStateView.getClusterState(); - candidateClusterState.setClusterState(clusterEvent == null ? State.UP : State.DOWN); - int currentDistributionBits = calculateMinDistributionBitCount(); if (currentDistributionBits != nextClusterStateView.getClusterState().getDistributionBitCount()) { candidateClusterState.setDistributionBits(currentDistributionBits); } - - Set<Integer> configuredIndices = this.nodes.stream().map(ConfiguredNode::index).collect(Collectors.toSet()); - - pruneNodesNotContainedInConfig(candidateClusterState, configuredIndices, NodeType.DISTRIBUTOR); - pruneNodesNotContainedInConfig(candidateClusterState, configuredIndices, NodeType.STORAGE); + performImplicitStorageNodeStateTransitions(candidateClusterState, cluster); return candidateClusterStateView; } @@ -310,12 +434,29 @@ public class SystemStateGenerator { } } - public boolean notifyIfNewSystemState(SystemStateListener stateListener) { + private void mergeIntoNextClusterState(ClusterState sourceState) { + final ClusterState nextState = nextClusterStateView.getClusterState(); + final int nodeCount = sourceState.getNodeCount(NodeType.STORAGE); + for (int i = 0; i < nodeCount; ++i) { + final Node node = storageNode(i); + final NodeState stateInSource = sourceState.getNodeState(node); + final NodeState stateInTarget = nextState.getNodeState(node); + if (stateInSource.getState() != stateInTarget.getState()) { + nextState.setNodeState(node, stateInSource); + } + } + } + + public boolean notifyIfNewSystemState(ContentCluster cluster, SystemStateListener stateListener) { if ( ! 
nextStateViewChanged) return false; - Event clusterEvent = getDownDueToTooFewNodesEvent(); - ClusterStateView newClusterStateView = createNextVersionOfClusterStateView(clusterEvent); + ClusterStateView newClusterStateView = createNextVersionOfClusterStateView(cluster); + ClusterState newClusterState = newClusterStateView.getClusterState(); + // Creating the next version of the state may implicitly take down nodes, so our checks + // for taking the entire cluster down must happen _after_ this + Optional<Event> clusterDown = getDownDueToTooFewNodesEvent(newClusterState); + newClusterState.setClusterState(clusterDown.isPresent() ? State.DOWN : State.UP); if (newClusterState.similarTo(currentClusterStateView.getClusterState())) { log.log(LogLevel.DEBUG, @@ -329,7 +470,7 @@ public class SystemStateGenerator { newClusterState.setVersion(currentClusterStateView.getClusterState().getVersion() + 1); recordNewClusterStateHasBeenChosen(currentClusterStateView.getClusterState(), - newClusterStateView.getClusterState(), clusterEvent); + newClusterStateView.getClusterState(), clusterDown.orElse(null)); // *) Ensure next state is still up to date. // This should make nextClusterStateView a deep-copy of currentClusterStateView. @@ -338,6 +479,7 @@ public class SystemStateGenerator { // This seems like a hack... 
nextClusterStateView.getClusterState().setDistributionBits(newClusterState.getDistributionBitCount()); nextClusterStateView.getClusterState().setClusterState(newClusterState.getClusterState()); + mergeIntoNextClusterState(newClusterState); currentClusterStateView = newClusterStateView; nextStateViewChanged = false; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index d6dd6faa60d..1b97123d636 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -80,12 +80,11 @@ public class SetNodeStateRequest extends Request<SetResponse> { NodeState wantedState = nodeInfo.getUserWantedState(); NodeState newWantedState = getRequestedNodeState(newStates, node); - int version = currentClusterState.getVersion(); NodeStateChangeChecker.Result result = cluster.calculateEffectOfNewState( - node, version, condition, wantedState, newWantedState); + node, currentClusterState, condition, wantedState, newWantedState); log.log(LogLevel.DEBUG, "node=" + node + - " version=" + version + + " current-cluster-state=" + currentClusterState + // Includes version in output format " condition=" + condition + " wanted-state=" + wantedState + " new-wanted-state=" + newWantedState + diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java new file mode 100644 index 00000000000..aca26000931 --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFixture.java @@ -0,0 +1,136 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.distribution.ConfiguredNode; +import com.yahoo.vdslib.distribution.Distribution; +import com.yahoo.vdslib.state.Node; +import com.yahoo.vdslib.state.NodeState; +import com.yahoo.vdslib.state.NodeType; +import com.yahoo.vdslib.state.State; +import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler; +import com.yahoo.vespa.clustercontroller.core.mocks.TestEventLog; +import com.yahoo.vespa.clustercontroller.utils.util.NoMetricReporter; +import com.yahoo.vespa.config.content.StorDistributionConfig; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.mockito.Mockito.mock; + +class ClusterFixture { + public final ContentCluster cluster; + public final Distribution distribution; + public final FakeTimer timer; + public final EventLogInterface eventLog; + public final SystemStateGenerator generator; + + public ClusterFixture(ContentCluster cluster, Distribution distribution) { + this.cluster = cluster; + this.distribution = distribution; + this.timer = new FakeTimer(); + this.eventLog = mock(EventLogInterface.class); + this.generator = createGeneratorForFixtureCluster(); + } + + public SystemStateGenerator createGeneratorForFixtureCluster() { + final int controllerIndex = 0; + MetricUpdater metricUpdater = new MetricUpdater(new NoMetricReporter(), controllerIndex); + SystemStateGenerator generator = new SystemStateGenerator(timer, eventLog, metricUpdater); + generator.setNodes(cluster.clusterInfo()); + generator.setDistribution(distribution); + return generator; + } + + public void bringEntireClusterUp() { + cluster.clusterInfo().getConfiguredNodes().forEach((idx, node) -> { + reportStorageNodeState(idx, State.UP); + 
reportDistributorNodeState(idx, State.UP); + }); + } + + public void reportStorageNodeState(final int index, State state) { + final Node node = new Node(NodeType.STORAGE, index); + final NodeState nodeState = new NodeState(NodeType.STORAGE, state); + nodeState.setDescription("mockdesc"); + NodeStateOrHostInfoChangeHandler handler = mock(NodeStateOrHostInfoChangeHandler.class); + NodeInfo nodeInfo = cluster.getNodeInfo(node); + + generator.handleNewReportedNodeState(nodeInfo, nodeState, handler); + nodeInfo.setReportedState(nodeState, timer.getCurrentTimeInMillis()); + } + + public void reportStorageNodeState(final int index, NodeState nodeState) { + final Node node = new Node(NodeType.STORAGE, index); + final NodeInfo nodeInfo = cluster.getNodeInfo(node); + final long mockTime = 1234; + NodeStateOrHostInfoChangeHandler changeListener = mock(NodeStateOrHostInfoChangeHandler.class); + generator.handleNewReportedNodeState(nodeInfo, nodeState, changeListener); + nodeInfo.setReportedState(nodeState, mockTime); + } + + public void reportDistributorNodeState(final int index, State state) { + final Node node = new Node(NodeType.DISTRIBUTOR, index); + final NodeState nodeState = new NodeState(NodeType.DISTRIBUTOR, state); + NodeStateOrHostInfoChangeHandler handler = mock(NodeStateOrHostInfoChangeHandler.class); + NodeInfo nodeInfo = cluster.getNodeInfo(node); + + generator.handleNewReportedNodeState(nodeInfo, nodeState, handler); + nodeInfo.setReportedState(nodeState, timer.getCurrentTimeInMillis()); + } + + public void proposeStorageNodeWantedState(final int index, State state) { + final Node node = new Node(NodeType.STORAGE, index); + final NodeState nodeState = new NodeState(NodeType.STORAGE, state); + nodeState.setDescription("mockdesc"); + NodeInfo nodeInfo = cluster.getNodeInfo(node); + nodeInfo.setWantedState(nodeState); + + generator.proposeNewNodeState(nodeInfo, nodeState); + + } + + public void disableAutoClusterTakedown() { + generator.setMinNodesUp(0, 0, 0.0, 
0.0); + } + + public void disableTransientMaintenanceModeOnDown() { + Map<NodeType, Integer> maxTransitionTime = new TreeMap<>(); + maxTransitionTime.put(NodeType.DISTRIBUTOR, 0); + maxTransitionTime.put(NodeType.STORAGE, 0); + generator.setMaxTransitionTime(maxTransitionTime); + } + + public void enableTransientMaintenanceModeOnDown(final int transitionTime) { + Map<NodeType, Integer> maxTransitionTime = new TreeMap<>(); + maxTransitionTime.put(NodeType.DISTRIBUTOR, transitionTime); + maxTransitionTime.put(NodeType.STORAGE, transitionTime); + generator.setMaxTransitionTime(maxTransitionTime); + } + + public String generatedClusterState() { + return generator.getClusterState().toString(); + } + + public String verboseGeneratedClusterState() { return generator.getClusterState().toString(true); } + + public static ClusterFixture forFlatCluster(int nodeCount) { + Collection<ConfiguredNode> nodes = DistributionBuilder.buildConfiguredNodes(nodeCount); + + Distribution distribution = DistributionBuilder.forFlatCluster(nodeCount); + ContentCluster cluster = new ContentCluster("foo", nodes, distribution, 0, 0.0); + + return new ClusterFixture(cluster, distribution); + } + + public static ClusterFixture forHierarchicCluster(DistributionBuilder.GroupBuilder root) { + List<ConfiguredNode> nodes = DistributionBuilder.buildConfiguredNodes(root.totalNodeCount()); + Distribution distribution = DistributionBuilder.forHierarchicCluster(root); + ContentCluster cluster = new ContentCluster("foo", nodes, distribution, 0, 0.0); + + return new ClusterFixture(cluster, distribution); + } +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java new file mode 100644 index 00000000000..ebd87d08403 --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/DistributionBuilder.java @@ -0,0 +1,107 
@@ +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.distribution.ConfiguredNode; +import com.yahoo.vdslib.distribution.Distribution; +import com.yahoo.vespa.config.content.StorDistributionConfig; + +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class DistributionBuilder { + // TODO support nested groups + public static class GroupBuilder { + final int groupCount; + public List<Integer> groupsWithNodeCount; + + public GroupBuilder(int groupCount) { + this.groupCount = groupCount; + } + + public GroupBuilder(int... nodeCounts) { + this.groupCount = nodeCounts.length; + this.groupsWithNodeCount = IntStream.of(nodeCounts).boxed() + .collect(Collectors.toList()); + } + + public GroupBuilder eachWithNodeCount(int nodeCount) { + groupsWithNodeCount = IntStream.range(0, groupCount) + .map(i -> nodeCount).boxed() + .collect(Collectors.toList()); + return this; + } + + public int totalNodeCount() { + return groupsWithNodeCount.stream().reduce(0, Integer::sum); + } + + public String groupDistributionSpec() { + return IntStream.range(0, groupCount).mapToObj(i -> "1") + .collect(Collectors.joining("|")) + "|*"; + } + } + + public static GroupBuilder withGroups(int groups) { + return new GroupBuilder(groups); + } + + public static GroupBuilder withGroupNodes(int... 
nodeCounts) { + return new GroupBuilder(nodeCounts); + } + + public static List<ConfiguredNode> buildConfiguredNodes(int nodeCount) { + return IntStream.range(0, nodeCount) + .mapToObj(i -> new ConfiguredNode(i, false)) + .collect(Collectors.toList()); + } + + private static StorDistributionConfig.Group.Nodes.Builder configuredNode(ConfiguredNode node) { + StorDistributionConfig.Group.Nodes.Builder builder = new StorDistributionConfig.Group.Nodes.Builder(); + builder.index(node.index()); + return builder; + } + + private static StorDistributionConfig.Group.Builder configuredGroup( + String name, int index, Collection<ConfiguredNode> nodes) { + StorDistributionConfig.Group.Builder builder = new StorDistributionConfig.Group.Builder(); + builder.name(name); + builder.index(Integer.toString(index)); + nodes.forEach(n -> builder.nodes(configuredNode(n))); + return builder; + } + + public static Distribution forFlatCluster(int nodeCount) { + Collection<ConfiguredNode> nodes = buildConfiguredNodes(nodeCount); + + StorDistributionConfig.Builder configBuilder = new StorDistributionConfig.Builder(); + configBuilder.redundancy(2); + configBuilder.group(configuredGroup("bar", 0, nodes)); + + return new Distribution(new StorDistributionConfig(configBuilder)); + } + + public static Distribution forHierarchicCluster(GroupBuilder root) { + List<ConfiguredNode> nodes = buildConfiguredNodes(root.totalNodeCount()); + + StorDistributionConfig.Builder configBuilder = new StorDistributionConfig.Builder(); + configBuilder.redundancy(2); + + StorDistributionConfig.Group.Builder rootBuilder = new StorDistributionConfig.Group.Builder(); + rootBuilder.name("invalid"); + rootBuilder.index("invalid"); + rootBuilder.partitions(root.groupDistributionSpec()); + configBuilder.group(rootBuilder); + + int offset = 0; + for (int group = 0; group < root.groupsWithNodeCount.size(); ++group) { + int nodeCount = root.groupsWithNodeCount.get(group); + StorDistributionConfig.Group.Builder groupBuilder + = 
configuredGroup("group_" + (group + 1), group + 1, nodes.subList(offset, offset + nodeCount)); + configBuilder.group(groupBuilder); + offset += nodeCount; + } + + return new Distribution(new StorDistributionConfig(configBuilder)); + } +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java index 86248d2e1e3..f4b3e648f63 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FleetControllerTest.java @@ -10,6 +10,7 @@ import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.state.ClusterState; import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; +import com.yahoo.vdslib.state.NodeType; import com.yahoo.vespa.clustercontroller.core.database.DatabaseHandler; import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.rpc.RPCCommunicator; @@ -232,90 +233,43 @@ public abstract class FleetControllerTest implements Waiter { return nodes; } - public interface NodeModifier { - void modify(NodeInfo node); + protected static Set<Integer> asIntSet(Integer... 
idx) { + return Arrays.asList(idx).stream().collect(Collectors.toSet()); } - NodeModifier makeDefaultTestNodeModifier() { - return new NodeModifier() { - @Override - public void modify(NodeInfo node) { - if (node.isDistributor()) { - if (node.getNodeIndex() == 13) { - node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes + 2); - } - return; - } - double latency = 75; - long count = 1000; - if (node.getNodeIndex() == 4) { - latency = 300; - count = 500; - } else if (node.getNodeIndex() == 7) { - latency = 120; - count = 800; - } else if (node.getNodeIndex() == 21) { - latency = 2000; - count = 600; - } else if (node.getNodeIndex() == 25) { - node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes + 1); - } else if (node.getNodeIndex() == 26) { - node.setPrematureCrashCount(fleetController.getOptions().maxPrematureCrashes); - } - String hostInfoString = generateHostInfo(latency, count); - node.setHostInfo(HostInfo.createHostInfo(hostInfoString)); - } - }; + protected static Set<ConfiguredNode> asConfiguredNodes(Set<Integer> indices) { + return indices.stream().map(idx -> new ConfiguredNode(idx, false)).collect(Collectors.toSet()); } - NodeModifier makeStdDevTestNodeModifier() { - return new NodeModifier() { - double[] latencies = new double[] { 30, 300, 60, 270 }; - int counter = 0; - + protected void waitForStateExcludingNodeSubset(String expectedState, Set<Integer> excludedNodes) throws Exception { + // Due to the implementation details of the test base, this.waitForState() will always + // wait until all nodes added in the test have received the latest cluster state. Since we + // want to entirely ignore node #6, it won't get a cluster state at all and the test will + // fail unless otherwise handled. We thus use a custom waiter which filters out nodes with + // the sneaky index (storage and distributors with same index are treated as different nodes + // in this context). 
+ Waiter subsetWaiter = new Waiter.Impl(new DataRetriever() { @Override - public void modify(NodeInfo node) { - if (node.isDistributor()) { - return; - } - String hostInfo = generateHostInfo(latencies[counter++ % latencies.length], 1500); - node.setHostInfo(HostInfo.createHostInfo(hostInfo)); + public Object getMonitor() { return timer; } + @Override + public FleetController getFleetController() { return fleetController; } + @Override + public List<DummyVdsNode> getDummyNodes() { + return nodes.stream() + .filter(n -> !excludedNodes.contains(n.getNode().getIndex())) + .collect(Collectors.toList()); } - }; - } - - protected void setUpSlowDiskCluster(NodeModifier callback) throws Exception { - int nodeCount = 31; - FleetControllerOptions options = new FleetControllerOptions("mycluster"); - // TODO: multiple groups! - options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, nodeCount))); - setUpFleetController(true, options); - setUpVdsNodes(true, new DummyVdsNodeOptions(), false, nodeCount); - waitForStableSystem(nodeCount); - // Set one node as not being up. It should not contribute to the overall - // latency or operation metrics, nor should its disks be included. - nodes.get(2*13).disconnectAsShutdown(); - nodes.get(2*21+1).disconnectAsShutdown(); - waiter.waitForState("version:\\d+ distributor:31 .13.s:d storage:31 .21.s:m"); - - for (NodeInfo node : fleetController.getCluster().getNodeInfo()) { - callback.modify(node); - } + @Override + public int getTimeoutMS() { return timeoutMS; } + }); + subsetWaiter.waitForState(expectedState); } - protected void setUpSimpleCluster(int nodeCount) throws Exception { - FleetControllerOptions options = new FleetControllerOptions("mycluster"); - // TODO: multiple groups! 
- options.setStorageDistribution(new Distribution(Distribution.getDefaultDistributionConfig(3, nodeCount))); - setUpFleetController(true, options); - setUpVdsNodes(true, new DummyVdsNodeOptions(), false, nodeCount); - waitForStableSystem(nodeCount); - waiter.waitForState("version:\\d+ distributor:" + nodeCount + " storage:" + nodeCount); - - NodeModifier callback = makeDefaultTestNodeModifier(); - for (NodeInfo node : fleetController.getCluster().getNodeInfo()) { - callback.modify(node); - } + protected static Map<NodeType, Integer> transitionTimes(int milliseconds) { + Map<NodeType, Integer> maxTransitionTime = new TreeMap<>(); + maxTransitionTime.put(NodeType.DISTRIBUTOR, milliseconds); + maxTransitionTime.put(NodeType.STORAGE, milliseconds); + return maxTransitionTime; } protected void tearDownSystem() throws Exception { diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java new file mode 100644 index 00000000000..64bea57c6ad --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownLiveConfigTest.java @@ -0,0 +1,100 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.state.NodeType; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertFalse; + +public class GroupAutoTakedownLiveConfigTest extends FleetControllerTest { + + private long mockConfigGeneration = 1; + + + private static FleetControllerOptions createOptions( + DistributionBuilder.GroupBuilder groupBuilder, double minNodeRatio) + { + FleetControllerOptions options = new FleetControllerOptions("mycluster"); + options.setStorageDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder)); + options.nodes = DistributionBuilder.buildConfiguredNodes(groupBuilder.totalNodeCount()) + .stream().collect(Collectors.toSet()); + options.minNodeRatioPerGroup = minNodeRatio; + options.maxTransitionTime = transitionTimes(0); + return options; + } + + private void updateConfigLive(FleetControllerOptions newOptions) { + ++mockConfigGeneration; + this.fleetController.updateOptions(newOptions, mockConfigGeneration); + } + + private void reconfigureWithMinNodeRatio(double minNodeRatio) { + FleetControllerOptions newOptions = this.options.clone(); + newOptions.minNodeRatioPerGroup = minNodeRatio; + updateConfigLive(newOptions); + } + + private void reconfigureWithDistribution(DistributionBuilder.GroupBuilder groupBuilder) { + FleetControllerOptions newOptions = this.options.clone(); + newOptions.nodes = DistributionBuilder.buildConfiguredNodes(groupBuilder.totalNodeCount()) + .stream().collect(Collectors.toSet()); + newOptions.storageDistribution = DistributionBuilder.forHierarchicCluster(groupBuilder); + updateConfigLive(newOptions); + } + + private void setUp3x3ClusterWithMinNodeRatio(double minNodeRatio) throws Exception { + FleetControllerOptions options = createOptions( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), + minNodeRatio); + setUpFleetController(true, options); + setUpVdsNodes(true, 
new DummyVdsNodeOptions(), false, 9); + waitForState("version:\\d+ distributor:9 storage:9"); + } + + private void takeDownContentNode(int index) { + // nodes list contains both distributors and storage nodes, with distributors + // in even positions and storage nodes in odd positions. + final int arrayIndex = index*2 + 1; + assertFalse(nodes.get(arrayIndex).isDistributor()); + nodes.get(arrayIndex).disconnect(); + } + + @Test + public void bootstrap_min_ratio_option_is_propagated_to_group_availability_logic() throws Exception { + setUp3x3ClusterWithMinNodeRatio(0.67); + takeDownContentNode(0); + waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .0.s:d .1.s:d .2.s:d", asIntSet(0)); + } + + @Test + public void min_ratio_live_reconfig_immediately_takes_effect() throws Exception { + // Initially, arbitrarily many nodes may be down in a group. + setUp3x3ClusterWithMinNodeRatio(0.0); + takeDownContentNode(3); + waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d", asIntSet(3)); + + reconfigureWithMinNodeRatio(0.67); + waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d .4.s:d .5.s:d", asIntSet(3)); + + reconfigureWithMinNodeRatio(0.0); + // Aaaand back up again! + waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .3.s:d", asIntSet(3)); + } + + @Test + public void live_distribution_config_changes_trigger_cluster_state_change() throws Exception { + setUp3x3ClusterWithMinNodeRatio(0.65); + takeDownContentNode(6); + + // Not enough nodes down to trigger group take-down yet + waitForStateExcludingNodeSubset("version:\\d+ distributor:9 storage:9 .6.s:d", asIntSet(6)); + // Removing a node from the same group as node 6 will dip it under the configured threshold, + // taking down the entire group. In this case we configure out node 8. 
+ reconfigureWithDistribution(DistributionBuilder.withGroupNodes(3, 3, 2)); + waitForStateExcludingNodeSubset("version:\\d+ distributor:8 storage:6", asIntSet(6, 8)); + } +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java new file mode 100644 index 00000000000..93e34b1f772 --- /dev/null +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAutoTakedownTest.java @@ -0,0 +1,370 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.distribution.ConfiguredNode; +import com.yahoo.vdslib.state.DiskState; +import com.yahoo.vdslib.state.Node; +import com.yahoo.vdslib.state.NodeState; +import com.yahoo.vdslib.state.NodeType; +import com.yahoo.vdslib.state.State; +import com.yahoo.vespa.clustercontroller.core.listeners.NodeStateOrHostInfoChangeHandler; +import com.yahoo.vespa.clustercontroller.core.listeners.SystemStateListener; +import org.junit.Test; +import org.mockito.ArgumentMatcher; + +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.core.AllOf.allOf; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.argThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; + +public class GroupAutoTakedownTest { + + private static ClusterFixture createFixtureForAllUpFlatCluster(int nodeCount, double minNodeRatioPerGroup) { + ClusterFixture fixture = ClusterFixture.forFlatCluster(nodeCount); + setSharedFixtureOptions(fixture, minNodeRatioPerGroup); + return fixture; + } + + private static ClusterFixture 
createFixtureForAllUpHierarchicCluster( + DistributionBuilder.GroupBuilder root, double minNodeRatioPerGroup) + { + ClusterFixture fixture = ClusterFixture.forHierarchicCluster(root); + setSharedFixtureOptions(fixture, minNodeRatioPerGroup); + return fixture; + } + + private static void setSharedFixtureOptions(ClusterFixture fixture, double minNodeRatioPerGroup) { + fixture.generator.setMinNodeRatioPerGroup(minNodeRatioPerGroup); + fixture.disableTransientMaintenanceModeOnDown(); + fixture.disableAutoClusterTakedown(); + fixture.bringEntireClusterUp(); + } + + private String stateAfterStorageTransition(ClusterFixture fixture, final int index, final State state) { + transitionStoreNodeToState(fixture, index, state); + return fixture.generatedClusterState(); + } + + private String verboseStateAfterStorageTransition(ClusterFixture fixture, final int index, final State state) { + transitionStoreNodeToState(fixture, index, state); + return fixture.verboseGeneratedClusterState(); + } + + private void transitionStoreNodeToState(ClusterFixture fixture, int index, State state) { + fixture.reportStorageNodeState(index, state); + SystemStateListener listener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + } + + /** + * Use a per-group availability requirement ratio of 99%. Ensure that taking down a single + * node out of 5 in a flat hierarchy does not take down the cluster, i.e. the config should + * not apply to a flat structure. 
+ */ + @Test + public void config_does_not_apply_to_flat_hierarchy_clusters() { + ClusterFixture fixture = createFixtureForAllUpFlatCluster(5, 0.99); + + SystemStateListener listener = mock(SystemStateListener.class); + // First invocation; generates initial state and clears "new state" flag + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + assertEquals("version:1 distributor:5 storage:5", fixture.generatedClusterState()); + + assertEquals("version:2 distributor:5 storage:5 .1.s:d", + stateAfterStorageTransition(fixture, 1, State.DOWN)); + } + + @Test + public void group_node_down_edge_implicitly_marks_down_rest_of_nodes_in_group() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + SystemStateListener listener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + assertEquals("version:1 distributor:6 storage:6", fixture.generatedClusterState()); + + // Same group as node 4 + assertEquals("version:2 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + // Same group as node 1 + assertEquals("version:3 distributor:6 storage:4 .0.s:d .1.s:d", + stateAfterStorageTransition(fixture, 0, State.DOWN)); + } + + @Test + public void restored_group_node_availability_takes_group_back_up_automatically() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + // Group #2 -> down + assertEquals("version:1 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + + // Group #2 -> back up again + assertEquals("version:2 distributor:6 storage:6", + stateAfterStorageTransition(fixture, 5, State.UP)); + } + + @Test + public void no_op_for_downed_nodes_in_already_downed_group() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + 
DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + assertEquals("version:1 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + + // 4, 5 in same group; this should not cause a new state since it's already implicitly down + fixture.reportStorageNodeState(4, State.DOWN); + + SystemStateListener listener = mock(SystemStateListener.class); + assertFalse(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + + assertEquals("version:1 distributor:6 storage:4", fixture.generatedClusterState()); + } + + @Test + public void verbose_node_state_description_updated_for_implicitly_downed_nodes() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.75); + + // Nodes 6 and 7 are taken down implicitly and should have a message reflecting this. + // Node 8 is taken down by the fixture and gets a fixture-assigned message that + // we should _not_ lose/overwrite. + assertEquals("version:1 distributor:9 storage:9 .6.s:d " + + ".6.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " + + ".7.s:d " + + ".7.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " + + ".8.s:d .8.m:mockdesc", + verboseStateAfterStorageTransition(fixture, 8, State.DOWN)); + } + + @Test + public void legacy_cluster_wide_availabilty_ratio_is_computed_after_group_takedowns() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + fixture.generator.setMinNodesUp(5, 5, 0.51, 0.51); + + // Taking down a node in a group forces the entire group down, which leaves us with + // only 4 content nodes (vs. minimum of 5 as specified above). The entire cluster + // should be marked as down in this case. 
+ assertEquals("version:1 cluster:d distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + } + + @Test + public void maintenance_wanted_state_not_overwritten() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99); + + NodeInfo nodeInfo = fixture.cluster.getNodeInfo(new Node(NodeType.STORAGE, 5)); + fixture.generator.proposeNewNodeState(nodeInfo, new NodeState(NodeType.STORAGE, State.MAINTENANCE)); + SystemStateListener listener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + + // Maintenance not counted as down, so group still up + assertEquals("version:1 distributor:9 storage:9 .5.s:m", fixture.generatedClusterState()); + + // Group goes down, but maintenance node should still be in maintenance + assertEquals("version:2 distributor:9 storage:9 .3.s:d .4.s:d .5.s:m", + stateAfterStorageTransition(fixture, 4, State.DOWN)); + } + + @Test + public void transient_maintenance_mode_on_down_edge_does_not_take_down_group() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99); + fixture.enableTransientMaintenanceModeOnDown(1000); + + // Our timers are mocked, so taking down node 4 will deterministically transition to + // a transient maintenance mode. Group should not be taken down here. + assertEquals("version:1 distributor:9 storage:9 .4.s:m", + stateAfterStorageTransition(fixture, 4, State.DOWN)); + + // However, once grace period expires the group should be taken down. 
+ fixture.timer.advanceTime(1001); + NodeStateOrHostInfoChangeHandler changeListener = mock(NodeStateOrHostInfoChangeHandler.class); + fixture.generator.watchTimers(fixture.cluster, changeListener); + SystemStateListener stateListener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener)); + + assertEquals("version:2 distributor:9 storage:9 .3.s:d .4.s:d .5.s:d", fixture.generatedClusterState()); + } + + private static class NodeEventWithDescription extends ArgumentMatcher<NodeEvent> { + private final String expected; + + NodeEventWithDescription(String expected) { + this.expected = expected; + } + + @Override + public boolean matches(Object o) { + return expected.equals(((NodeEvent)o).getDescription()); + } + } + + private static NodeEventWithDescription nodeEventWithDescription(String description) { + return new NodeEventWithDescription(description); + } + + private static class EventForNode extends ArgumentMatcher<NodeEvent> { + private final Node expected; + + EventForNode(Node expected) { + this.expected = expected; + } + + @Override + public boolean matches(Object o) { + return ((NodeEvent)o).getNode().getNode().equals(expected); + } + } + + private static EventForNode eventForNode(Node expected) { + return new EventForNode(expected); + } + + private static Node contentNode(int index) { + return new Node(NodeType.STORAGE, index); + } + + @Test + public void taking_down_node_adds_node_specific_event() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + assertEquals("version:1 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + + verify(fixture.eventLog).addNodeOnlyEvent(argThat(allOf( + nodeEventWithDescription("Setting node down as the total availability of its group is " + + "below the configured threshold"), + eventForNode(contentNode(4)))), any()); + } + + @Test + public 
void bringing_node_back_up_adds_node_specific_event() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + assertEquals("version:1 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + assertEquals("version:2 distributor:6 storage:6", + stateAfterStorageTransition(fixture, 5, State.UP)); + + verify(fixture.eventLog).addNodeOnlyEvent(argThat(allOf( + nodeEventWithDescription("Group availability restored; taking node back up"), + eventForNode(contentNode(4)))), any()); + } + + @Test + public void wanted_state_retired_implicitly_down_node_transitioned_it_to_retired_mode_immediately() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99); + + assertEquals("version:1 distributor:9 storage:6", + stateAfterStorageTransition(fixture, 6, State.DOWN)); + // Node 7 is implicitly down. Mark wanted state as retired. It should now be Retired + // but not Down. + fixture.proposeStorageNodeWantedState(7, State.RETIRED); + + SystemStateListener stateListener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener)); + assertEquals("version:2 distributor:9 storage:8 .6.s:d .7.s:r", fixture.generatedClusterState()); + } + + @Test + public void downed_config_retired_node_transitions_back_to_retired_on_up_edge() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.49); + + assertEquals("version:1 distributor:6 storage:6 .4.s:d", + stateAfterStorageTransition(fixture, 4, State.DOWN)); + assertEquals("version:2 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + + // Node 5 gets config-retired under our feet. 
+ Set<ConfiguredNode> nodes = new HashSet<>(fixture.cluster.clusterInfo().getConfiguredNodes().values()); + nodes.remove(new ConfiguredNode(5, false)); + nodes.add(new ConfiguredNode(5, true)); + // TODO this should ideally also set the retired flag in the distribution + // config, but only the ConfiguredNodes are actually looked at currently. + fixture.cluster.setNodes(nodes); + fixture.generator.setNodes(fixture.cluster.clusterInfo()); + + assertEquals("version:3 distributor:6 storage:6 .4.s:d .5.s:r", + stateAfterStorageTransition(fixture, 5, State.UP)); + } + + @Test + public void init_progress_is_preserved_across_group_down_up_edge() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + final Node node = new Node(NodeType.STORAGE, 4); + final NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING); + newState.setInitProgress(0.5); + + fixture.reportStorageNodeState(4, newState); + SystemStateListener stateListener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener)); + + assertEquals("version:1 distributor:6 storage:6 .4.s:i .4.i:0.5", fixture.generatedClusterState()); + + assertEquals("version:2 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + assertEquals("version:3 distributor:6 storage:6 .4.s:i .4.i:0.5", + stateAfterStorageTransition(fixture, 5, State.UP)); + } + + @Test + public void disk_states_are_preserved_across_group_down_up_edge() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + final Node node = new Node(NodeType.STORAGE, 4); + final NodeState newState = new NodeState(NodeType.STORAGE, State.UP); + newState.setDiskCount(7); + newState.setDiskState(5, new DiskState(State.DOWN)); + + fixture.reportStorageNodeState(4, newState); + SystemStateListener 
stateListener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, stateListener)); + + assertEquals("version:1 distributor:6 storage:6 .4.d:7 .4.d.5.s:d", fixture.generatedClusterState()); + + assertEquals("version:2 distributor:6 storage:4", + stateAfterStorageTransition(fixture, 5, State.DOWN)); + assertEquals("version:3 distributor:6 storage:6 .4.d:7 .4.d.5.s:d", + stateAfterStorageTransition(fixture, 5, State.UP)); + } + + @Test + public void down_wanted_state_is_preserved_across_group_down_up_edge() { + ClusterFixture fixture = createFixtureForAllUpHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.60); + + NodeInfo nodeInfo = fixture.cluster.getNodeInfo(new Node(NodeType.STORAGE, 5)); + nodeInfo.setWantedState(new NodeState(NodeType.STORAGE, State.DOWN).setDescription("borkbork")); + fixture.generator.proposeNewNodeState(nodeInfo, nodeInfo.getWantedState()); + SystemStateListener listener = mock(SystemStateListener.class); + assertTrue(fixture.generator.notifyIfNewSystemState(fixture.cluster, listener)); + + assertEquals("version:1 distributor:9 storage:9 .5.s:d .5.m:borkbork", fixture.verboseGeneratedClusterState()); + + assertEquals("version:2 distributor:9 storage:9 " + + ".3.s:d .3.m:group\\x20node\\x20availability\\x20below\\x20configured\\x20threshold " + + ".4.s:d .4.m:mockdesc .5.s:d .5.m:borkbork", + verboseStateAfterStorageTransition(fixture, 4, State.DOWN)); + assertEquals("version:3 distributor:9 storage:9 .5.s:d .5.m:borkbork", + verboseStateAfterStorageTransition(fixture, 4, State.UP)); + } + +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java new file mode 100644 index 00000000000..7d44afda68f --- /dev/null +++ 
b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java @@ -0,0 +1,188 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.clustercontroller.core; + +import com.yahoo.vdslib.state.ClusterState; +import org.junit.Test; + +import java.text.ParseException; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.junit.Assert.assertThat; + +public class GroupAvailabilityCalculatorTest { + + private static ClusterState clusterState(String state) { + try { + return new ClusterState(state); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + + private static GroupAvailabilityCalculator calcForFlatCluster( + final int nodeCount, + final double minNodeRatioPerGroup) + { + return GroupAvailabilityCalculator.builder() + .withDistribution(DistributionBuilder.forFlatCluster(nodeCount)) + .withMinNodeRatioPerGroup(minNodeRatioPerGroup) + .build(); + } + + private static GroupAvailabilityCalculator calcForHierarchicCluster( + DistributionBuilder.GroupBuilder rootGroup, + final double minNodeRatioPerGroup) + { + return GroupAvailabilityCalculator.builder() + .withDistribution(DistributionBuilder.forHierarchicCluster(rootGroup)) + .withMinNodeRatioPerGroup(minNodeRatioPerGroup) + .build(); + } + + private static Set<Integer> indices(Integer... 
idx) { + Set<Integer> indices = new HashSet<>(); + Collections.addAll(indices, idx); + return indices; + } + + private static Set<Integer> emptySet() { return indices(); } + + @Test + public void flat_cluster_does_not_implicitly_take_down_nodes() { + GroupAvailabilityCalculator calc = calcForFlatCluster(5, 0.99); + + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:5 storage:5 .1.s:d .2.s:d")), equalTo(emptySet())); + + } + + @Test + public void group_node_down_edge_implicitly_marks_down_rest_of_nodes_in_group() { + // 3 groups of 2 nodes, take down node #4 (1st node in last group). Since we require + // at least 51% of group capacity to be available, implicitly take down the last group + // entirely. + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.51); + + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:6 .4.s:d")), equalTo(indices(5))); + } + + // Setting 50% as min ratio in a group with 2 nodes should let group be up if + // one node goes down. 
+ @Test + public void min_ratio_per_group_is_closed_interval() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.50); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:6 .4.s:d")), equalTo(emptySet())); + } + + @Test + public void retired_node_is_counted_as_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:6 .1.s:r")), equalTo(indices(0))); + } + + @Test + public void initializing_node_not_counted_as_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:6 .4.s:i")), equalTo(emptySet())); + } + + @Test + public void maintenance_node_not_counted_as_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:6 .4.s:m")), equalTo(emptySet())); + } + + @Test + public void existing_maintenance_node_not_implicitly_downed_when_group_taken_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:9 storage:9 .4.s:m .5.s:d")), equalTo(indices(3))); // _not_ {3, 4} + } + + @Test + public void existing_retired_node_not_implicitly_downed_when_group_taken_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(3), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:9 storage:9 .4.s:r .5.s:d")), equalTo(indices(3))); // _not_ {3, 4} + } + + @Test + public void 
down_to_down_edge_keeps_group_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(2).eachWithNodeCount(4), 0.76); + + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8 .1.s:d")), equalTo(indices(0, 2, 3))); + + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8 .1.s:d .2.s:d")), equalTo(indices(0, 3))); + } + + // Cluster state representations "prune" downed nodes at the end of the state, + // causing "storage:6 .5.s:d" to be reduced to "storage:5". This still implies a + // node is down according to the distribution config and must be handled as such. + @Test + public void implicitly_downed_node_at_state_end_is_counted_as_explicitly_down() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(3).eachWithNodeCount(2), 0.99); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:6 storage:5")), equalTo(indices(4))); + } + + @Test + public void non_uniform_group_sizes_are_supported() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroupNodes(1, 2, 3, 4), 0.67); + + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:10")), equalTo(emptySet())); + // Group 0 has only 1 node and should not cause any other nodes to be taken down + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:10 .0.s:d")), equalTo(emptySet())); + // Too little availability in group 1 + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:10 .1.s:d")), equalTo(indices(2))); + // Too little availability in group 2 + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:10 .3.s:d")), equalTo(indices(4, 5))); + // Group 4 has 75% availability (>= 67%), so no auto take-down there + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:10 .7.s:d")), equalTo(emptySet())); + 
// Drop group 4 availability to 50%; it should now be taken down entirely + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:10 storage:9 .7.s:d")), equalTo(indices(6, 8))); + } + + @Test + public void min_ratio_of_zero_never_takes_down_groups_implicitly() { + GroupAvailabilityCalculator calc = calcForHierarchicCluster( + DistributionBuilder.withGroups(2).eachWithNodeCount(4), 0.0); + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8")), equalTo(emptySet())); + // 1 down in each group + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8 .0.s:d .4.s:d")), equalTo(emptySet())); + // 2 down in each group + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8 .0.s:d .1.s:d .4.s:d .5.s:d")), equalTo(emptySet())); + // 3 down in each group + assertThat(calc.nodesThatShouldBeDown(clusterState( + "distributor:8 storage:8 .0.s:d .1.s:d .2.s:d .4.s:d .5.s:d .6.s:d")), equalTo(emptySet())); + } + +} diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java index 10305de116a..ef4c80a2256 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeSlobrokConfigurationMembershipTest.java @@ -18,38 +18,6 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest private final Set<Integer> nodeIndices = asIntSet(0, 1, 2, 3); private final int foreignNode = 6; - private void waitForStateExcludingNodeSubset(String expectedState, Set<Integer> excludedNodes) throws Exception { - // Due to the implementation details of the test base, this.waitForState() will always - // wait until all nodes added in the test have 
received the latest cluster state. Since we - // want to entirely ignore node #6, it won't get a cluster state at all and the test will - // fail unless otherwise handled. We thus use a custom waiter which filters out nodes with - // the sneaky index (storage and distributors with same index are treated as different nodes - // in this context). - Waiter subsetWaiter = new Waiter.Impl(new DataRetriever() { - @Override - public Object getMonitor() { return timer; } - @Override - public FleetController getFleetController() { return fleetController; } - @Override - public List<DummyVdsNode> getDummyNodes() { - return nodes.stream() - .filter(n -> !excludedNodes.contains(n.getNode().getIndex())) - .collect(Collectors.toList()); - } - @Override - public int getTimeoutMS() { return timeoutMS; } - }); - subsetWaiter.waitForState(expectedState); - } - - private static Set<Integer> asIntSet(Integer... idx) { - return Arrays.asList(idx).stream().collect(Collectors.toSet()); - } - - private static Set<ConfiguredNode> asConfiguredNodes(Set<Integer> indices) { - return indices.stream().map(idx -> new ConfiguredNode(idx, false)).collect(Collectors.toSet()); - } - private void setUpClusterWithForeignNode(Set<Integer> validIndices, final int foreignNodeIndex) throws Exception { final Set<ConfiguredNode> configuredNodes = asConfiguredNodes(validIndices); FleetControllerOptions options = optionsForConfiguredNodes(configuredNodes); @@ -63,6 +31,7 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest FleetControllerOptions options = new FleetControllerOptions("mycluster", configuredNodes); options.maxSlobrokDisconnectGracePeriod = 60 * 1000; options.nodeStateRequestTimeoutMS = 10000 * 60 * 1000; + options.maxTransitionTime = transitionTimes(0); return options; } @@ -108,10 +77,14 @@ public class NodeSlobrokConfigurationMembershipTest extends FleetControllerTest assertTrue(configuredNodes.remove(new ConfiguredNode(0, true))); 
fleetController.updateOptions(options, 0); - // The previously retired node should now be marked as done, as it no longer + // The previously retired node should now be marked as down, as it no longer // exists from the point of view of the content cluster. We have to use a subset // state waiter, as the controller will not send the new state to node 0. waitForStateExcludingNodeSubset("version:\\d+ distributor:4 .0.s:d storage:4 .0.s:d", asIntSet(0)); + + // Ensure it remains down for subsequent cluster state versions as well. + nodes.get(3).disconnect(); + waitForStateExcludingNodeSubset("version:\\d+ distributor:4 .0.s:d storage:4 .0.s:d .1.s:d", asIntSet(0, 1)); } } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index cb5cee70486..5b53e524102 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -4,15 +4,16 @@ package com.yahoo.vespa.clustercontroller.core; import com.yahoo.vdslib.distribution.ConfiguredNode; import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.distribution.Group; +import com.yahoo.vdslib.state.ClusterState; import com.yahoo.vdslib.state.Node; import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vdslib.state.State; import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; -import org.junit.Before; import org.junit.Test; +import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -29,7 +30,7 @@ public class NodeStateChangeCheckerTest { private static final int 
minStorageNodesUp = 3; private static final int requiredRedundancy = 4; - private static final int currentClusterState = 2; + private static final int currentClusterStateVersion = 2; private static final double minRatioOfStorageNodesUp = 0.9; private static final Node nodeDistributor = new Node(NodeType.DISTRIBUTOR, 1); @@ -42,6 +43,18 @@ public class NodeStateChangeCheckerTest { return new NodeState(NodeType.STORAGE, state).setDescription(description); } + private static ClusterState clusterState(String state) { + try { + return new ClusterState(state); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + + private static ClusterState defaultAllUpClusterState() { + return clusterState(String.format("version:%d distributor:4 storage:4", currentClusterStateVersion)); + } + private NodeStateChangeChecker createChangeChecker(ContentCluster cluster) { return new NodeStateChangeChecker(minStorageNodesUp, minRatioOfStorageNodesUp, requiredRedundancy, cluster.clusterInfo()); } @@ -93,12 +106,22 @@ public class NodeStateChangeCheckerTest { "}\n"; } + private void markAllNodesAsReportingStateUp(ContentCluster cluster) { + final ClusterInfo clusterInfo = cluster.clusterInfo(); + final int configuredNodeCount = cluster.clusterInfo().getConfiguredNodes().size(); + for (int i = 0; i < configuredNodeCount; i++) { + clusterInfo.getDistributorNodeInfo(i).setReportedState(new NodeState(NodeType.DISTRIBUTOR, State.UP), 0); + clusterInfo.getDistributorNodeInfo(i).setHostInfo(HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6))); + clusterInfo.getStorageNodeInfo(i).setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); + } + } + @Test public void testCanUpgradeForce() { NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(createNodes(1))); NodeState newState = new NodeState(NodeType.STORAGE, State.INITIALIZING); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeDistributor, 
currentClusterState, SetUnitStateRequest.Condition.FORCE, + nodeDistributor, defaultAllUpClusterState(), SetUnitStateRequest.Condition.FORCE, upNodeState, newState); assertTrue(result.settingWantedStateIsAllowed()); assertTrue(!result.wantedStateAlreadySet()); @@ -108,7 +131,7 @@ public class NodeStateChangeCheckerTest { public void testSafeSetStateDistributors() { NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(createNodes(1))); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeDistributor, currentClusterState, SetUnitStateRequest.Condition.SAFE, + nodeDistributor, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -122,7 +145,7 @@ public class NodeStateChangeCheckerTest { NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker( 5 /* min storage nodes */, minRatioOfStorageNodesUp, requiredRedundancy, cluster.clusterInfo()); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -143,12 +166,32 @@ public class NodeStateChangeCheckerTest { // Not setting nodes up -> all are down NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, maintenanceNodeState, upNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); } + // A node may be reported as Up but have a generated state of Down if it's part of + 
// nodes taken down implicitly due to a group having too low node availability. + @Test + public void testSetUpSucceedsIfReportedIsUpButGeneratedIsDown() { + ContentCluster cluster = createCluster(createNodes(4)); + NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); + + markAllNodesAsReportingStateUp(cluster); + + ClusterState stateWithNodeDown = clusterState(String.format( + "version:%d distributor:4 storage:4 .%d.s:d", + currentClusterStateVersion, nodeStorage.getIndex())); + + NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( + nodeStorage, stateWithNodeDown, SetUnitStateRequest.Condition.SAFE, + maintenanceNodeState, upNodeState); + assertTrue(result.settingWantedStateIsAllowed()); + assertFalse(result.wantedStateAlreadySet()); + } + @Test public void testCannotSetUpIfUnknownOldStateAndReportedIsDown() { ContentCluster cluster = createCluster(createNodes(4)); @@ -156,7 +199,7 @@ public class NodeStateChangeCheckerTest { // Not setting nodes up -> all are down NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, new NodeState(NodeType.STORAGE, State.DOWN), upNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -170,7 +213,7 @@ public class NodeStateChangeCheckerTest { setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6))); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -185,7 +228,7 @@ public class NodeStateChangeCheckerTest { 
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6))); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - new Node(NodeType.STORAGE, 3), currentClusterState, SetUnitStateRequest.Condition.SAFE, + new Node(NodeType.STORAGE, 3), defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertTrue(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -209,7 +252,7 @@ public class NodeStateChangeCheckerTest { setAllNodesUp(cluster, HostInfo.createHostInfo(hostInfo)); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - new Node(NodeType.STORAGE, 1), currentClusterState, SetUnitStateRequest.Condition.SAFE, + new Node(NodeType.STORAGE, 1), defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertTrue(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); @@ -222,7 +265,7 @@ public class NodeStateChangeCheckerTest { cluster.clusterInfo().getStorageNodeInfo(1).setReportedState(new NodeState(NodeType.STORAGE, State.UP), 0); NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); assertThat(result.getReason(), is("Distributor node (0) has not reported any cluster state version yet.")); @@ -235,7 +278,7 @@ public class NodeStateChangeCheckerTest { NodeState currentNodeState = createNodeState(state, oldDescription); NodeState newNodeState = createNodeState(state, newDescription); return nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, 
SetUnitStateRequest.Condition.SAFE, + nodeStorage, defaultAllUpClusterState(), SetUnitStateRequest.Condition.SAFE, currentNodeState, newNodeState); } @@ -280,13 +323,16 @@ public class NodeStateChangeCheckerTest { cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(NodeType.STORAGE, state), 0); } + ClusterState clusterState = defaultAllUpClusterState(); + if (storageNodeIndex >= 0) { // TODO: Move this into the calling test NodeState downNodeState = new NodeState(NodeType.STORAGE, State.DOWN); cluster.clusterInfo().getStorageNodeInfo(storageNodeIndex).setReportedState(downNodeState, 4 /* time */); + clusterState.setNodeState(new Node(NodeType.STORAGE, storageNodeIndex), downNodeState); } return nodeStateChangeChecker.evaluateTransition( - nodeStorage, currentClusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); + nodeStorage, clusterState, SetUnitStateRequest.Condition.SAFE, upNodeState, maintenanceNodeState); } private void setAllNodesUp(ContentCluster cluster, HostInfo distributorHostInfo) { @@ -339,6 +385,28 @@ public class NodeStateChangeCheckerTest { assertThat(result.getReason(), containsString("Not enough storage nodes running")); } + @Test + public void testNodeRatioRequirementConsidersGeneratedNodeStates() { + ContentCluster cluster = createCluster(createNodes(4)); + NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); + + markAllNodesAsReportingStateUp(cluster); + + // Both minRatioOfStorageNodesUp and minStorageNodesUp imply that a single node being + // in state Down should halt the upgrade. This must also take into account the generated + // state, not just the reported state. In this case, all nodes are reported as being Up + // but one node has a generated state of Down. 
+ ClusterState stateWithNodeDown = clusterState(String.format( + "version:%d distributor:4 storage:4 .3.s:d", + currentClusterStateVersion)); + + NodeStateChangeChecker.Result result = nodeStateChangeChecker.evaluateTransition( + nodeStorage, stateWithNodeDown, SetUnitStateRequest.Condition.SAFE, + upNodeState, maintenanceNodeState); + assertFalse(result.settingWantedStateIsAllowed()); + assertFalse(result.wantedStateAlreadySet()); + } + private List<ConfiguredNode> createNodes(int count) { List<ConfiguredNode> nodes = new ArrayList<>(); for (int i = 0; i < count; i++) diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java index ab6185d2b56..35118933b42 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SystemStateGeneratorTest.java @@ -99,7 +99,7 @@ public class SystemStateGeneratorTest extends TestCase { } private void assertNewClusterStateReceived() { - assertTrue(generator.notifyIfNewSystemState(systemStateListener)); + assertTrue(generator.notifyIfNewSystemState(cluster, systemStateListener)); assertTrue(systemStateListener.toString(), systemStateListener.states.size() == 1); systemStateListener.states.clear(); } @@ -147,7 +147,7 @@ public class SystemStateGeneratorTest extends TestCase { private void verifyClusterStateChanged(Node node, State state) { log.info("Verifying cluster state has been updated for " + node + " to " + state); - assertTrue(generator.notifyIfNewSystemState(systemStateListener)); + assertTrue(generator.notifyIfNewSystemState(cluster, systemStateListener)); assertEquals(1, systemStateListener.states.size()); assertEquals(state, systemStateListener.states.get(0).getNodeState(node).getState()); 
systemStateListener.states.clear(); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java b/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java index fd1bc8a3362..4a0ce99914a 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/ClusterControllerConfig.java @@ -31,8 +31,12 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F ModelElement tuning = null; ModelElement clusterTuning = clusterElement.getChild("tuning"); + Integer bucketSplittingMinimumBits = null; + Double minNodeRatioPerGroup = null; if (clusterTuning != null) { tuning = clusterTuning.getChild("cluster-controller"); + minNodeRatioPerGroup = clusterTuning.childAsDouble("min-node-ratio-per-group"); + bucketSplittingMinimumBits = clusterTuning.childAsInteger("bucket-splitting.minimum-bits"); } if (tuning != null) { @@ -43,10 +47,11 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F tuning.childAsDuration("stable-state-period"), tuning.childAsDouble("min-distributor-up-ratio"), tuning.childAsDouble("min-storage-up-ratio"), - clusterElement.childAsInteger("tuning.bucket-splitting.minimum-bits")); + bucketSplittingMinimumBits, + minNodeRatioPerGroup); } else { return new ClusterControllerConfig(ancestor, clusterName, null, null, null, null, null, null, - clusterElement.childAsInteger("tuning.bucket-splitting.minimum-bits")); + bucketSplittingMinimumBits, minNodeRatioPerGroup); } } } @@ -59,7 +64,9 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F Double minDistributorUpRatio; Double minStorageUpRatio; Integer minSplitBits; + private Double minNodeRatioPerGroup; + // TODO refactor; too many args private ClusterControllerConfig(AbstractConfigProducer parent, String clusterName, Duration initProgressTime, @@ -68,7 +75,8 @@ public 
class ClusterControllerConfig extends AbstractConfigProducer implements F Duration stableStateTimePeriod, Double minDistributorUpRatio, Double minStorageUpRatio, - Integer minSplitBits) { + Integer minSplitBits, + Double minNodeRatioPerGroup) { super(parent, "fleetcontroller"); this.clusterName = clusterName; @@ -79,6 +87,7 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F this.minDistributorUpRatio = minDistributorUpRatio; this.minStorageUpRatio = minStorageUpRatio; this.minSplitBits = minSplitBits; + this.minNodeRatioPerGroup = minNodeRatioPerGroup; } @Override @@ -117,5 +126,8 @@ public class ClusterControllerConfig extends AbstractConfigProducer implements F if (minSplitBits != null) { builder.ideal_distribution_bits(minSplitBits); } + if (minNodeRatioPerGroup != null) { + builder.min_node_ratio_per_group(minNodeRatioPerGroup); + } } } diff --git a/config-model/src/main/resources/schema/content.rnc b/config-model/src/main/resources/schema/content.rnc index 36a8dd276ef..1c20acdc0bd 100644 --- a/config-model/src/main/resources/schema/content.rnc +++ b/config-model/src/main/resources/schema/content.rnc @@ -60,6 +60,10 @@ PersistenceThreads = element persistence-threads { Thread+ } +MinNodeRatioPerGroup = element min-node-ratio-per-group { + xsd:double { minInclusive = "0" maxInclusive = "1" } +} + ClusterControllerTuning = element cluster-controller { element init-progress-time { xsd:string { pattern = "([0-9\.]+)\s*([a-z]+)?" } }? & element transition-time { xsd:string { pattern = "([0-9\.]+)\s*([a-z]+)?" } }? & @@ -85,7 +89,8 @@ ClusterTuning = element tuning { VisitorTuning? & ClusterControllerTuning? & Maintenance? & - PersistenceThreads? + PersistenceThreads? & + MinNodeRatioPerGroup? 
} Content = element content { diff --git a/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java b/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java index 204491b0724..98cc4e5cc47 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/content/FleetControllerClusterTest.java @@ -10,13 +10,6 @@ import org.w3c.dom.Document; import static org.junit.Assert.assertEquals; -/** - * Created with IntelliJ IDEA. - * User: thomasg - * Date: 5/10/12 - * Time: 2:29 PM - * To change this template use File | Settings | File Templates. - */ public class FleetControllerClusterTest { ClusterControllerConfig parse(String xml) { Document doc = XML.getDocument(xml); @@ -69,4 +62,31 @@ public class FleetControllerClusterTest { FleetcontrollerConfig config = new FleetcontrollerConfig(builder); assertEquals(13, config.init_progress_time()); } + + @Test + public void min_node_ratio_per_group_tuning_config_is_propagated() { + FleetcontrollerConfig.Builder builder = new FleetcontrollerConfig.Builder(); + parse("<cluster id=\"storage\">\n" + + " <documents/>\n" + + " <tuning>\n" + + " <min-node-ratio-per-group>0.75</min-node-ratio-per-group>\n" + + " </tuning>\n" + + "</cluster>"). + getConfig(builder); + + FleetcontrollerConfig config = new FleetcontrollerConfig(builder); + assertEquals(0.75, config.min_node_ratio_per_group(), 0.01); + } + + @Test + public void min_node_ratio_per_group_is_implicitly_zero_when_omitted() { + FleetcontrollerConfig.Builder builder = new FleetcontrollerConfig.Builder(); + parse("<cluster id=\"storage\">\n" + + " <documents/>\n" + + "</cluster>"). 
+                getConfig(builder); + + FleetcontrollerConfig config = new FleetcontrollerConfig(builder); + assertEquals(0.0, config.min_node_ratio_per_group(), 0.01); + } } diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def index 919e9fc2209..0862beac64a 100644 --- a/configdefinitions/src/vespa/fleetcontroller.def +++ b/configdefinitions/src/vespa/fleetcontroller.def @@ -138,3 +138,11 @@ show_local_systemstates_in_event_log bool default=true ## The ideal number of distribution bits this system should have ideal_distribution_bits int default=16 + +## Minimum ratio of nodes that have to be available (i.e. not Down) in any +## hierarchic content cluster group. If a higher ratio than this is Down at +## any point, the remaining nodes in the group will be automatically marked +## as down. Group nodes will automatically be taken back up as soon as node +## availability has been restored above the given threshold. +## Default is 0, i.e. functionality is for all intents and purposes disabled. +min_node_ratio_per_group double default=0.0 |