From aa43ca229ae1deaf19a1f60460461ad824c49c0c Mon Sep 17 00:00:00 2001 From: HÃ¥kon Hallingstad Date: Tue, 19 Jan 2021 17:16:24 +0100 Subject: Support group maintenance [run-systemtest] --- .../core/ClusterStateGenerator.java | 40 ++++++-- .../clustercontroller/core/ContentCluster.java | 23 +++++ .../core/GroupAvailabilityCalculator.java | 107 ++++++++++++++++++--- .../clustercontroller/core/NodeStateReason.java | 2 +- .../core/ClusterStateGeneratorTest.java | 28 ++++++ .../core/GroupAvailabilityCalculatorTest.java | 41 ++++++++ .../java/com/yahoo/vdslib/state/NodeState.java | 2 + 7 files changed, 218 insertions(+), 25 deletions(-) diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java index 74f3cc276a5..f46badf2fd9 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java @@ -10,7 +10,6 @@ import com.yahoo.vdslib.state.State; import java.util.HashMap; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.TreeMap; /** @@ -26,7 +25,7 @@ public class ClusterStateGenerator { static class Params { public ContentCluster cluster; - public Map transitionTimes; + public Map transitionTimes = buildTransitionTimeMap(0, 0); public long currentTimeInMillis = 0; public int maxPrematureCrashes = 0; public int minStorageNodesUp = 1; @@ -40,7 +39,6 @@ public class ClusterStateGenerator { public int maxInitProgressTimeMs = 5000; Params() { - this.transitionTimes = buildTransitionTimeMap(0, 0); } // FIXME de-dupe @@ -269,18 +267,40 @@ public class ClusterStateGenerator { final GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder() .withMinNodeRatioPerGroup(params.minNodeRatioPerGroup) .withDistribution(params.cluster.getDistribution()) + .withNodesSafelySetToMaintenance(params.cluster.nodesSafelySetTo(State.MAINTENANCE)) .build(); - final Set nodesToTakeDown = calc.nodesThatShouldBeDown(workingState); + GroupAvailabilityCalculator.Result result = calc.calculate(workingState); - for (Integer idx : nodesToTakeDown) { - final Node node = storageNode(idx); - final NodeState newState = new NodeState(NodeType.STORAGE, State.DOWN); - newState.setDescription("group node availability below configured threshold"); - workingState.setNodeState(node, newState); - nodeStateReasons.put(node, NodeStateReason.GROUP_IS_DOWN); + for (int index : result.nodesThatShouldBeMaintained()) { + setNewNodeState(index, NodeType.STORAGE, State.MAINTENANCE, + "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE, + workingState, nodeStateReasons); + + setNewNodeState(index, NodeType.DISTRIBUTOR, State.DOWN, + "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE, + workingState, nodeStateReasons); + } + + for (int index : result.nodesThatShouldBeDown()) { + setNewNodeState(index, NodeType.STORAGE, State.DOWN, + "group node availability below configured threshold", NodeStateReason.GROUP_IS_DOWN, + workingState, nodeStateReasons); } } + private static void setNewNodeState(int index, + NodeType nodeType, + State newState, + String description, + NodeStateReason nodeStateReason, + ClusterState workingState, + Map nodeStateReasons) { + final Node node = new Node(nodeType, index); + final NodeState newNodeState = new NodeState(nodeType, newState).setDescription(description); + workingState.setNodeState(node, newNodeState); + nodeStateReasons.put(node, nodeStateReason); + } + private static Node storageNode(int index) { return new Node(NodeType.STORAGE, index); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java index 5e775116bf3..dad2b91d3cc 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java @@ -14,6 +14,9 @@ import com.yahoo.vespa.clustercontroller.core.status.statuspage.VdsClusterHtmlRe import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; import java.util.*; +import java.util.stream.Collectors; + +import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION; public class ContentCluster { @@ -188,6 +191,26 @@ public class ContentCluster { return nodeStateChangeChecker.evaluateTransition(node, clusterState, condition, oldState, newState); } + /** Returns the indices of the nodes that have been safely set to the given state by the Orchestrator (best guess). */ + public List nodesSafelySetTo(State state) { + switch (state) { + case MAINTENANCE: // Orchestrator's ALLOWED_TO_BE_DOWN + case DOWN: // Orchestrator's PERMANENTLY_DOWN + return clusterInfo.getStorageNodeInfo().stream() + .filter(storageNodeInfo -> { + NodeState userWantedState = storageNodeInfo.getUserWantedState(); + return userWantedState.getState() == state && + Objects.equals(userWantedState.getDescription(), ORCHESTRATOR_RESERVED_DESCRIPTION); + }) + .map(NodeInfo::getNodeIndex) + .collect(Collectors.toList()); + default: + // Note: There is no trace left if the Orchestrator set the state to UP, so that's handled + // like any other state: + return List.of(); + } + } + public void setMinStorageNodesUp(int minStorageNodesUp) { this.minStorageNodesUp = minStorageNodesUp; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java index 686ef0dee6c..0afad4a0efe 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java @@ -7,29 +7,37 @@ import com.yahoo.vdslib.distribution.Group; import com.yahoo.vdslib.distribution.GroupVisitor; import com.yahoo.vdslib.state.ClusterState; import com.yahoo.vdslib.state.Node; -import com.yahoo.vdslib.state.NodeState; import com.yahoo.vdslib.state.NodeType; -import com.yahoo.vdslib.state.State; -import java.util.Collections; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; class GroupAvailabilityCalculator { private final Distribution distribution; private final double minNodeRatioPerGroup; + private final int safeMaintenanceGroupThreshold; + private List nodesSafelySetToMaintenance; private GroupAvailabilityCalculator(Distribution distribution, - double minNodeRatioPerGroup) { + double minNodeRatioPerGroup, + int safeMaintenanceGroupThreshold, + List nodesSafelySetToMaintenance) { this.distribution = distribution; this.minNodeRatioPerGroup = minNodeRatioPerGroup; + this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold; + this.nodesSafelySetToMaintenance = nodesSafelySetToMaintenance; } public static class Builder { private Distribution distribution; private double minNodeRatioPerGroup = 1.0; + private int safeMaintenanceGroupThreshold = 2; + private List nodesSafelySetToMaintenance = new ArrayList<>(); Builder withDistribution(Distribution distribution) { this.distribution = distribution; @@ -39,8 +47,23 @@ class GroupAvailabilityCalculator { this.minNodeRatioPerGroup = minRatio; return this; } + /** + * If the number of nodes safely set to maintenance is at least this number, the remaining + * nodes in the group will be set to maintenance (storage nodes) or down (distributors). + * + *

This feature is disabled if safeMaintenanceGroupThreshold is 0 (not default).

+ */ + Builder withSafeMaintenanceGroupThreshold(int safeMaintenanceGroupThreshold) { + this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold; + return this; + } + Builder withNodesSafelySetToMaintenance(List nodesSafelySetToMaintenance) { + this.nodesSafelySetToMaintenance.addAll(nodesSafelySetToMaintenance); + return this; + } GroupAvailabilityCalculator build() { - return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup); + return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup, + safeMaintenanceGroupThreshold, nodesSafelySetToMaintenance); } } @@ -49,11 +72,18 @@ class GroupAvailabilityCalculator { } private class InsufficientAvailabilityGroupVisitor implements GroupVisitor { + private final Set implicitlyMaintained = new HashSet<>(); private final Set implicitlyDown = new HashSet<>(); private final ClusterState clusterState; + private final Set nodesSafelySetToMaintenance; + private final int safeMaintenanceGroupThreshold; - public InsufficientAvailabilityGroupVisitor(ClusterState clusterState) { + public InsufficientAvailabilityGroupVisitor(ClusterState clusterState, + List nodesSafelySetToMaintenance, + int safeMaintenanceGroupThreshold) { this.clusterState = clusterState; + this.nodesSafelySetToMaintenance = Set.copyOf(nodesSafelySetToMaintenance); + this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold; } private boolean nodeIsAvailableInState(final int index, final String states) { @@ -75,6 +105,14 @@ class GroupAvailabilityCalculator { return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "ui")); } + private Stream candidateNodesForSettingMaintenance(Group g) { + // Most states should be set in maintenance, e.g. retirement may take a long time, + // so force maintenance to allow upgrades. + return g.getNodes().stream() + // "m" is NOT included since that would be a no-op. + .filter(n -> nodeIsAvailableInState(n.index(), "uird")); + } + private double computeGroupAvailability(Group g) { // TODO also look at distributors final long availableNodes = availableNodesIn(g).count(); @@ -83,22 +121,43 @@ class GroupAvailabilityCalculator { return availableNodes / (double)g.getNodes().size(); } + private int computeNodesSafelySetToMaintenance(Group group) { + Set nodesInGroupSafelySetToMaintenance = group.getNodes().stream() + .filter(configuredNode -> nodesSafelySetToMaintenance.contains(configuredNode.index())) + .collect(Collectors.toSet()); + + return nodesInGroupSafelySetToMaintenance.size(); + } + private void markAllAvailableGroupNodeIndicesAsDown(Group group) { candidateNodesForSettingDown(group).forEach(n -> implicitlyDown.add(n.index())); } + private void markAllAvailableGroupNodeIndicesAsMaintained(Group group) { + candidateNodesForSettingMaintenance(group).forEach(n -> implicitlyMaintained.add(n.index())); + } + @Override public boolean visitGroup(Group group) { if (group.isLeafGroup()) { - if (computeGroupAvailability(group) < minNodeRatioPerGroup) { + if (safeMaintenanceGroupThreshold > 0 && + computeNodesSafelySetToMaintenance(group) >= safeMaintenanceGroupThreshold) { + markAllAvailableGroupNodeIndicesAsMaintained(group); + } else if (computeGroupAvailability(group) < minNodeRatioPerGroup) { markAllAvailableGroupNodeIndicesAsDown(group); } } return true; } - Set implicitlyDownNodeIndices() { - return implicitlyDown; + Result result() { + var intersection = new HashSet<>(implicitlyMaintained); + intersection.retainAll(implicitlyDown); + if (intersection.size() > 0) { + throw new IllegalStateException("Nodes implicitly both maintenance and down: " + intersection); + } + + return new Result(implicitlyMaintained, implicitlyDown); } } @@ -106,17 +165,37 @@ class GroupAvailabilityCalculator { return root.isLeafGroup(); } - public Set nodesThatShouldBeDown(ClusterState state) { + public static class Result { + private final Set shouldBeMaintained; + private final Set shouldBeDown; + + public Result() { this(Set.of(), Set.of()); } + + public Result(Set shouldBeMaintained, Set shouldBeDown) { + this.shouldBeMaintained = Set.copyOf(shouldBeMaintained); + this.shouldBeDown = Set.copyOf(shouldBeDown); + } + + public Set nodesThatShouldBeMaintained() { return shouldBeMaintained; } + public Set nodesThatShouldBeDown() { return shouldBeDown; } + } + + public Result calculate(ClusterState state) { if (distribution == null) { // FIXME: for tests that don't set distribution properly! - return Collections.emptySet(); + return new Result(); } if (isFlatCluster(distribution.getRootGroup())) { // Implicit group takedown only applies to hierarchic cluster setups. - return new HashSet<>(); + return new Result(); } - InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(state); + InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor( + state, nodesSafelySetToMaintenance, safeMaintenanceGroupThreshold); distribution.visitGroups(visitor); - return visitor.implicitlyDownNodeIndices(); + return visitor.result(); + } + + public Set nodesThatShouldBeDown(ClusterState state) { + return calculate(state).nodesThatShouldBeDown(); } } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java index 3f550724cef..77ab8539219 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java @@ -7,7 +7,7 @@ public enum NodeStateReason { NODE_TOO_UNSTABLE, WITHIN_MAINTENANCE_GRACE_PERIOD, NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD, - FORCED_INTO_MAINTENANCE, + GROUP_IN_MAINTENANCE, GROUP_IS_DOWN, MAY_HAVE_MERGES_PENDING } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java index 08329c874b5..0ca8b010191 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java @@ -704,6 +704,34 @@ public class ClusterStateGeneratorTest { assertThat(state.toString(), equalTo("distributor:9 storage:9 .3.s:m .4.s:d .5.s:d")); } + @Test + public void group_nodes_are_marked_maintenance_if_group_availability_too_low_by_orchestrator() { + final ClusterFixture fixture = ClusterFixture + .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3)) + .bringEntireClusterUp() + .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION) + .proposeStorageNodeWantedState(5, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION); + final ClusterStateGenerator.Params params = fixture.generatorParams(); + + // Both node 4 & 5 are in maintenance by Orchestrator, which will force the other nodes in the + // group to maintenance (node 3). + final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params); + assertThat(state.toString(), equalTo("distributor:9 .3.s:d storage:9 .3.s:m .4.s:m .5.s:m")); + } + + @Test + public void group_nodes_are_not_marked_maintenance_if_group_availability_high_by_orchestrator() { + final ClusterFixture fixture = ClusterFixture + .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3)) + .bringEntireClusterUp() + .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION); + final ClusterStateGenerator.Params params = fixture.generatorParams(); + + // Node 4 is in maintenance by Orchestrator, which is not sufficient to force group into maintenance. + final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params); + assertThat(state.toString(), equalTo("distributor:9 storage:9 .4.s:m")); + } + /** * Cluster-wide distribution bit count cannot be higher than the lowest split bit * count reported by the set of storage nodes. This is because the distribution bit diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java index 07bd18c3667..f77a68e0fda 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java @@ -7,6 +7,7 @@ import org.junit.Test; import java.text.ParseException; import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Set; import static org.hamcrest.CoreMatchers.equalTo; @@ -182,4 +183,44 @@ public class GroupAvailabilityCalculatorTest { "distributor:8 storage:8 .0.s:d .1.s:d .2.s:d .4.s:d .5.s:d .6.s:d")), equalTo(emptySet())); } + @Test + public void one_safe_maintenance_node_does_not_take_down_group() { + // 2 groups of 5 nodes each. Set node #5 safely in maintenance (1st node in last group). + // Since the minimum number of nodes that can safely be set to maintenance before taking + // the whole group down is 2, the whole group should NOT be taken down. + + DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5); + GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder() + .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder)) + .withMinNodeRatioPerGroup(0) + .withSafeMaintenanceGroupThreshold(2) + .withNodesSafelySetToMaintenance(List.of(5)) + .build(); + + GroupAvailabilityCalculator.Result result = calculator + .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d")); + assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices())); + assertThat(result.nodesThatShouldBeDown(), equalTo(indices())); + } + + @Test + public void two_safe_maintenance_nodes_takes_down_group() { + // 2 groups of 5 nodes each. Set nodes #5 and #6 safely in maintenance (1st and 2nd nodes + // in last group, respectively). Since the minimum number of nodes that can safely be set to + // maintenance before taking the whole group down is 2, the whole group should be taken down. + + DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5); + GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder() + .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder)) + .withMinNodeRatioPerGroup(0) + .withSafeMaintenanceGroupThreshold(2) + .withNodesSafelySetToMaintenance(List.of(5, 6)) + .build(); + + GroupAvailabilityCalculator.Result result = calculator + .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d")); + assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices(7, 8, 9))); + assertThat(result.nodesThatShouldBeDown(), equalTo(indices())); + } + } diff --git a/vdslib/src/main/java/com/yahoo/vdslib/state/NodeState.java b/vdslib/src/main/java/com/yahoo/vdslib/state/NodeState.java index 8c38cd07a34..28913d94560 100644 --- a/vdslib/src/main/java/com/yahoo/vdslib/state/NodeState.java +++ b/vdslib/src/main/java/com/yahoo/vdslib/state/NodeState.java @@ -18,6 +18,8 @@ import java.util.StringTokenizer; */ public class NodeState implements Cloneable { + public static final String ORCHESTRATOR_RESERVED_DESCRIPTION = "Orchestrator"; + private final NodeType type; private State state = State.UP; private String description = ""; -- cgit v1.2.3