author    Håkon Hallingstad <hakon@verizonmedia.com>  2021-01-19 17:16:24 +0100
committer Håkon Hallingstad <hakon@verizonmedia.com>  2021-01-19 17:16:24 +0100
commit    aa43ca229ae1deaf19a1f60460461ad824c49c0c (patch)
tree      77af46c141192a18908660e86b8435b6ae6fc00c /clustercontroller-core
parent    a5f0f0f06c53386f32307ccef3b68c9e4dc3ad26 (diff)
Support group maintenance [run-systemtest]
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java            |  40
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java                  |  23
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java     | 107
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java                 |   2
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java       |  28
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java |  41
6 files changed, 216 insertions(+), 25 deletions(-)
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
index 74f3cc276a5..f46badf2fd9 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
@@ -10,7 +10,6 @@ import com.yahoo.vdslib.state.State;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
-import java.util.Set;
import java.util.TreeMap;
/**
@@ -26,7 +25,7 @@ public class ClusterStateGenerator {
static class Params {
public ContentCluster cluster;
- public Map<NodeType, Integer> transitionTimes;
+ public Map<NodeType, Integer> transitionTimes = buildTransitionTimeMap(0, 0);
public long currentTimeInMillis = 0;
public int maxPrematureCrashes = 0;
public int minStorageNodesUp = 1;
@@ -40,7 +39,6 @@ public class ClusterStateGenerator {
public int maxInitProgressTimeMs = 5000;
Params() {
- this.transitionTimes = buildTransitionTimeMap(0, 0);
}
// FIXME de-dupe
@@ -269,18 +267,40 @@ public class ClusterStateGenerator {
final GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder()
.withMinNodeRatioPerGroup(params.minNodeRatioPerGroup)
.withDistribution(params.cluster.getDistribution())
+ .withNodesSafelySetToMaintenance(params.cluster.nodesSafelySetTo(State.MAINTENANCE))
.build();
- final Set<Integer> nodesToTakeDown = calc.nodesThatShouldBeDown(workingState);
+        GroupAvailabilityCalculator.Result result = calc.calculate(workingState);
- for (Integer idx : nodesToTakeDown) {
- final Node node = storageNode(idx);
- final NodeState newState = new NodeState(NodeType.STORAGE, State.DOWN);
- newState.setDescription("group node availability below configured threshold");
- workingState.setNodeState(node, newState);
- nodeStateReasons.put(node, NodeStateReason.GROUP_IS_DOWN);
+ for (int index : result.nodesThatShouldBeMaintained()) {
+ setNewNodeState(index, NodeType.STORAGE, State.MAINTENANCE,
+ "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE,
+ workingState, nodeStateReasons);
+
+ setNewNodeState(index, NodeType.DISTRIBUTOR, State.DOWN,
+ "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE,
+ workingState, nodeStateReasons);
+ }
+
+ for (int index : result.nodesThatShouldBeDown()) {
+ setNewNodeState(index, NodeType.STORAGE, State.DOWN,
+ "group node availability below configured threshold", NodeStateReason.GROUP_IS_DOWN,
+ workingState, nodeStateReasons);
}
}
+ private static void setNewNodeState(int index,
+ NodeType nodeType,
+ State newState,
+ String description,
+ NodeStateReason nodeStateReason,
+ ClusterState workingState,
+ Map<Node, NodeStateReason> nodeStateReasons) {
+ final Node node = new Node(nodeType, index);
+ final NodeState newNodeState = new NodeState(nodeType, newState).setDescription(description);
+ workingState.setNodeState(node, newNodeState);
+ nodeStateReasons.put(node, nodeStateReason);
+ }
+
private static Node storageNode(int index) {
return new Node(NodeType.STORAGE, index);
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 5e775116bf3..dad2b91d3cc 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -14,6 +14,9 @@ import com.yahoo.vespa.clustercontroller.core.status.statuspage.VdsClusterHtmlRe
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
import java.util.*;
+import java.util.stream.Collectors;
+
+import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION;
public class ContentCluster {
@@ -188,6 +191,26 @@ public class ContentCluster {
return nodeStateChangeChecker.evaluateTransition(node, clusterState, condition, oldState, newState);
}
+ /** Returns the indices of the nodes that have been safely set to the given state by the Orchestrator (best guess). */
+ public List<Integer> nodesSafelySetTo(State state) {
+ switch (state) {
+ case MAINTENANCE: // Orchestrator's ALLOWED_TO_BE_DOWN
+ case DOWN: // Orchestrator's PERMANENTLY_DOWN
+ return clusterInfo.getStorageNodeInfo().stream()
+ .filter(storageNodeInfo -> {
+ NodeState userWantedState = storageNodeInfo.getUserWantedState();
+ return userWantedState.getState() == state &&
+ Objects.equals(userWantedState.getDescription(), ORCHESTRATOR_RESERVED_DESCRIPTION);
+ })
+ .map(NodeInfo::getNodeIndex)
+ .collect(Collectors.toList());
+ default:
+ // Note: There is no trace left if the Orchestrator set the state to UP, so that's handled
+ // like any other state:
+ return List.of();
+ }
+ }
+
public void setMinStorageNodesUp(int minStorageNodesUp) {
this.minStorageNodesUp = minStorageNodesUp;
}
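
For reference, the Orchestrator marker that nodesSafelySetTo() keys on is the reserved description on a node's user wanted state. A minimal sketch of a wanted state that would be counted, assuming the same NodeState/NodeType/State imports used elsewhere in this diff (how the wanted state ends up on StorageNodeInfo is outside this change):

    NodeState wanted = new NodeState(NodeType.STORAGE, State.MAINTENANCE)
            .setDescription(NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
    // With this as the node's user wanted state, nodesSafelySetTo(State.MAINTENANCE) includes
    // the node's index; the same state with any other description is ignored, and an UP wanted
    // state leaves no trace at all (see the comment in the default branch above).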
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
index 686ef0dee6c..0afad4a0efe 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
@@ -7,29 +7,37 @@ import com.yahoo.vdslib.distribution.Group;
import com.yahoo.vdslib.distribution.GroupVisitor;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
-import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
-import com.yahoo.vdslib.state.State;
-import java.util.Collections;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
+import java.util.stream.Collectors;
import java.util.stream.Stream;
class GroupAvailabilityCalculator {
private final Distribution distribution;
private final double minNodeRatioPerGroup;
+ private final int safeMaintenanceGroupThreshold;
+ private List<Integer> nodesSafelySetToMaintenance;
private GroupAvailabilityCalculator(Distribution distribution,
- double minNodeRatioPerGroup) {
+ double minNodeRatioPerGroup,
+ int safeMaintenanceGroupThreshold,
+ List<Integer> nodesSafelySetToMaintenance) {
this.distribution = distribution;
this.minNodeRatioPerGroup = minNodeRatioPerGroup;
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
+ this.nodesSafelySetToMaintenance = nodesSafelySetToMaintenance;
}
public static class Builder {
private Distribution distribution;
private double minNodeRatioPerGroup = 1.0;
+ private int safeMaintenanceGroupThreshold = 2;
+ private List<Integer> nodesSafelySetToMaintenance = new ArrayList<>();
Builder withDistribution(Distribution distribution) {
this.distribution = distribution;
@@ -39,8 +47,23 @@ class GroupAvailabilityCalculator {
this.minNodeRatioPerGroup = minRatio;
return this;
}
+ /**
+ * If the number of nodes safely set to maintenance is at least this number, the remaining
+ * nodes in the group will be set to maintenance (storage nodes) or down (distributors).
+ *
+     * <p>This feature is disabled if safeMaintenanceGroupThreshold is 0 (the default is 2).</p>
+ */
+ Builder withSafeMaintenanceGroupThreshold(int safeMaintenanceGroupThreshold) {
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
+ return this;
+ }
+ Builder withNodesSafelySetToMaintenance(List<Integer> nodesSafelySetToMaintenance) {
+ this.nodesSafelySetToMaintenance.addAll(nodesSafelySetToMaintenance);
+ return this;
+ }
GroupAvailabilityCalculator build() {
- return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup);
+ return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup,
+ safeMaintenanceGroupThreshold, nodesSafelySetToMaintenance);
}
}
@@ -49,11 +72,18 @@ class GroupAvailabilityCalculator {
}
private class InsufficientAvailabilityGroupVisitor implements GroupVisitor {
+ private final Set<Integer> implicitlyMaintained = new HashSet<>();
private final Set<Integer> implicitlyDown = new HashSet<>();
private final ClusterState clusterState;
+ private final Set<Integer> nodesSafelySetToMaintenance;
+ private final int safeMaintenanceGroupThreshold;
- public InsufficientAvailabilityGroupVisitor(ClusterState clusterState) {
+ public InsufficientAvailabilityGroupVisitor(ClusterState clusterState,
+ List<Integer> nodesSafelySetToMaintenance,
+ int safeMaintenanceGroupThreshold) {
this.clusterState = clusterState;
+ this.nodesSafelySetToMaintenance = Set.copyOf(nodesSafelySetToMaintenance);
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
}
private boolean nodeIsAvailableInState(final int index, final String states) {
@@ -75,6 +105,14 @@ class GroupAvailabilityCalculator {
return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "ui"));
}
+ private Stream<ConfiguredNode> candidateNodesForSettingMaintenance(Group g) {
+            // Nodes in most states are candidates for being set to maintenance: e.g. retirement may
+            // take a long time, so forcing maintenance allows upgrades to proceed.
+ return g.getNodes().stream()
+ // "m" is NOT included since that would be a no-op.
+ .filter(n -> nodeIsAvailableInState(n.index(), "uird"));
+ }
+
private double computeGroupAvailability(Group g) {
// TODO also look at distributors
final long availableNodes = availableNodesIn(g).count();
@@ -83,22 +121,43 @@ class GroupAvailabilityCalculator {
return availableNodes / (double)g.getNodes().size();
}
+ private int computeNodesSafelySetToMaintenance(Group group) {
+ Set<ConfiguredNode> nodesInGroupSafelySetToMaintenance = group.getNodes().stream()
+ .filter(configuredNode -> nodesSafelySetToMaintenance.contains(configuredNode.index()))
+ .collect(Collectors.toSet());
+
+ return nodesInGroupSafelySetToMaintenance.size();
+ }
+
private void markAllAvailableGroupNodeIndicesAsDown(Group group) {
candidateNodesForSettingDown(group).forEach(n -> implicitlyDown.add(n.index()));
}
+ private void markAllAvailableGroupNodeIndicesAsMaintained(Group group) {
+ candidateNodesForSettingMaintenance(group).forEach(n -> implicitlyMaintained.add(n.index()));
+ }
+
@Override
public boolean visitGroup(Group group) {
if (group.isLeafGroup()) {
- if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
+ if (safeMaintenanceGroupThreshold > 0 &&
+ computeNodesSafelySetToMaintenance(group) >= safeMaintenanceGroupThreshold) {
+ markAllAvailableGroupNodeIndicesAsMaintained(group);
+ } else if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
markAllAvailableGroupNodeIndicesAsDown(group);
}
}
return true;
}
- Set<Integer> implicitlyDownNodeIndices() {
- return implicitlyDown;
+ Result result() {
+ var intersection = new HashSet<>(implicitlyMaintained);
+ intersection.retainAll(implicitlyDown);
+ if (intersection.size() > 0) {
+ throw new IllegalStateException("Nodes implicitly both maintenance and down: " + intersection);
+ }
+
+ return new Result(implicitlyMaintained, implicitlyDown);
}
}
@@ -106,17 +165,37 @@ class GroupAvailabilityCalculator {
return root.isLeafGroup();
}
- public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
+ public static class Result {
+ private final Set<Integer> shouldBeMaintained;
+ private final Set<Integer> shouldBeDown;
+
+ public Result() { this(Set.of(), Set.of()); }
+
+ public Result(Set<Integer> shouldBeMaintained, Set<Integer> shouldBeDown) {
+ this.shouldBeMaintained = Set.copyOf(shouldBeMaintained);
+ this.shouldBeDown = Set.copyOf(shouldBeDown);
+ }
+
+ public Set<Integer> nodesThatShouldBeMaintained() { return shouldBeMaintained; }
+ public Set<Integer> nodesThatShouldBeDown() { return shouldBeDown; }
+ }
+
+ public Result calculate(ClusterState state) {
if (distribution == null) { // FIXME: for tests that don't set distribution properly!
- return Collections.emptySet();
+ return new Result();
}
if (isFlatCluster(distribution.getRootGroup())) {
// Implicit group takedown only applies to hierarchic cluster setups.
- return new HashSet<>();
+ return new Result();
}
- InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(state);
+ InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(
+ state, nodesSafelySetToMaintenance, safeMaintenanceGroupThreshold);
distribution.visitGroups(visitor);
- return visitor.implicitlyDownNodeIndices();
+ return visitor.result();
+ }
+
+ public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
+ return calculate(state).nodesThatShouldBeDown();
}
}
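
Putting the new builder options and Result type together, roughly as the ClusterStateGenerator hunk above does. This is a sketch only: it assumes it runs inside the com.yahoo.vespa.clustercontroller.core package (the calculator and its builder methods are package-private), with java.util.Set imported and a ContentCluster `cluster` and a ClusterState `workingState` in scope:

    GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder()
            .withMinNodeRatioPerGroup(0.6)                  // illustrative ratio
            .withSafeMaintenanceGroupThreshold(2)           // matches the builder default
            .withDistribution(cluster.getDistribution())
            .withNodesSafelySetToMaintenance(cluster.nodesSafelySetTo(State.MAINTENANCE))
            .build();
    GroupAvailabilityCalculator.Result result = calc.calculate(workingState);
    // Storage nodes to force into maintenance (their distributors are taken down):
    Set<Integer> maintained = result.nodesThatShouldBeMaintained();
    // Storage nodes to take down via the pre-existing min-node-ratio rule:
    Set<Integer> down = result.nodesThatShouldBeDown();
    // The old entry point remains and now delegates to calculate():
    Set<Integer> sameDown = calc.nodesThatShouldBeDown(workingState);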
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
index 3f550724cef..77ab8539219 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
@@ -7,7 +7,7 @@ public enum NodeStateReason {
NODE_TOO_UNSTABLE,
WITHIN_MAINTENANCE_GRACE_PERIOD,
NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD,
- FORCED_INTO_MAINTENANCE,
+ GROUP_IN_MAINTENANCE,
GROUP_IS_DOWN,
MAY_HAVE_MERGES_PENDING
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
index 08329c874b5..0ca8b010191 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
@@ -704,6 +704,34 @@ public class ClusterStateGeneratorTest {
assertThat(state.toString(), equalTo("distributor:9 storage:9 .3.s:m .4.s:d .5.s:d"));
}
+ @Test
+ public void group_nodes_are_marked_maintenance_if_group_availability_too_low_by_orchestrator() {
+ final ClusterFixture fixture = ClusterFixture
+ .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3))
+ .bringEntireClusterUp()
+ .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION)
+ .proposeStorageNodeWantedState(5, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
+ final ClusterStateGenerator.Params params = fixture.generatorParams();
+
+        // Both nodes 4 and 5 are in maintenance by the Orchestrator, which forces the remaining node
+        // in the group (node 3) into maintenance (and its distributor down).
+ final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params);
+ assertThat(state.toString(), equalTo("distributor:9 .3.s:d storage:9 .3.s:m .4.s:m .5.s:m"));
+ }
+
+ @Test
+ public void group_nodes_are_not_marked_maintenance_if_group_availability_high_by_orchestrator() {
+ final ClusterFixture fixture = ClusterFixture
+ .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3))
+ .bringEntireClusterUp()
+ .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
+ final ClusterStateGenerator.Params params = fixture.generatorParams();
+
+        // Node 4 is in maintenance by the Orchestrator, which is not sufficient to force the group into maintenance.
+ final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params);
+ assertThat(state.toString(), equalTo("distributor:9 storage:9 .4.s:m"));
+ }
+
/**
* Cluster-wide distribution bit count cannot be higher than the lowest split bit
* count reported by the set of storage nodes. This is because the distribution bit
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
index 07bd18c3667..f77a68e0fda 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
@@ -7,6 +7,7 @@ import org.junit.Test;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import static org.hamcrest.CoreMatchers.equalTo;
@@ -182,4 +183,44 @@ public class GroupAvailabilityCalculatorTest {
"distributor:8 storage:8 .0.s:d .1.s:d .2.s:d .4.s:d .5.s:d .6.s:d")), equalTo(emptySet()));
}
+ @Test
+    public void one_safe_maintenance_node_does_not_set_group_in_maintenance() {
+        // 2 groups of 5 nodes each. Set node #5 safely in maintenance (1st node in the last group).
+        // Since at least 2 nodes must be safely set to maintenance before the rest of the group is
+        // also put into maintenance, the rest of the group should NOT be put into maintenance.
+
+ DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5);
+ GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder))
+ .withMinNodeRatioPerGroup(0)
+ .withSafeMaintenanceGroupThreshold(2)
+ .withNodesSafelySetToMaintenance(List.of(5))
+ .build();
+
+ GroupAvailabilityCalculator.Result result = calculator
+ .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d"));
+ assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices()));
+ assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));
+ }
+
+ @Test
+    public void two_safe_maintenance_nodes_set_group_in_maintenance() {
+        // 2 groups of 5 nodes each. Set nodes #5 and #6 safely in maintenance (1st and 2nd nodes
+        // in the last group, respectively). Since at least 2 nodes must be safely set to maintenance
+        // before the rest of the group is also put into maintenance, the remaining nodes in the group
+        // should be put into maintenance.
+
+ DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5);
+ GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder))
+ .withMinNodeRatioPerGroup(0)
+ .withSafeMaintenanceGroupThreshold(2)
+ .withNodesSafelySetToMaintenance(List.of(5, 6))
+ .build();
+
+ GroupAvailabilityCalculator.Result result = calculator
+ .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d"));
+ assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices(7, 8, 9)));
+ assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));
+ }
+
}
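
The javadoc on withSafeMaintenanceGroupThreshold() notes that a threshold of 0 disables the feature. A sketch of that case in the style of the tests above, reusing this test class's DistributionBuilder, clusterState(...) and indices(...) helpers (names as in this file, values illustrative):

    GroupAvailabilityCalculator disabled = GroupAvailabilityCalculator.builder()
            .withDistribution(DistributionBuilder.forHierarchicCluster(
                    DistributionBuilder.withGroups(2).eachWithNodeCount(5)))
            .withMinNodeRatioPerGroup(0)
            .withSafeMaintenanceGroupThreshold(0)           // 0 disables the safe-maintenance rule
            .withNodesSafelySetToMaintenance(List.of(5, 6))
            .build();
    GroupAvailabilityCalculator.Result result = disabled
            .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m"));
    // Nothing is forced into maintenance even though two nodes in the group are safely down,
    // and with a min node ratio of 0 nothing is taken down either:
    assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices()));
    assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));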