author    Håkon Hallingstad <hakon@verizonmedia.com>  2021-01-19 17:16:24 +0100
committer Håkon Hallingstad <hakon@verizonmedia.com>  2021-01-19 17:16:24 +0100
commit    aa43ca229ae1deaf19a1f60460461ad824c49c0c (patch)
tree      77af46c141192a18908660e86b8435b6ae6fc00c /clustercontroller-core
parent    a5f0f0f06c53386f32307ccef3b68c9e4dc3ad26 (diff)
Support group maintenance [run-systemtest]
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java            |  40
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java                  |  23
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java     | 107
-rw-r--r--  clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java                 |   2
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java       |  28
-rw-r--r--  clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java |  41
6 files changed, 216 insertions(+), 25 deletions(-)
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
index 74f3cc276a5..f46badf2fd9 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGenerator.java
@@ -10,7 +10,6 @@ import com.yahoo.vdslib.state.State;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
-import java.util.Set;
import java.util.TreeMap;
/**
@@ -26,7 +25,7 @@ public class ClusterStateGenerator {
static class Params {
public ContentCluster cluster;
- public Map<NodeType, Integer> transitionTimes;
+ public Map<NodeType, Integer> transitionTimes = buildTransitionTimeMap(0, 0);
public long currentTimeInMillis = 0;
public int maxPrematureCrashes = 0;
public int minStorageNodesUp = 1;
@@ -40,7 +39,6 @@ public class ClusterStateGenerator {
public int maxInitProgressTimeMs = 5000;
Params() {
- this.transitionTimes = buildTransitionTimeMap(0, 0);
}
// FIXME de-dupe
@@ -269,18 +267,40 @@ public class ClusterStateGenerator {
final GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder()
.withMinNodeRatioPerGroup(params.minNodeRatioPerGroup)
.withDistribution(params.cluster.getDistribution())
+ .withNodesSafelySetToMaintenance(params.cluster.nodesSafelySetTo(State.MAINTENANCE))
.build();
- final Set<Integer> nodesToTakeDown = calc.nodesThatShouldBeDown(workingState);
+        GroupAvailabilityCalculator.Result result = calc.calculate(workingState);
- for (Integer idx : nodesToTakeDown) {
- final Node node = storageNode(idx);
- final NodeState newState = new NodeState(NodeType.STORAGE, State.DOWN);
- newState.setDescription("group node availability below configured threshold");
- workingState.setNodeState(node, newState);
- nodeStateReasons.put(node, NodeStateReason.GROUP_IS_DOWN);
+ for (int index : result.nodesThatShouldBeMaintained()) {
+ setNewNodeState(index, NodeType.STORAGE, State.MAINTENANCE,
+ "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE,
+ workingState, nodeStateReasons);
+
+ setNewNodeState(index, NodeType.DISTRIBUTOR, State.DOWN,
+ "too many safe maintenance nodes in group", NodeStateReason.GROUP_IN_MAINTENANCE,
+ workingState, nodeStateReasons);
+ }
+
+ for (int index : result.nodesThatShouldBeDown()) {
+ setNewNodeState(index, NodeType.STORAGE, State.DOWN,
+ "group node availability below configured threshold", NodeStateReason.GROUP_IS_DOWN,
+ workingState, nodeStateReasons);
}
}
+ private static void setNewNodeState(int index,
+ NodeType nodeType,
+ State newState,
+ String description,
+ NodeStateReason nodeStateReason,
+ ClusterState workingState,
+ Map<Node, NodeStateReason> nodeStateReasons) {
+ final Node node = new Node(nodeType, index);
+ final NodeState newNodeState = new NodeState(nodeType, newState).setDescription(description);
+ workingState.setNodeState(node, newNodeState);
+ nodeStateReasons.put(node, nodeStateReason);
+ }
+
private static Node storageNode(int index) {
return new Node(NodeType.STORAGE, index);
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 5e775116bf3..dad2b91d3cc 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -14,6 +14,9 @@ import com.yahoo.vespa.clustercontroller.core.status.statuspage.VdsClusterHtmlRe
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
import java.util.*;
+import java.util.stream.Collectors;
+
+import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION;
public class ContentCluster {
@@ -188,6 +191,26 @@ public class ContentCluster {
return nodeStateChangeChecker.evaluateTransition(node, clusterState, condition, oldState, newState);
}
+ /** Returns the indices of the nodes that have been safely set to the given state by the Orchestrator (best guess). */
+ public List<Integer> nodesSafelySetTo(State state) {
+ switch (state) {
+ case MAINTENANCE: // Orchestrator's ALLOWED_TO_BE_DOWN
+ case DOWN: // Orchestrator's PERMANENTLY_DOWN
+ return clusterInfo.getStorageNodeInfo().stream()
+ .filter(storageNodeInfo -> {
+ NodeState userWantedState = storageNodeInfo.getUserWantedState();
+ return userWantedState.getState() == state &&
+ Objects.equals(userWantedState.getDescription(), ORCHESTRATOR_RESERVED_DESCRIPTION);
+ })
+ .map(NodeInfo::getNodeIndex)
+ .collect(Collectors.toList());
+ default:
+ // Note: There is no trace left if the Orchestrator set the state to UP, so that's handled
+ // like any other state:
+ return List.of();
+ }
+ }
+
public void setMinStorageNodesUp(int minStorageNodesUp) {
this.minStorageNodesUp = minStorageNodesUp;
}
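
For reference, the Orchestrator marker that nodesSafelySetTo() keys on is the reserved description on a node's user wanted state. A minimal sketch of a wanted state that would be counted, assuming the same NodeState/NodeType/State imports used elsewhere in this diff (how the wanted state ends up on StorageNodeInfo is outside this change):

    NodeState wanted = new NodeState(NodeType.STORAGE, State.MAINTENANCE)
            .setDescription(NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
    // With this as the node's user wanted state, nodesSafelySetTo(State.MAINTENANCE) includes
    // the node's index; the same state with any other description is ignored, and an UP wanted
    // state leaves no trace at all (see the comment in the default branch above).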
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
index 686ef0dee6c..0afad4a0efe 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculator.java
@@ -7,29 +7,37 @@ import com.yahoo.vdslib.distribution.Group;
import com.yahoo.vdslib.distribution.GroupVisitor;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
-import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
-import com.yahoo.vdslib.state.State;
-import java.util.Collections;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
+import java.util.stream.Collectors;
import java.util.stream.Stream;
class GroupAvailabilityCalculator {
private final Distribution distribution;
private final double minNodeRatioPerGroup;
+ private final int safeMaintenanceGroupThreshold;
+ private List<Integer> nodesSafelySetToMaintenance;
private GroupAvailabilityCalculator(Distribution distribution,
- double minNodeRatioPerGroup) {
+ double minNodeRatioPerGroup,
+ int safeMaintenanceGroupThreshold,
+ List<Integer> nodesSafelySetToMaintenance) {
this.distribution = distribution;
this.minNodeRatioPerGroup = minNodeRatioPerGroup;
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
+ this.nodesSafelySetToMaintenance = nodesSafelySetToMaintenance;
}
public static class Builder {
private Distribution distribution;
private double minNodeRatioPerGroup = 1.0;
+ private int safeMaintenanceGroupThreshold = 2;
+ private List<Integer> nodesSafelySetToMaintenance = new ArrayList<>();
Builder withDistribution(Distribution distribution) {
this.distribution = distribution;
@@ -39,8 +47,23 @@ class GroupAvailabilityCalculator {
this.minNodeRatioPerGroup = minRatio;
return this;
}
+ /**
+ * If the number of nodes safely set to maintenance is at least this number, the remaining
+ * nodes in the group will be set to maintenance (storage nodes) or down (distributors).
+ *
+     * <p>This feature is disabled if safeMaintenanceGroupThreshold is 0 (the default is 2).</p>
+ */
+ Builder withSafeMaintenanceGroupThreshold(int safeMaintenanceGroupThreshold) {
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
+ return this;
+ }
+ Builder withNodesSafelySetToMaintenance(List<Integer> nodesSafelySetToMaintenance) {
+ this.nodesSafelySetToMaintenance.addAll(nodesSafelySetToMaintenance);
+ return this;
+ }
GroupAvailabilityCalculator build() {
- return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup);
+ return new GroupAvailabilityCalculator(distribution, minNodeRatioPerGroup,
+ safeMaintenanceGroupThreshold, nodesSafelySetToMaintenance);
}
}
@@ -49,11 +72,18 @@ class GroupAvailabilityCalculator {
}
private class InsufficientAvailabilityGroupVisitor implements GroupVisitor {
+ private final Set<Integer> implicitlyMaintained = new HashSet<>();
private final Set<Integer> implicitlyDown = new HashSet<>();
private final ClusterState clusterState;
+ private final Set<Integer> nodesSafelySetToMaintenance;
+ private final int safeMaintenanceGroupThreshold;
- public InsufficientAvailabilityGroupVisitor(ClusterState clusterState) {
+ public InsufficientAvailabilityGroupVisitor(ClusterState clusterState,
+ List<Integer> nodesSafelySetToMaintenance,
+ int safeMaintenanceGroupThreshold) {
this.clusterState = clusterState;
+ this.nodesSafelySetToMaintenance = Set.copyOf(nodesSafelySetToMaintenance);
+ this.safeMaintenanceGroupThreshold = safeMaintenanceGroupThreshold;
}
private boolean nodeIsAvailableInState(final int index, final String states) {
@@ -75,6 +105,14 @@ class GroupAvailabilityCalculator {
return g.getNodes().stream().filter(n -> nodeIsAvailableInState(n.index(), "ui"));
}
+ private Stream<ConfiguredNode> candidateNodesForSettingMaintenance(Group g) {
+            // Nodes in most states are candidates for being set to maintenance: e.g. retirement may
+            // take a long time, so forcing maintenance allows upgrades to proceed.
+ return g.getNodes().stream()
+ // "m" is NOT included since that would be a no-op.
+ .filter(n -> nodeIsAvailableInState(n.index(), "uird"));
+ }
+
private double computeGroupAvailability(Group g) {
// TODO also look at distributors
final long availableNodes = availableNodesIn(g).count();
@@ -83,22 +121,43 @@ class GroupAvailabilityCalculator {
return availableNodes / (double)g.getNodes().size();
}
+ private int computeNodesSafelySetToMaintenance(Group group) {
+ Set<ConfiguredNode> nodesInGroupSafelySetToMaintenance = group.getNodes().stream()
+ .filter(configuredNode -> nodesSafelySetToMaintenance.contains(configuredNode.index()))
+ .collect(Collectors.toSet());
+
+ return nodesInGroupSafelySetToMaintenance.size();
+ }
+
private void markAllAvailableGroupNodeIndicesAsDown(Group group) {
candidateNodesForSettingDown(group).forEach(n -> implicitlyDown.add(n.index()));
}
+ private void markAllAvailableGroupNodeIndicesAsMaintained(Group group) {
+ candidateNodesForSettingMaintenance(group).forEach(n -> implicitlyMaintained.add(n.index()));
+ }
+
@Override
public boolean visitGroup(Group group) {
if (group.isLeafGroup()) {
- if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
+ if (safeMaintenanceGroupThreshold > 0 &&
+ computeNodesSafelySetToMaintenance(group) >= safeMaintenanceGroupThreshold) {
+ markAllAvailableGroupNodeIndicesAsMaintained(group);
+ } else if (computeGroupAvailability(group) < minNodeRatioPerGroup) {
markAllAvailableGroupNodeIndicesAsDown(group);
}
}
return true;
}
- Set<Integer> implicitlyDownNodeIndices() {
- return implicitlyDown;
+ Result result() {
+ var intersection = new HashSet<>(implicitlyMaintained);
+ intersection.retainAll(implicitlyDown);
+ if (intersection.size() > 0) {
+ throw new IllegalStateException("Nodes implicitly both maintenance and down: " + intersection);
+ }
+
+ return new Result(implicitlyMaintained, implicitlyDown);
}
}
@@ -106,17 +165,37 @@ class GroupAvailabilityCalculator {
return root.isLeafGroup();
}
- public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
+ public static class Result {
+ private final Set<Integer> shouldBeMaintained;
+ private final Set<Integer> shouldBeDown;
+
+ public Result() { this(Set.of(), Set.of()); }
+
+ public Result(Set<Integer> shouldBeMaintained, Set<Integer> shouldBeDown) {
+ this.shouldBeMaintained = Set.copyOf(shouldBeMaintained);
+ this.shouldBeDown = Set.copyOf(shouldBeDown);
+ }
+
+ public Set<Integer> nodesThatShouldBeMaintained() { return shouldBeMaintained; }
+ public Set<Integer> nodesThatShouldBeDown() { return shouldBeDown; }
+ }
+
+ public Result calculate(ClusterState state) {
if (distribution == null) { // FIXME: for tests that don't set distribution properly!
- return Collections.emptySet();
+ return new Result();
}
if (isFlatCluster(distribution.getRootGroup())) {
// Implicit group takedown only applies to hierarchic cluster setups.
- return new HashSet<>();
+ return new Result();
}
- InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(state);
+ InsufficientAvailabilityGroupVisitor visitor = new InsufficientAvailabilityGroupVisitor(
+ state, nodesSafelySetToMaintenance, safeMaintenanceGroupThreshold);
distribution.visitGroups(visitor);
- return visitor.implicitlyDownNodeIndices();
+ return visitor.result();
+ }
+
+ public Set<Integer> nodesThatShouldBeDown(ClusterState state) {
+ return calculate(state).nodesThatShouldBeDown();
}
}
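
Putting the new builder options and Result type together, roughly as the ClusterStateGenerator hunk above does. This is a sketch only: it assumes it runs inside the com.yahoo.vespa.clustercontroller.core package (the calculator and its builder methods are package-private), with java.util.Set imported and a ContentCluster `cluster` and a ClusterState `workingState` in scope:

    GroupAvailabilityCalculator calc = new GroupAvailabilityCalculator.Builder()
            .withMinNodeRatioPerGroup(0.6)                  // illustrative ratio
            .withSafeMaintenanceGroupThreshold(2)           // matches the builder default
            .withDistribution(cluster.getDistribution())
            .withNodesSafelySetToMaintenance(cluster.nodesSafelySetTo(State.MAINTENANCE))
            .build();
    GroupAvailabilityCalculator.Result result = calc.calculate(workingState);
    // Storage nodes to force into maintenance (their distributors are taken down):
    Set<Integer> maintained = result.nodesThatShouldBeMaintained();
    // Storage nodes to take down via the pre-existing min-node-ratio rule:
    Set<Integer> down = result.nodesThatShouldBeDown();
    // The old entry point remains and now delegates to calculate():
    Set<Integer> sameDown = calc.nodesThatShouldBeDown(workingState);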
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
index 3f550724cef..77ab8539219 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateReason.java
@@ -7,7 +7,7 @@ public enum NodeStateReason {
NODE_TOO_UNSTABLE,
WITHIN_MAINTENANCE_GRACE_PERIOD,
NODE_NOT_BACK_UP_WITHIN_GRACE_PERIOD,
- FORCED_INTO_MAINTENANCE,
+ GROUP_IN_MAINTENANCE,
GROUP_IS_DOWN,
MAY_HAVE_MERGES_PENDING
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
index 08329c874b5..0ca8b010191 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterStateGeneratorTest.java
@@ -704,6 +704,34 @@ public class ClusterStateGeneratorTest {
assertThat(state.toString(), equalTo("distributor:9 storage:9 .3.s:m .4.s:d .5.s:d"));
}
+ @Test
+ public void group_nodes_are_marked_maintenance_if_group_availability_too_low_by_orchestrator() {
+ final ClusterFixture fixture = ClusterFixture
+ .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3))
+ .bringEntireClusterUp()
+ .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION)
+ .proposeStorageNodeWantedState(5, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
+ final ClusterStateGenerator.Params params = fixture.generatorParams();
+
+        // Both nodes 4 and 5 are in maintenance by the Orchestrator, which forces the remaining node
+        // in the group (node 3) into maintenance (and its distributor down).
+ final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params);
+ assertThat(state.toString(), equalTo("distributor:9 .3.s:d storage:9 .3.s:m .4.s:m .5.s:m"));
+ }
+
+ @Test
+ public void group_nodes_are_not_marked_maintenance_if_group_availability_high_by_orchestrator() {
+ final ClusterFixture fixture = ClusterFixture
+ .forHierarchicCluster(DistributionBuilder.withGroups(3).eachWithNodeCount(3))
+ .bringEntireClusterUp()
+ .proposeStorageNodeWantedState(4, State.MAINTENANCE, NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION);
+ final ClusterStateGenerator.Params params = fixture.generatorParams();
+
+        // Node 4 is in maintenance by the Orchestrator, which is not sufficient to force the group into maintenance.
+ final AnnotatedClusterState state = ClusterStateGenerator.generatedStateFrom(params);
+ assertThat(state.toString(), equalTo("distributor:9 storage:9 .4.s:m"));
+ }
+
/**
* Cluster-wide distribution bit count cannot be higher than the lowest split bit
* count reported by the set of storage nodes. This is because the distribution bit
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
index 07bd18c3667..f77a68e0fda 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/GroupAvailabilityCalculatorTest.java
@@ -7,6 +7,7 @@ import org.junit.Test;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import static org.hamcrest.CoreMatchers.equalTo;
@@ -182,4 +183,44 @@ public class GroupAvailabilityCalculatorTest {
"distributor:8 storage:8 .0.s:d .1.s:d .2.s:d .4.s:d .5.s:d .6.s:d")), equalTo(emptySet()));
}
+ @Test
+    public void one_safe_maintenance_node_does_not_set_group_in_maintenance() {
+        // 2 groups of 5 nodes each. Set node #5 safely in maintenance (1st node in the last group).
+        // Since at least 2 nodes must be safely set to maintenance before the rest of the group is
+        // also put into maintenance, the rest of the group should NOT be put into maintenance.
+
+ DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5);
+ GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder))
+ .withMinNodeRatioPerGroup(0)
+ .withSafeMaintenanceGroupThreshold(2)
+ .withNodesSafelySetToMaintenance(List.of(5))
+ .build();
+
+ GroupAvailabilityCalculator.Result result = calculator
+ .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d"));
+ assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices()));
+ assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));
+ }
+
+ @Test
+    public void two_safe_maintenance_nodes_set_group_in_maintenance() {
+        // 2 groups of 5 nodes each. Set nodes #5 and #6 safely in maintenance (1st and 2nd nodes
+        // in the last group, respectively). Since at least 2 nodes must be safely set to maintenance
+        // before the rest of the group is also put into maintenance, the remaining nodes in the group
+        // should be put into maintenance.
+
+ DistributionBuilder.GroupBuilder groupBuilder = DistributionBuilder.withGroups(2).eachWithNodeCount(5);
+ GroupAvailabilityCalculator calculator = GroupAvailabilityCalculator.builder()
+ .withDistribution(DistributionBuilder.forHierarchicCluster(groupBuilder))
+ .withMinNodeRatioPerGroup(0)
+ .withSafeMaintenanceGroupThreshold(2)
+ .withNodesSafelySetToMaintenance(List.of(5, 6))
+ .build();
+
+ GroupAvailabilityCalculator.Result result = calculator
+ .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m .8.s:r .9.s:d"));
+ assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices(7, 8, 9)));
+ assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));
+ }
+
}
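
The javadoc on withSafeMaintenanceGroupThreshold() notes that a threshold of 0 disables the feature. A sketch of that case in the style of the tests above, reusing this test class's DistributionBuilder, clusterState(...) and indices(...) helpers (names as in this file, values illustrative):

    GroupAvailabilityCalculator disabled = GroupAvailabilityCalculator.builder()
            .withDistribution(DistributionBuilder.forHierarchicCluster(
                    DistributionBuilder.withGroups(2).eachWithNodeCount(5)))
            .withMinNodeRatioPerGroup(0)
            .withSafeMaintenanceGroupThreshold(0)           // 0 disables the safe-maintenance rule
            .withNodesSafelySetToMaintenance(List.of(5, 6))
            .build();
    GroupAvailabilityCalculator.Result result = disabled
            .calculate(clusterState("distributor:10 storage:10 .5.s:m .6.s:m"));
    // Nothing is forced into maintenance even though two nodes in the group are safely down,
    // and with a min node ratio of 0 nothing is taken down either:
    assertThat(result.nodesThatShouldBeMaintained(), equalTo(indices()));
    assertThat(result.nodesThatShouldBeDown(), equalTo(indices()));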