From 05b4c5f5b2192657f6c1ecd29e04593abfa4ea6f Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Thu, 30 Mar 2023 15:28:39 +0200 Subject: Allow more than 1 group in a content to be down at the same time Based on config, all functional changes guarded by config field max_number_of_groups_allowed_to_be_down in fleetcontroller config --- .../core/NodeStateChangeChecker.java | 90 ++++++++++++- .../core/NodeStateChangeCheckerTest.java | 143 +++++++++++++++++++-- 2 files changed, 217 insertions(+), 16 deletions(-) diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index e242833fd0c..adbc8da8d08 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -13,13 +13,18 @@ import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics; import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode; import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; +import java.util.Collection; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.logging.Logger; +import java.util.stream.Collectors; import static com.yahoo.vdslib.state.NodeType.STORAGE; import static com.yahoo.vdslib.state.State.DOWN; +import static com.yahoo.vdslib.state.State.MAINTENANCE; import static com.yahoo.vdslib.state.State.RETIRED; import static com.yahoo.vdslib.state.State.UP; import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.allowSettingOfWantedState; @@ -27,6 +32,8 @@ import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Resu import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.createDisallowed; import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.FORCE; import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.SAFE; +import static java.util.logging.Level.FINE; +import static java.util.logging.Level.INFO; /** * Checks if a node can be upgraded. @@ -35,8 +42,9 @@ import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetU */ public class NodeStateChangeChecker { - public static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total"; - public static final Map BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default"); + private static final Logger log = Logger.getLogger(NodeStateChangeChecker.class.getName()); + private static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total"; + private static final Map BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default"); private final int requiredRedundancy; private final HierarchicalGroupVisiting groupVisiting; @@ -50,6 +58,8 @@ public class NodeStateChangeChecker { this.clusterInfo = cluster.clusterInfo(); this.inMoratorium = inMoratorium; this.maxNumberOfGroupsAllowedToBeDown = cluster.maxNumberOfGroupsAllowedToBeDown(); + if ( ! groupVisiting.isHierarchical() && maxNumberOfGroupsAllowedToBeDown > 1) + throw new IllegalArgumentException("Cannot have both 1 group and maxNumberOfGroupsAllowedToBeDown > 1"); } public static class Result { @@ -214,26 +224,37 @@ public class NodeStateChangeChecker { oldWantedState.getState() + ": " + oldWantedState.getDescription()); } - Result otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo); - if (!otherGroupCheck.settingWantedStateIsAllowed()) { - return otherGroupCheck; + if (moreThanOneGroupAllowedToBeDown()) { + Result otherGroupCheck = nodesInOtherGroups(nodeInfo.getNode()); + if (!otherGroupCheck.settingWantedStateIsAllowed()) { + return otherGroupCheck; + } + } else { + Result otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo); + if (!otherGroupCheck.settingWantedStateIsAllowed()) { + return otherGroupCheck; + } } if (clusterState.getNodeState(nodeInfo.getNode()).getState() == DOWN) { + log.log(FINE, "node is DOWN, allow"); return allowSettingOfWantedState(); } if (anotherNodeInGroupAlreadyAllowed(nodeInfo, newDescription)) { + log.log(FINE, "anotherNodeInGroupAlreadyAllowed, allow"); return allowSettingOfWantedState(); } Result allNodesAreUpCheck = checkAllNodesAreUp(clusterState); if (!allNodesAreUpCheck.settingWantedStateIsAllowed()) { + log.log(FINE, "allNodesAreUpCheck: " + allNodesAreUpCheck); return allNodesAreUpCheck; } Result checkDistributorsResult = checkDistributors(nodeInfo.getNode(), clusterState.getVersion()); if (!checkDistributorsResult.settingWantedStateIsAllowed()) { + log.log(FINE, "checkDistributors: "+ checkDistributorsResult); return checkDistributorsResult; } @@ -268,6 +289,29 @@ public class NodeStateChangeChecker { } } + /** + * Returns a disallow-result if there is other groups with nodes that are allowed to be down + * so that the number of groups > maxNumberOfGroupsAllowedToBeDown + */ + private Result nodesInOtherGroups(Node node) { + log.log(INFO, maxNumberOfGroupsAllowedToBeDown + " groups allowed to be down"); + var groupsWithNodesInMaintenance = groupsWithStorageNodesInMaintenance(); + log.log(INFO, "groupsWithStorageNodesInMaintenance " + groupsWithNodesInMaintenance); + if (! groupsWithNodesInMaintenance.isEmpty() && oneOfGroupsContainsNode(groupsWithNodesInMaintenance, node)) { + log.log(INFO, "a group with storage nodes in maintenance contains node " + node); + return allowSettingOfWantedState(); + } + if (groupsWithNodesInMaintenance.size() >= maxNumberOfGroupsAllowedToBeDown) + return createDisallowed("Nodes in " + groupsWithNodesInMaintenance.size() + " groups are already down, cannot take down another node"); + else { + return allowSettingOfWantedState(); + } + } + + private boolean moreThanOneGroupAllowedToBeDown() { + return maxNumberOfGroupsAllowedToBeDown > 1; + } + /** Returns a disallow-result, if there is a node in the group with wanted state != UP. */ private Result otherNodeInGroupHasWantedState(Group group) { for (var configuredNode : group.getNodes()) { @@ -354,7 +398,22 @@ public class NodeStateChangeChecker { return false; } + private static boolean oneOfGroupsContainsNode(Collection groups, Node node) { + for (Group group : groups) { + for (ConfiguredNode configuredNode : group.getNodes()) { + if (configuredNode.index() == node.getIndex()) { + return true; + } + } + } + + return false; + } + private Result checkAllNodesAreUp(ClusterState clusterState) { + // TODO Revisit this, check that all nodes in other groups are up? + if (moreThanOneGroupAllowedToBeDown()) return allowSettingOfWantedState(); + // This method verifies both storage nodes and distributors are up (or retired). // The complicated part is making a summary error message. @@ -441,4 +500,25 @@ public class NodeStateChangeChecker { return allowSettingOfWantedState(); } + private Collection groupsWithStorageNodesInMaintenance() { + var groupIndexes = clusterInfo.getStorageNodeInfos().stream() + .filter(sni -> MAINTENANCE.equals(sni.getWantedState().getState())) + .map(NodeInfo::getGroup) + .filter(Objects::nonNull) + .filter(Group::isLeafGroup) + .map(Group::getIndex) + .collect(Collectors.toSet()); + + // Groups are not equal even if they have the same index + var groups = new HashMap(); + for (NodeInfo nodeInfo : clusterInfo.getStorageNodeInfos()) { + Group group = nodeInfo.getGroup(); + int index = group.getIndex(); + if (groupIndexes.contains(index) && !groups.containsKey(index)) + groups.put(index, group); + } + + return groups.values(); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index a22785e6d7a..58f9b563288 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -19,6 +19,7 @@ import static com.yahoo.vdslib.state.NodeType.DISTRIBUTOR; import static com.yahoo.vdslib.state.NodeType.STORAGE; import static com.yahoo.vdslib.state.State.DOWN; import static com.yahoo.vdslib.state.State.INITIALIZING; +import static com.yahoo.vdslib.state.State.MAINTENANCE; import static com.yahoo.vdslib.state.State.UP; import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result; import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.FORCE; @@ -37,7 +38,7 @@ public class NodeStateChangeCheckerTest { private static final Node nodeStorage = new Node(STORAGE, 1); private static final NodeState UP_NODE_STATE = new NodeState(STORAGE, UP); - private static final NodeState MAINTENANCE_NODE_STATE = createNodeState(State.MAINTENANCE, "Orchestrator"); + private static final NodeState MAINTENANCE_NODE_STATE = createNodeState(MAINTENANCE, "Orchestrator"); private static final NodeState DOWN_NODE_STATE = createNodeState(DOWN, "RetireEarlyExpirer"); private static NodeState createNodeState(State state, String description) { @@ -53,7 +54,11 @@ public class NodeStateChangeCheckerTest { } private static ClusterState defaultAllUpClusterState() { - return clusterState(String.format("version:%d distributor:4 storage:4", currentClusterStateVersion)); + return defaultAllUpClusterState(4); + } + + private static ClusterState defaultAllUpClusterState(int nodeCount) { + return clusterState(String.format("version:%d distributor:" + nodeCount + " storage:" + nodeCount, currentClusterStateVersion)); } private NodeStateChangeChecker createChangeChecker(ContentCluster cluster) { @@ -144,7 +149,7 @@ public class NodeStateChangeCheckerTest { void testSafeMaintenanceDisallowedWhenOtherStorageNodeInFlatClusterIsSuspended() { // Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description. ContentCluster cluster = createCluster(4); - cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator")); + setStorageNodeWantedStateToMaintenance(cluster, 0); var nodeStateChangeChecker = createChangeChecker(cluster); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 storage:4 .0.s:m", @@ -159,12 +164,106 @@ public class NodeStateChangeCheckerTest { result.getReason()); } + @Test + void testMaintenanceAllowedFor2Of4Groups() { + // 4 groups with 1 node in each group + Collection nodes = createNodes(4); + StorDistributionConfig config = createDistributionConfig(4, 4); + + int maxNumberOfGroupsAllowedToBeDown = 2; + var cluster = new ContentCluster("Clustername", nodes, new Distribution(config), maxNumberOfGroupsAllowedToBeDown); + setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6))); + var nodeStateChangeChecker = createChangeChecker(cluster); + + // All nodes up, set a storage node in group 0 to maintenance + { + int nodeIndex = 0; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, defaultAllUpClusterState()); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + // Node in group 0 in maintenance, set storage node in group 1 to maintenance + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:4 .0.s:d storage:4 .0.s:m", currentClusterStateVersion)); + int nodeIndex = 1; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + // Nodes in group 0 and 1 in maintenance, try to set storage node in group 2 to maintenance, should fail + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:4 storage:4 .0.s:m .1.s:m", currentClusterStateVersion)); + int nodeIndex = 2; + Node node = new Node(STORAGE, nodeIndex); + Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); + assertFalse(result.settingWantedStateIsAllowed(), result.toString()); + assertFalse(result.wantedStateAlreadySet()); + assertEquals("Nodes in 2 groups are already down, cannot take down another node", result.getReason()); + } + + } + + @Test + void testMaintenanceAllowedFor2Of4Groups8Nodes() { + // 4 groups with 2 node in each group + Collection nodes = createNodes(8); + StorDistributionConfig config = createDistributionConfig(8, 4); + + int maxNumberOfGroupsAllowedToBeDown = 2; + var cluster = new ContentCluster("Clustername", nodes, new Distribution(config), maxNumberOfGroupsAllowedToBeDown); + setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6))); + var nodeStateChangeChecker = createChangeChecker(cluster); + + // All nodes up, set a storage node in group 0 to maintenance + { + ClusterState clusterState = defaultAllUpClusterState(8); + int nodeIndex = 0; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + // 1 Node in group 0 in maintenance, try to set node 1 in group 0 to maintenance + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:8 .0.s:d storage:8 .0.s:m", currentClusterStateVersion)); + int nodeIndex = 1; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + // Nodes in group 0 in maintenance, try to set storage node 2 in group 1 to maintenance + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m", currentClusterStateVersion)); + int nodeIndex = 2; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + // 2 nodes in group 0 and 1 in group 1 in maintenance, try to set storage node 4 in group 2 to maintenance, should fail (different group) + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .2.s:m", currentClusterStateVersion)); + int nodeIndex = 4; + Node node = new Node(STORAGE, nodeIndex); + Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); + assertFalse(result.settingWantedStateIsAllowed(), result.toString()); + assertFalse(result.wantedStateAlreadySet()); + assertEquals("Nodes in 2 groups are already down, cannot take down another node", result.getReason()); + } + + // 2 nodes in group 0 and 1 in group 1 in maintenance, try to set storage node 3 in group 1 to maintenance + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .2.s:m", currentClusterStateVersion)); + int nodeIndex = 3; + checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState); + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); + } + + } + @Test void testSafeMaintenanceDisallowedWhenOtherDistributorInFlatClusterIsSuspended() { // Nodes 0-3, distributor 0 being down with "Orchestrator" description. ContentCluster cluster = createCluster(4); - cluster.clusterInfo().getDistributorNodeInfo(0) - .setWantedState(new NodeState(DISTRIBUTOR, DOWN).setDescription("Orchestrator")); + setDistributorNodeWantedState(cluster, 0, new NodeState(DISTRIBUTOR, DOWN), "Orchestrator"); var nodeStateChangeChecker = createChangeChecker(cluster); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 .0.s:d storage:4", @@ -184,8 +283,7 @@ public class NodeStateChangeCheckerTest { // Nodes 0-3, distributor 0 being in maintenance with "Orchestrator" description. // 2 groups: nodes 0-1 is group 0, 2-3 is group 1. ContentCluster cluster = createCluster(4, 2); - cluster.clusterInfo().getDistributorNodeInfo(0) - .setWantedState(new NodeState(STORAGE, DOWN).setDescription("Orchestrator")); + setDistributorNodeWantedState(cluster, 0, new NodeState(DISTRIBUTOR, DOWN), "Orchestrator"); var nodeStateChangeChecker = new NodeStateChangeChecker(cluster, false); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 .0.s:d storage:4", @@ -217,7 +315,7 @@ public class NodeStateChangeCheckerTest { // Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description. // 2 groups: nodes 0-1 is group 0, 2-3 is group 1. ContentCluster cluster = createCluster(4, 2); - cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator")); + setStorageNodeWantedState(cluster, 0, new NodeState(STORAGE, MAINTENANCE), "Orchestrator"); var nodeStateChangeChecker = new NodeStateChangeChecker(cluster, false); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 storage:4 .0.s:m", @@ -433,10 +531,9 @@ public class NodeStateChangeCheckerTest { NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster); for (int x = 0; x < cluster.clusterInfo().getConfiguredNodes().size(); x++) { - State state = UP; - cluster.clusterInfo().getDistributorNodeInfo(x).setReportedState(new NodeState(DISTRIBUTOR, state), 0); + cluster.clusterInfo().getDistributorNodeInfo(x).setReportedState(new NodeState(DISTRIBUTOR, UP), 0); cluster.clusterInfo().getDistributorNodeInfo(x).setHostInfo(HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6))); - cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(STORAGE, state), 0); + cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(STORAGE, UP), 0); } return nodeStateChangeChecker.evaluateTransition( @@ -757,4 +854,28 @@ public class NodeStateChangeCheckerTest { return configBuilder.build(); } + private void checkSettingToMaintenanceIsAllowed(int nodeIndex, NodeStateChangeChecker nodeStateChangeChecker, ClusterState clusterState) { + Node node = new Node(STORAGE, nodeIndex); + Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); + assertTrue(result.settingWantedStateIsAllowed()); + assertFalse(result.wantedStateAlreadySet()); + assertEquals("Preconditions fulfilled and new state different", result.getReason()); + } + + private void setStorageNodeWantedStateToMaintenance(ContentCluster cluster, int nodeIndex) { + setStorageNodeWantedStateToMaintenance(cluster, nodeIndex, "Orchestrator"); + } + + private void setStorageNodeWantedStateToMaintenance(ContentCluster cluster, int nodeIndex, String description) { + cluster.clusterInfo().getStorageNodeInfo(nodeIndex).setWantedState(MAINTENANCE_NODE_STATE.setDescription(description)); + } + + private void setStorageNodeWantedState(ContentCluster cluster, int nodeIndex, NodeState state, String description) { + cluster.clusterInfo().getStorageNodeInfo(nodeIndex).setWantedState(state.setDescription(description)); + } + + private void setDistributorNodeWantedState(ContentCluster cluster, int nodeIndex, NodeState state, String description) { + cluster.clusterInfo().getDistributorNodeInfo(nodeIndex).setWantedState(state.setDescription(description)); + } + } -- cgit v1.2.3