diff options
author | Harald Musum <musum@yahooinc.com> | 2023-04-18 10:52:34 +0200 |
---|---|---|
committer | Harald Musum <musum@yahooinc.com> | 2023-04-18 10:52:34 +0200 |
commit | 0e68174077b41501e7cb297a95d6326eb2537d19 (patch) | |
tree | 33335dbfbee78b6abbf93b1161e4644d0cff3184 /clustercontroller-core | |
parent | e87be82cddf3145659a9ed75a0f5730fcb345fe3 (diff) |
Handle case where a node has another description for wanted state
Also add group indexes for disallow messages where relevant
Diffstat (limited to 'clustercontroller-core')
2 files changed, 90 insertions, 19 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index 84deeeac4e3..c25f0195e41 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -13,7 +13,9 @@ import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo; import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics; import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode; import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; +import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Objects; @@ -24,6 +26,7 @@ import java.util.stream.Collectors; import static com.yahoo.vdslib.state.NodeType.STORAGE; import static com.yahoo.vdslib.state.State.DOWN; +import static com.yahoo.vdslib.state.State.MAINTENANCE; import static com.yahoo.vdslib.state.State.RETIRED; import static com.yahoo.vdslib.state.State.UP; import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.allowSettingOfWantedState; @@ -231,7 +234,7 @@ public class NodeStateChangeChecker { return allowSettingOfWantedState(); } } else { - var result = otherNodesHaveWantedState(nodeInfo, newDescription); + var result = otherNodesHaveWantedState(nodeInfo, newDescription, clusterState); if (result.isPresent()) return result.get(); } @@ -292,20 +295,42 @@ public class NodeStateChangeChecker { * if less than maxNumberOfGroupsAllowedToBeDown: return Optional.of(allowed) * else: if node is in group with nodes already down: return Optional.of(allowed), else Optional.of(disallowed) */ - private Optional<Result> otherNodesHaveWantedState(StorageNodeInfo nodeInfo, String newDescription) { + private Optional<Result> otherNodesHaveWantedState(StorageNodeInfo nodeInfo, String newDescription, ClusterState clusterState) { Node node = nodeInfo.getNode(); if (groupVisiting.isHierarchical()) { - Set<Integer> groupsWithStorageNodesWantedStateNotUp = groupsWithUserWantedStateNotUp(); - String disallowMessage = "At most nodes in " + maxNumberOfGroupsAllowedToBeDown + " groups can have wanted state"; - if (groupsWithStorageNodesWantedStateNotUp.size() == 0) + Set<Integer> groupsWithNodesWantedStateNotUp = groupsWithUserWantedStateNotUp(); + if (groupsWithNodesWantedStateNotUp.size() == 0) { + log.log(FINE, "groupsWithNodesWantedStateNotUp=0"); return Optional.empty(); - if (groupsWithStorageNodesWantedStateNotUp.size() < maxNumberOfGroupsAllowedToBeDown) + } + + Set<Integer> groupsWithSameStateAndDescription = groupsWithSameStateAndDescription(MAINTENANCE, newDescription); + if (aGroupContainsNode(groupsWithSameStateAndDescription, node)) { + log.log(FINE, "Node is in group with same state and description, allow"); return Optional.of(allowSettingOfWantedState()); - if (aGroupContainsNode(groupsWithStorageNodesWantedStateNotUp, node)) + } + // There are groups with nodes not up, but with another description, probably operator set + if (groupsWithSameStateAndDescription.size() == 0) { + return Optional.of(createDisallowed("Wanted state already set for another node in groups: " + + sortSetIntoList(groupsWithNodesWantedStateNotUp))); + } + + Set<Integer> retiredOrNotUpGroups = retiredOrNotUpGroups(clusterState, MAINTENANCE); + int numberOfGroupsToConsider = retiredOrNotUpGroups.size(); + // Subtract one group if node is in a group with nodes already retired or not up, since number of such groups wil + // not increase if we allow node to go down + if (aGroupContainsNode(retiredOrNotUpGroups, node)) { + numberOfGroupsToConsider = retiredOrNotUpGroups.size() - 1; + } + if (numberOfGroupsToConsider < maxNumberOfGroupsAllowedToBeDown) { + log.log(FINE, "Allow, retiredOrNotUpGroups=" + retiredOrNotUpGroups); return Optional.of(allowSettingOfWantedState()); + } - return Optional.of(createDisallowed(disallowMessage)); + return Optional.of(createDisallowed(String.format("At most %d groups can have wanted state: %s", + maxNumberOfGroupsAllowedToBeDown, + sortSetIntoList(retiredOrNotUpGroups)))); } else { // Return a disallow-result if there is another node with a wanted state var otherNodeHasWantedState = otherNodeHasWantedState(nodeInfo); @@ -315,6 +340,12 @@ public class NodeStateChangeChecker { return Optional.empty(); } + private ArrayList<Integer> sortSetIntoList(Set<Integer> retiredOrNotUpGroups) { + var groupsWithNodesDown = new ArrayList<>(retiredOrNotUpGroups); + Collections.sort(groupsWithNodesDown); + return groupsWithNodesDown; + } + /** Returns a disallow-result, if there is a node in the group with wanted state != UP. */ private Result otherNodeInGroupHasWantedState(Group group) { for (var configuredNode : group.getNodes()) { @@ -514,4 +545,31 @@ public class NodeStateChangeChecker { .collect(Collectors.toSet()); } + // groups with at least one node with the same state & description + private Set<Integer> groupsWithSameStateAndDescription(State state, String newDescription) { + return clusterInfo.getAllNodeInfos().stream() + .filter(nodeInfo -> { + var userWantedState = nodeInfo.getUserWantedState(); + return userWantedState.getState() == state && + Objects.equals(userWantedState.getDescription(), newDescription); + }) + .map(NodeInfo::getGroup) + .filter(Objects::nonNull) + .filter(Group::isLeafGroup) + .map(Group::getIndex) + .collect(Collectors.toSet()); + } + + // groups with at least one node with the same state & description + private Set<Integer> retiredOrNotUpGroups(ClusterState clusterState, State... states) { + return clusterInfo.getAllNodeInfos().stream() + .filter(nodeInfo -> Set.of(states).contains(nodeInfo.getUserWantedState().getState()) + || Set.of(states).contains(clusterState.getNodeState(nodeInfo.getNode()).getState())) + .map(NodeInfo::getGroup) + .filter(Objects::nonNull) + .filter(Group::isLeafGroup) + .map(Group::getIndex) + .collect(Collectors.toSet()); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index 551b60be741..4c55dfa2131 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -201,7 +201,7 @@ public class NodeStateChangeCheckerTest { Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); assertFalse(result.settingWantedStateIsAllowed(), result.toString()); assertFalse(result.wantedStateAlreadySet()); - assertEquals("At most nodes in 2 groups can have wanted state", result.getReason()); + assertEquals("At most 2 groups can have wanted state: [0, 1]", result.getReason()); } // Nodes in group 0 and 1 in maintenance, try to set storage node in group 2 to maintenance, should fail @@ -212,14 +212,14 @@ public class NodeStateChangeCheckerTest { Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); assertFalse(result.settingWantedStateIsAllowed(), result.toString()); assertFalse(result.wantedStateAlreadySet()); - assertEquals("At most nodes in 2 groups can have wanted state", result.getReason()); + assertEquals("At most 2 groups can have wanted state: [0, 1]", result.getReason()); } } @Test void testMaintenanceAllowedFor2Of4Groups8Nodes() { - // 4 groups with 2 node in each group + // 4 groups with 2 nodes in each group Collection<ConfiguredNode> nodes = createNodes(8); StorDistributionConfig config = createDistributionConfig(8, 4); @@ -244,7 +244,7 @@ public class NodeStateChangeCheckerTest { setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); } - // Nodes in group 0 in maintenance, try to set storage node 2 in group 1 to maintenance + // 2 nodes in group 0 in maintenance, try to set storage node 2 in group 1 to maintenance { ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m", currentClusterStateVersion)); int nodeIndex = 2; @@ -260,7 +260,7 @@ public class NodeStateChangeCheckerTest { Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); assertFalse(result.settingWantedStateIsAllowed(), result.toString()); assertFalse(result.wantedStateAlreadySet()); - assertEquals("At most nodes in 2 groups can have wanted state", result.getReason()); + assertEquals("At most 2 groups can have wanted state: [0, 1]", result.getReason()); } // 2 nodes in group 0 and 1 in group 1 in maintenance, try to set storage node 3 in group 1 to maintenance @@ -271,6 +271,19 @@ public class NodeStateChangeCheckerTest { setStorageNodeWantedStateToMaintenance(cluster, nodeIndex); } + // 2 nodes in group 0 in maintenance, storage node 3 in group 1 is in maintenance with another description + // (set in maintenance by operator), try to set storage node 3 in group 1 to maintenance, should bew allowed + { + ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .3.s:m", currentClusterStateVersion)); + setStorageNodeWantedState(cluster, 3, MAINTENANCE, "Maintenance, set by operator"); // Set to another description + setStorageNodeWantedState(cluster, 2, UP, ""); // Set back to UP, want to set this to maintenance again + int nodeIndex = 2; + Node node = new Node(STORAGE, nodeIndex); + Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); + assertTrue(result.settingWantedStateIsAllowed(), result.toString()); + assertFalse(result.wantedStateAlreadySet()); + } + } @ParameterizedTest @@ -313,7 +326,7 @@ public class NodeStateChangeCheckerTest { assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); if (maxNumberOfGroupsAllowedToBeDown >= 1) - assertEquals("At most nodes in 1 groups can have wanted state", result.getReason()); + assertEquals("Wanted state already set for another node in groups: [0]", result.getReason()); else assertEquals("At most one group can have wanted state: Other distributor 0 in group 0 has wanted state Down", result.getReason()); } @@ -325,8 +338,8 @@ public class NodeStateChangeCheckerTest { new Node(STORAGE, 1), clusterStateWith0InMaintenance, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); if (maxNumberOfGroupsAllowedToBeDown >= 1) { - assertTrue(result.settingWantedStateIsAllowed(), result.getReason()); - assertFalse(result.wantedStateAlreadySet()); + assertFalse(result.settingWantedStateIsAllowed(), result.getReason()); + assertEquals("Wanted state already set for another node in groups: [0]", result.getReason()); } else { assertFalse(result.settingWantedStateIsAllowed(), result.getReason()); assertEquals("Another distributor wants state DOWN: 0", result.getReason()); @@ -354,7 +367,7 @@ public class NodeStateChangeCheckerTest { assertFalse(result.settingWantedStateIsAllowed()); assertFalse(result.wantedStateAlreadySet()); if (maxNumberOfGroupsAllowedToBeDown >= 1) - assertEquals("At most nodes in 1 groups can have wanted state", result.getReason()); + assertEquals("At most 1 groups can have wanted state: [0]", result.getReason()); else assertEquals("At most one group can have wanted state: Other storage node 0 in group 0 has wanted state Maintenance", result.getReason()); @@ -914,13 +927,13 @@ public class NodeStateChangeCheckerTest { private void checkSettingToMaintenanceIsAllowed(int nodeIndex, NodeStateChangeChecker nodeStateChangeChecker, ClusterState clusterState) { Node node = new Node(STORAGE, nodeIndex); Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); - assertTrue(result.settingWantedStateIsAllowed()); + assertTrue(result.settingWantedStateIsAllowed(), result.toString()); assertFalse(result.wantedStateAlreadySet()); assertEquals("Preconditions fulfilled and new state different", result.getReason()); } private void setStorageNodeWantedStateToMaintenance(ContentCluster cluster, int nodeIndex) { - cluster.clusterInfo().getStorageNodeInfo(nodeIndex).setWantedState(MAINTENANCE_NODE_STATE.setDescription("Orchestrator")); + setStorageNodeWantedState(cluster, nodeIndex, MAINTENANCE, "Orchestrator"); } private void setStorageNodeWantedState(ContentCluster cluster, int nodeIndex, State state, String description) { |