aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@verizonmedia.com>2023-04-19 12:55:25 +0200
committerGitHub <noreply@github.com>2023-04-19 12:55:25 +0200
commit1b0363508a5a14e8d9d39bb7421ed1040b1a2a6a (patch)
treebd8c0fef6267611e314cb35251bc497c72998055
parentf8f367921956b5e0a7e9927fdecaf1713a80fbf8 (diff)
parentc38a75f69c5206e30aa8023be08707fdb7adb132 (diff)
Merge pull request #26652 from vespa-engine/hmusum/allow-groups-to-be-down
Hmusum/allow groups to be down
-rw-r--r--clustercontroller-core/pom.xml5
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java8
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java151
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java407
-rw-r--r--configdefinitions/src/vespa/fleetcontroller.def7
-rw-r--r--parent/pom.xml5
6 files changed, 450 insertions, 133 deletions
diff --git a/clustercontroller-core/pom.xml b/clustercontroller-core/pom.xml
index b4ac5ca869c..647d8ca4e64 100644
--- a/clustercontroller-core/pom.xml
+++ b/clustercontroller-core/pom.xml
@@ -64,6 +64,11 @@
<scope>test</scope>
</dependency>
<dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-params</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<scope>provided</scope>
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index b83d70b8656..2535589395d 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -32,7 +32,7 @@ public class ContentCluster {
private final int maxNumberOfGroupsAllowedToBeDown;
public ContentCluster(String clusterName, Collection<ConfiguredNode> configuredNodes, Distribution distribution) {
- this(clusterName, configuredNodes, distribution, 1);
+ this(clusterName, configuredNodes, distribution, -1);
}
public ContentCluster(FleetControllerOptions options) {
@@ -40,9 +40,9 @@ public class ContentCluster {
}
ContentCluster(String clusterName,
- Collection<ConfiguredNode> configuredNodes,
- Distribution distribution,
- int maxNumberOfGroupsAllowedToBeDown) {
+ Collection<ConfiguredNode> configuredNodes,
+ Distribution distribution,
+ int maxNumberOfGroupsAllowedToBeDown) {
if (configuredNodes == null) throw new IllegalArgumentException("Nodes must be set");
this.clusterName = clusterName;
this.distribution = distribution;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index e242833fd0c..c823c94afd1 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -13,13 +13,20 @@ import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.clustercontroller.core.hostinfo.Metrics;
import com.yahoo.vespa.clustercontroller.core.hostinfo.StorageNode;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
import static com.yahoo.vdslib.state.NodeType.STORAGE;
import static com.yahoo.vdslib.state.State.DOWN;
+import static com.yahoo.vdslib.state.State.MAINTENANCE;
import static com.yahoo.vdslib.state.State.RETIRED;
import static com.yahoo.vdslib.state.State.UP;
import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.allowSettingOfWantedState;
@@ -27,6 +34,7 @@ import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Resu
import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result.createDisallowed;
import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.FORCE;
import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest.Condition.SAFE;
+import static java.util.logging.Level.FINE;
/**
* Checks if a node can be upgraded.
@@ -35,8 +43,9 @@ import static com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetU
*/
public class NodeStateChangeChecker {
- public static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total";
- public static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
+ private static final Logger log = Logger.getLogger(NodeStateChangeChecker.class.getName());
+ private static final String BUCKETS_METRIC_NAME = "vds.datastored.bucket_space.buckets_total";
+ private static final Map<String, String> BUCKETS_METRIC_DIMENSIONS = Map.of("bucketSpace", "default");
private final int requiredRedundancy;
private final HierarchicalGroupVisiting groupVisiting;
@@ -50,6 +59,8 @@ public class NodeStateChangeChecker {
this.clusterInfo = cluster.clusterInfo();
this.inMoratorium = inMoratorium;
this.maxNumberOfGroupsAllowedToBeDown = cluster.maxNumberOfGroupsAllowedToBeDown();
+ if ( ! groupVisiting.isHierarchical() && maxNumberOfGroupsAllowedToBeDown > 1)
+ throw new IllegalArgumentException("Cannot have both 1 group and maxNumberOfGroupsAllowedToBeDown > 1");
}
public static class Result {
@@ -214,26 +225,34 @@ public class NodeStateChangeChecker {
oldWantedState.getState() + ": " + oldWantedState.getDescription());
}
- Result otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo);
- if (!otherGroupCheck.settingWantedStateIsAllowed()) {
- return otherGroupCheck;
+ if (maxNumberOfGroupsAllowedToBeDown == -1) {
+ var otherGroupCheck = anotherNodeInAnotherGroupHasWantedState(nodeInfo);
+ if (!otherGroupCheck.settingWantedStateIsAllowed()) {
+ return otherGroupCheck;
+ }
+ if (anotherNodeInGroupAlreadyAllowed(nodeInfo, newDescription)) {
+ return allowSettingOfWantedState();
+ }
+ } else {
+ var result = otherNodesHaveWantedState(nodeInfo, newDescription, clusterState);
+ if (result.isPresent())
+ return result.get();
}
if (clusterState.getNodeState(nodeInfo.getNode()).getState() == DOWN) {
- return allowSettingOfWantedState();
- }
-
- if (anotherNodeInGroupAlreadyAllowed(nodeInfo, newDescription)) {
+ log.log(FINE, "node is DOWN, allow");
return allowSettingOfWantedState();
}
Result allNodesAreUpCheck = checkAllNodesAreUp(clusterState);
if (!allNodesAreUpCheck.settingWantedStateIsAllowed()) {
+ log.log(FINE, "allNodesAreUpCheck: " + allNodesAreUpCheck);
return allNodesAreUpCheck;
}
Result checkDistributorsResult = checkDistributors(nodeInfo.getNode(), clusterState.getVersion());
if (!checkDistributorsResult.settingWantedStateIsAllowed()) {
+ log.log(FINE, "checkDistributors: "+ checkDistributorsResult);
return checkDistributorsResult;
}
@@ -268,6 +287,65 @@ public class NodeStateChangeChecker {
}
}
+    /**
+     * Decides whether wanted states already set on other nodes settle this request.
+     * An empty return value means the caller goes on with its remaining checks.
+     *
+     * Flat setup: the disallow-result from otherNodeHasWantedState if there is one, otherwise empty.
+     * Hierarchical setup:
+     *   - empty when no group has a node with user wanted state other than UP
+     *   - allowed when the node is in a group that has a node in MAINTENANCE with the same description
+     *   - disallowed when no group at all has a node in MAINTENANCE with the same description
+     *   - otherwise allowed only while the number of groups (not counting the node's own group) that have a
+     *     node being neither UP nor RETIRED is below maxNumberOfGroupsAllowedToBeDown, else disallowed
+     */
+    private Optional<Result> otherNodesHaveWantedState(StorageNodeInfo nodeInfo, String newDescription, ClusterState clusterState) {
+        Node node = nodeInfo.getNode();
+
+        if ( ! groupVisiting.isHierarchical()) {
+            // Flat setup: a disallow-result is passed on as is, anything else leaves the decision to the caller
+            var flatCheck = otherNodeHasWantedState(nodeInfo);
+            return flatCheck.settingWantedStateIsAllowed() ? Optional.empty() : Optional.of(flatCheck);
+        }
+
+        Set<Integer> groupsNotWantedUp = groupsWithUserWantedStateNotUp();
+        if (groupsNotWantedUp.isEmpty()) {
+            log.log(FINE, "groupsWithNodesWantedStateNotUp=0");
+            return Optional.empty();
+        }
+
+        Set<Integer> matchingGroups = groupsWithSameStateAndDescription(MAINTENANCE, newDescription);
+        if (aGroupContainsNode(matchingGroups, node)) {
+            log.log(FINE, "Node is in group with same state and description, allow");
+            return Optional.of(allowSettingOfWantedState());
+        }
+        if (matchingGroups.isEmpty()) {
+            // Some groups have nodes not up, but with another description, probably set by an operator
+            return Optional.of(createDisallowed("Wanted state already set for another node in groups: " +
+                                                sortSetIntoList(groupsNotWantedUp)));
+        }
+
+        Set<Integer> groupsNotRetiredAndNotUp = groupsWithNotRetiredAndNotUp(clusterState);
+        // The node's own group is not counted: the number of affected groups does not grow
+        // when another node in an already affected group goes down
+        int otherAffectedGroups = aGroupContainsNode(groupsNotRetiredAndNotUp, node)
+                ? groupsNotRetiredAndNotUp.size() - 1
+                : groupsNotRetiredAndNotUp.size();
+        if (otherAffectedGroups < maxNumberOfGroupsAllowedToBeDown) {
+            log.log(FINE, "Allow, retiredAndNotUpGroups=" + groupsNotRetiredAndNotUp);
+            return Optional.of(allowSettingOfWantedState());
+        }
+
+        return Optional.of(createDisallowed(String.format("At most %d groups can have wanted state: %s",
+                                                          maxNumberOfGroupsAllowedToBeDown,
+                                                          sortSetIntoList(groupsNotRetiredAndNotUp))));
+    }
+
+    /** Returns the elements of the given set as a new list, sorted in natural (ascending) order. */
+    private ArrayList<Integer> sortSetIntoList(Set<Integer> set) {
+        return set.stream()
+                  .sorted()
+                  .collect(Collectors.toCollection(ArrayList::new));
+    }
+
/** Returns a disallow-result, if there is a node in the group with wanted state != UP. */
private Result otherNodeInGroupHasWantedState(Group group) {
for (var configuredNode : group.getNodes()) {
@@ -354,6 +432,22 @@ public class NodeStateChangeChecker {
return false;
}
+    /** Returns true if groupContainsNode holds for the given node and at least one group with one of the given indexes. */
+    private boolean aGroupContainsNode(Collection<Integer> groupIndexes, Node node) {
+        return getGroupsWithIndexes(groupIndexes).stream()
+                                                 .anyMatch(candidate -> groupContainsNode(candidate, node));
+    }
+
+    /**
+     * Returns the group of every storage node whose group index is in the given collection.
+     * A group is listed once per storage node in it, so the result may contain the same group several times.
+     * Storage nodes without a group are skipped, in line with the null-guard the other group lookups
+     * in this class apply after calling getGroup().
+     */
+    private List<Group> getGroupsWithIndexes(Collection<Integer> groupIndexes) {
+        return clusterInfo.getStorageNodeInfos().stream()
+                          .map(NodeInfo::getGroup)
+                          .filter(Objects::nonNull) // avoids a NullPointerException on getIndex() below
+                          .filter(group -> groupIndexes.contains(group.getIndex()))
+                          .collect(Collectors.toList());
+    }
+
private Result checkAllNodesAreUp(ClusterState clusterState) {
// This method verifies both storage nodes and distributors are up (or retired).
// The complicated part is making a summary error message.
@@ -441,4 +535,43 @@ public class NodeStateChangeChecker {
return allowSettingOfWantedState();
}
+    /** Returns the indexes of the leaf groups having at least one node whose user wanted state is not UP. */
+    private Set<Integer> groupsWithUserWantedStateNotUp() {
+        var nodesNotWantedUp = clusterInfo.getAllNodeInfos().stream()
+                                          .filter(info -> ! UP.equals(info.getUserWantedState().getState()));
+        return nodesNotWantedUp.map(NodeInfo::getGroup)
+                               .filter(group -> group != null && group.isLeafGroup())
+                               .map(Group::getIndex)
+                               .collect(Collectors.toSet());
+    }
+
+    /** Returns the indexes of the leaf groups having at least one node with the given user wanted state and description. */
+    private Set<Integer> groupsWithSameStateAndDescription(State state, String newDescription) {
+        return clusterInfo.getAllNodeInfos().stream()
+                          .filter(info -> info.getUserWantedState().getState() == state)
+                          .filter(info -> Objects.equals(info.getUserWantedState().getDescription(), newDescription))
+                          .map(NodeInfo::getGroup)
+                          .filter(group -> group != null && group.isLeafGroup())
+                          .map(Group::getIndex)
+                          .collect(Collectors.toSet());
+    }
+
+    /**
+     * Returns the indexes of the leaf groups having at least one node that is neither RETIRED nor UP,
+     * judged by either its user wanted state or its state in the given cluster state.
+     */
+    private Set<Integer> groupsWithNotRetiredAndNotUp(ClusterState clusterState) {
+        return clusterInfo.getAllNodeInfos().stream()
+                          .filter(info -> {
+                              var wanted = info.getUserWantedState().getState();
+                              if (wanted != RETIRED && wanted != UP) return true;
+                              // The cluster state is only consulted when the wanted state did not already qualify
+                              var current = clusterState.getNodeState(info.getNode()).getState();
+                              return current != RETIRED && current != UP;
+                          })
+                          .map(NodeInfo::getGroup)
+                          .filter(group -> group != null && group.isLeafGroup())
+                          .map(Group::getIndex)
+                          .collect(Collectors.toSet());
+    }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 45ca07c88e4..c4fd7cb69b9 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -10,7 +10,8 @@ import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;
import com.yahoo.vespa.config.content.StorDistributionConfig;
import org.junit.jupiter.api.Test;
-import java.text.ParseException;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
@@ -45,13 +46,7 @@ public class NodeStateChangeCheckerTest {
return new NodeState(STORAGE, state).setDescription(description);
}
- private static ClusterState clusterState(String state) {
- try {
- return new ClusterState(state);
- } catch (ParseException e) {
- throw new RuntimeException(e);
- }
- }
+ private static ClusterState clusterState(String state) { return ClusterState.stateFromString(state); }
private static ClusterState defaultAllUpClusterState() {
return defaultAllUpClusterState(4);
@@ -68,14 +63,14 @@ public class NodeStateChangeCheckerTest {
return new NodeStateChangeChecker(cluster, false);
}
- private ContentCluster createCluster(int nodeCount) {
- return createCluster(nodeCount, 1);
+ private ContentCluster createCluster(int nodeCount, int maxNumberOfGroupsAllowedToBeDown) {
+ return createCluster(nodeCount, 1, maxNumberOfGroupsAllowedToBeDown);
}
- private ContentCluster createCluster(int nodeCount, int groupCount) {
- Collection<ConfiguredNode> nodes = createNodes(nodeCount);
+ private ContentCluster createCluster(int nodeCount, int groupCount, int maxNumberOfGroupsAllowedToBeDown) {
+ List<ConfiguredNode> nodes = createNodes(nodeCount);
Distribution distribution = new Distribution(createDistributionConfig(nodeCount, groupCount));
- return new ContentCluster("Clustername", nodes, distribution);
+ return new ContentCluster("Clustername", nodes, distribution, maxNumberOfGroupsAllowedToBeDown);
}
private String createDistributorHostInfo(int replicationfactor1, int replicationfactor2, int replicationfactor3) {
@@ -113,9 +108,10 @@ public class NodeStateChangeCheckerTest {
}
}
- @Test
- void testCanUpgradeForce() {
- var nodeStateChangeChecker = createChangeChecker(createCluster(1));
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeWithForce(int maxNumberOfGroupsAllowedToBeDown) {
+ var nodeStateChangeChecker = createChangeChecker(createCluster(1, maxNumberOfGroupsAllowedToBeDown));
NodeState newState = new NodeState(STORAGE, INITIALIZING);
Result result = nodeStateChangeChecker.evaluateTransition(
nodeDistributor, defaultAllUpClusterState(), FORCE,
@@ -124,9 +120,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testDeniedInMoratorium() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDeniedInMoratorium(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
var nodeStateChangeChecker = new NodeStateChangeChecker(cluster, true);
Result result = nodeStateChangeChecker.evaluateTransition(
new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE,
@@ -136,9 +133,10 @@ public class NodeStateChangeCheckerTest {
assertEquals("Master cluster controller is bootstrapping and in moratorium", result.getReason());
}
- @Test
- void testUnknownStorageNode() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testUnknownStorageNode(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
var nodeStateChangeChecker = createChangeChecker(cluster);
Result result = nodeStateChangeChecker.evaluateTransition(
new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE,
@@ -148,11 +146,12 @@ public class NodeStateChangeCheckerTest {
assertEquals("Unknown node storage.10", result.getReason());
}
- @Test
- void testSafeMaintenanceDisallowedWhenOtherStorageNodeInFlatClusterIsSuspended() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSafeMaintenanceDisallowedWhenOtherStorageNodeInFlatClusterIsSuspended(int maxNumberOfGroupsAllowedToBeDown) {
// Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description.
- ContentCluster cluster = createCluster(4);
- cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, MAINTENANCE).setDescription("Orchestrator"));
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
+ setStorageNodeWantedStateToMaintenance(cluster, 0);
var nodeStateChangeChecker = createChangeChecker(cluster);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
"version:%d distributor:4 storage:4 .0.s:m",
@@ -168,9 +167,130 @@ public class NodeStateChangeCheckerTest {
}
@Test
- void testSafeMaintenanceDisallowedWhenOtherDistributorInFlatClusterIsSuspended() {
+ void testMaintenanceAllowedFor2Of4Groups() {
+ // 4 groups with 1 node in each group
+ Collection<ConfiguredNode> nodes = createNodes(4);
+ StorDistributionConfig config = createDistributionConfig(4, 4);
+
+ int maxNumberOfGroupsAllowedToBeDown = 2;
+ var cluster = new ContentCluster("Clustername", nodes, new Distribution(config), maxNumberOfGroupsAllowedToBeDown);
+ setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6)));
+ var nodeStateChangeChecker = createChangeChecker(cluster);
+
+ // All nodes up, set a storage node in group 0 to maintenance
+ {
+ int nodeIndex = 0;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, defaultAllUpClusterState());
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // Node in group 0 in maintenance, set storage node in group 1 to maintenance
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:4 .0.s:d storage:4 .0.s:m", currentClusterStateVersion));
+ int nodeIndex = 1;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState);
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // Nodes in group 0 and 1 in maintenance, try to set storage node in group 2 to maintenance while storage node 2 is down, should fail
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:4 storage:4 .0.s:m .1.s:m .2.s:d", currentClusterStateVersion));
+ int nodeIndex = 2;
+ cluster.clusterInfo().getStorageNodeInfo(nodeIndex).setReportedState(new NodeState(STORAGE, DOWN), 0);
+ Node node = new Node(STORAGE, nodeIndex);
+ Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
+ assertFalse(result.settingWantedStateIsAllowed(), result.toString());
+ assertFalse(result.wantedStateAlreadySet());
+ assertEquals("At most 2 groups can have wanted state: [0, 1, 2]", result.getReason());
+ }
+
+ // Nodes in group 0 and 1 in maintenance, try to set storage node in group 2 to maintenance, should fail
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:4 storage:4 .0.s:m .1.s:m", currentClusterStateVersion));
+ int nodeIndex = 2;
+ Node node = new Node(STORAGE, nodeIndex);
+ Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
+ assertFalse(result.settingWantedStateIsAllowed(), result.toString());
+ assertFalse(result.wantedStateAlreadySet());
+ assertEquals("At most 2 groups can have wanted state: [0, 1]", result.getReason());
+ }
+
+ }
+
+ @Test
+ void testMaintenanceAllowedFor2Of4Groups8Nodes() {
+ // 4 groups with 2 nodes in each group
+ Collection<ConfiguredNode> nodes = createNodes(8);
+ StorDistributionConfig config = createDistributionConfig(8, 4);
+
+ int maxNumberOfGroupsAllowedToBeDown = 2;
+ var cluster = new ContentCluster("Clustername", nodes, new Distribution(config), maxNumberOfGroupsAllowedToBeDown);
+ setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6)));
+ var nodeStateChangeChecker = createChangeChecker(cluster);
+
+ // All nodes up, set a storage node in group 0 to maintenance
+ {
+ ClusterState clusterState = defaultAllUpClusterState(8);
+ int nodeIndex = 0;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState);
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // 1 Node in group 0 in maintenance, try to set node 1 in group 0 to maintenance
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:8 .0.s:d storage:8 .0.s:m", currentClusterStateVersion));
+ int nodeIndex = 1;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState);
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // 2 nodes in group 0 in maintenance, try to set storage node 2 in group 1 to maintenance
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m", currentClusterStateVersion));
+ int nodeIndex = 2;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState);
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // 2 nodes in group 0 and 1 in group 1 in maintenance, try to set storage node 4 in group 2 to maintenance, should fail (different group)
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .2.s:m", currentClusterStateVersion));
+ int nodeIndex = 4;
+ Node node = new Node(STORAGE, nodeIndex);
+ Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
+ assertFalse(result.settingWantedStateIsAllowed(), result.toString());
+ assertFalse(result.wantedStateAlreadySet());
+ assertEquals("At most 2 groups can have wanted state: [0, 1]", result.getReason());
+ }
+
+ // 2 nodes in group 0 and 1 in group 1 in maintenance, try to set storage node 3 in group 1 to maintenance
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .2.s:m", currentClusterStateVersion));
+ int nodeIndex = 3;
+ checkSettingToMaintenanceIsAllowed(nodeIndex, nodeStateChangeChecker, clusterState);
+ setStorageNodeWantedStateToMaintenance(cluster, nodeIndex);
+ }
+
+ // 2 nodes in group 0 in maintenance, storage node 3 in group 1 is in maintenance with another description
+ // (set in maintenance by operator), try to set storage node 2 in group 1 to maintenance, should be allowed
+ {
+ ClusterState clusterState = clusterState(String.format("version:%d distributor:8 storage:8 .0.s:m .1.s:m .3.s:m", currentClusterStateVersion));
+ setStorageNodeWantedState(cluster, 3, MAINTENANCE, "Maintenance, set by operator"); // Set to another description
+ setStorageNodeWantedState(cluster, 2, UP, ""); // Set back to UP, want to set this to maintenance again
+ int nodeIndex = 2;
+ Node node = new Node(STORAGE, nodeIndex);
+ Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
+ assertTrue(result.settingWantedStateIsAllowed(), result.toString());
+ assertFalse(result.wantedStateAlreadySet());
+ }
+
+ }
+
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSafeMaintenanceDisallowedWhenOtherDistributorInFlatClusterIsSuspended(int maxNumberOfGroupsAllowedToBeDown) {
// Nodes 0-3, distributor 0 being down with "Orchestrator" description.
- ContentCluster cluster = createCluster(4);
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
setDistributorNodeWantedState(cluster, 0, DOWN, "Orchestrator");
var nodeStateChangeChecker = createChangeChecker(cluster);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
@@ -186,11 +306,12 @@ public class NodeStateChangeCheckerTest {
result.getReason());
}
- @Test
- void testSafeMaintenanceDisallowedWhenDistributorInGroupIsDown() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSafeMaintenanceDisallowedWhenDistributorInGroupIsDown(int maxNumberOfGroupsAllowedToBeDown) {
// Nodes 0-3, distributor 0 being in maintenance with "Orchestrator" description.
// 2 groups: nodes 0-1 is group 0, 2-3 is group 1.
- ContentCluster cluster = createCluster(4, 2);
+ ContentCluster cluster = createCluster(4, 2, maxNumberOfGroupsAllowedToBeDown);
setDistributorNodeWantedState(cluster, 0, DOWN, "Orchestrator");
var nodeStateChangeChecker = new NodeStateChangeChecker(cluster, false);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
@@ -204,7 +325,10 @@ public class NodeStateChangeCheckerTest {
SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
- assertEquals("At most one group can have wanted state: Other distributor 0 in group 0 has wanted state Down", result.getReason());
+ if (maxNumberOfGroupsAllowedToBeDown >= 1)
+ assertEquals("Wanted state already set for another node in groups: [0]", result.getReason());
+ else
+ assertEquals("At most one group can have wanted state: Other distributor 0 in group 0 has wanted state Down", result.getReason());
}
{
@@ -213,16 +337,22 @@ public class NodeStateChangeCheckerTest {
Result result = nodeStateChangeChecker.evaluateTransition(
new Node(STORAGE, 1), clusterStateWith0InMaintenance,
SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
- assertFalse(result.settingWantedStateIsAllowed(), result.getReason());
- assertEquals("Another distributor wants state DOWN: 0", result.getReason());
+ if (maxNumberOfGroupsAllowedToBeDown >= 1) {
+ assertFalse(result.settingWantedStateIsAllowed(), result.getReason());
+ assertEquals("Wanted state already set for another node in groups: [0]", result.getReason());
+ } else {
+ assertFalse(result.settingWantedStateIsAllowed(), result.getReason());
+ assertEquals("Another distributor wants state DOWN: 0", result.getReason());
+ }
}
}
- @Test
- void testSafeMaintenanceWhenOtherStorageNodeInGroupIsSuspended() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSafeMaintenanceWhenOtherStorageNodeInGroupIsSuspended(int maxNumberOfGroupsAllowedToBeDown) {
// Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description.
// 2 groups: nodes 0-1 is group 0, 2-3 is group 1.
- ContentCluster cluster = createCluster(4, 2);
+ ContentCluster cluster = createCluster(4, 2, maxNumberOfGroupsAllowedToBeDown);
setStorageNodeWantedState(cluster, 0, MAINTENANCE, "Orchestrator");
var nodeStateChangeChecker = new NodeStateChangeChecker(cluster, false);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
@@ -236,8 +366,11 @@ public class NodeStateChangeCheckerTest {
SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
- assertEquals("At most one group can have wanted state: Other storage node 0 in group 0 has wanted state Maintenance",
- result.getReason());
+ if (maxNumberOfGroupsAllowedToBeDown >= 1)
+ assertEquals("At most 1 groups can have wanted state: [0]", result.getReason());
+ else
+ assertEquals("At most one group can have wanted state: Other storage node 0 in group 0 has wanted state Maintenance",
+ result.getReason());
}
{
@@ -251,9 +384,10 @@ public class NodeStateChangeCheckerTest {
}
}
- @Test
- void testSafeSetStateDistributors() {
- NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(1));
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSafeSetStateDistributors(int maxNumberOfGroupsAllowedToBeDown) {
+ NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(createCluster(1, 1, maxNumberOfGroupsAllowedToBeDown));
Result result = nodeStateChangeChecker.evaluateTransition(
nodeDistributor, defaultAllUpClusterState(), SAFE,
UP_NODE_STATE, MAINTENANCE_NODE_STATE);
@@ -262,10 +396,11 @@ public class NodeStateChangeCheckerTest {
assertTrue(result.getReason().contains("Safe-set of node state is only supported for storage nodes"));
}
- @Test
- void testCanUpgradeSafeMissingStorage() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeSafeMissingStorage(int maxNumberOfGroupsAllowedToBeDown) {
// Create a content cluster with 4 nodes, and storage node with index 3 down.
- ContentCluster cluster = createCluster(4);
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6)));
cluster.clusterInfo().getStorageNodeInfo(3).setReportedState(new NodeState(STORAGE, DOWN), 0);
ClusterState clusterStateWith3Down = clusterState(String.format(
@@ -282,16 +417,18 @@ public class NodeStateChangeCheckerTest {
assertEquals("Another storage node has state DOWN: 3", result.getReason());
}
- @Test
- void testCanUpgradeStorageSafeYes() {
- Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4), defaultAllUpClusterState());
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeStorageSafeYes(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4, 1, maxNumberOfGroupsAllowedToBeDown), defaultAllUpClusterState());
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testSetUpFailsIfReportedIsDown() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSetUpFailsIfReportedIsDown(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
// Not setting nodes up -> all are down
@@ -304,9 +441,10 @@ public class NodeStateChangeCheckerTest {
// A node may be reported as Up but have a generated state of Down if it's part of
// nodes taken down implicitly due to a group having too low node availability.
- @Test
- void testSetUpSucceedsIfReportedIsUpButGeneratedIsDown() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSetUpSucceedsIfReportedIsUpButGeneratedIsDown(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
markAllNodesAsReportingStateUp(cluster);
@@ -322,9 +460,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCanSetUpEvenIfOldWantedStateIsDown() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanSetUpEvenIfOldWantedStateIsDown(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6)));
@@ -335,9 +474,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCanUpgradeStorageSafeNo() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeStorageSafeNo(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6)));
@@ -350,9 +490,10 @@ public class NodeStateChangeCheckerTest {
result.getReason());
}
- @Test
- void testCanUpgradeIfMissingMinReplicationFactor() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeIfMissingMinReplicationFactor(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
setAllNodesUp(cluster, HostInfo.createHostInfo(createDistributorHostInfo(4, 3, 6)));
@@ -363,9 +504,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCanUpgradeIfStorageNodeMissingFromNodeInfo() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeIfStorageNodeMissingFromNodeInfo(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
String hostInfo = "{\n" +
" \"cluster-state-version\": 2,\n" +
@@ -387,9 +529,10 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testMissingDistributorState() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testMissingDistributorState(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
cluster.clusterInfo().getStorageNodeInfo(1).setReportedState(new NodeState(STORAGE, UP), 0);
@@ -400,8 +543,8 @@ public class NodeStateChangeCheckerTest {
assertEquals("Distributor node 0 has not reported any cluster state version yet.", result.getReason());
}
- private Result transitionToSameState(State state, String oldDescription, String newDescription) {
- ContentCluster cluster = createCluster(4);
+ private Result transitionToSameState(State state, String oldDescription, String newDescription, int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
NodeState currentNodeState = createNodeState(state, oldDescription);
@@ -411,26 +554,29 @@ public class NodeStateChangeCheckerTest {
currentNodeState, newNodeState);
}
- private Result transitionToSameState(String oldDescription, String newDescription) {
- return transitionToSameState(MAINTENANCE, oldDescription, newDescription);
+ private Result transitionToSameState(String oldDescription, String newDescription, int maxNumberOfGroupsAllowedToBeDown) {
+ return transitionToSameState(MAINTENANCE, oldDescription, newDescription, maxNumberOfGroupsAllowedToBeDown);
}
- @Test
- void testSettingUpWhenUpCausesAlreadySet() {
- Result result = transitionToSameState(UP, "foo", "bar");
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSettingUpWhenUpCausesAlreadySet(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToSameState(UP, "foo", "bar", maxNumberOfGroupsAllowedToBeDown);
assertTrue(result.wantedStateAlreadySet());
}
- @Test
- void testSettingAlreadySetState() {
- Result result = transitionToSameState("foo", "foo");
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testSettingAlreadySetState(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToSameState("foo", "foo", maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertTrue(result.wantedStateAlreadySet());
}
- @Test
- void testDifferentDescriptionImpliesDenied() {
- Result result = transitionToSameState("foo", "bar");
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDifferentDescriptionImpliesDenied(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToSameState("foo", "bar", maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
@@ -439,10 +585,9 @@ public class NodeStateChangeCheckerTest {
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
for (int x = 0; x < cluster.clusterInfo().getConfiguredNodes().size(); x++) {
- State state = UP;
- cluster.clusterInfo().getDistributorNodeInfo(x).setReportedState(new NodeState(DISTRIBUTOR, state), 0);
+ cluster.clusterInfo().getDistributorNodeInfo(x).setReportedState(new NodeState(DISTRIBUTOR, UP), 0);
cluster.clusterInfo().getDistributorNodeInfo(x).setHostInfo(HostInfo.createHostInfo(createDistributorHostInfo(4, 5, 6)));
- cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(STORAGE, state), 0);
+ cluster.clusterInfo().getStorageNodeInfo(x).setReportedState(new NodeState(STORAGE, UP), 0);
}
return nodeStateChangeChecker.evaluateTransition(
@@ -462,26 +607,29 @@ public class NodeStateChangeCheckerTest {
return transitionToMaintenanceWithOneStorageNodeDown(cluster, clusterState);
}
- @Test
- void testCanUpgradeWhenAllUp() {
- Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4), defaultAllUpClusterState());
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeWhenAllUp(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4, maxNumberOfGroupsAllowedToBeDown), defaultAllUpClusterState());
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCanUpgradeWhenAllUpOrRetired() {
- Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4), defaultAllUpClusterState());
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeWhenAllUpOrRetired(int maxNumberOfGroupsAllowedToBeDown) {
+ Result result = transitionToMaintenanceWithNoStorageNodesDown(createCluster(4, maxNumberOfGroupsAllowedToBeDown), defaultAllUpClusterState());
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCanUpgradeWhenStorageIsDown() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCanUpgradeWhenStorageIsDown(int maxNumberOfGroupsAllowedToBeDown) {
ClusterState clusterState = defaultAllUpClusterState();
var storageNodeIndex = nodeStorage.getIndex();
- ContentCluster cluster = createCluster(4);
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeState downNodeState = new NodeState(STORAGE, DOWN);
cluster.clusterInfo().getStorageNodeInfo(storageNodeIndex).setReportedState(downNodeState, 4 /* time */);
clusterState.setNodeState(new Node(STORAGE, storageNodeIndex), downNodeState);
@@ -491,13 +639,14 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testCannotUpgradeWhenOtherStorageIsDown() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testCannotUpgradeWhenOtherStorageIsDown(int maxNumberOfGroupsAllowedToBeDown) {
int otherIndex = 2;
// If this fails, just set otherIndex to some other valid index.
assertNotEquals(nodeStorage.getIndex(), otherIndex);
- ContentCluster cluster = createCluster(4);
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
ClusterState clusterState = defaultAllUpClusterState();
NodeState downNodeState = new NodeState(STORAGE, DOWN);
cluster.clusterInfo().getStorageNodeInfo(otherIndex).setReportedState(downNodeState, 4 /* time */);
@@ -509,9 +658,10 @@ public class NodeStateChangeCheckerTest {
assertTrue(result.getReason().contains("Another storage node has state DOWN: 2"));
}
- @Test
- void testNodeRatioRequirementConsidersGeneratedNodeStates() {
- ContentCluster cluster = createCluster(4);
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testNodeRatioRequirementConsidersGeneratedNodeStates(int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
markAllNodesAsReportingStateUp(cluster);
@@ -531,62 +681,72 @@ public class NodeStateChangeCheckerTest {
assertFalse(result.wantedStateAlreadySet());
}
- @Test
- void testDownDisallowedByNonRetiredState() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDownDisallowedByNonRetiredState(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
defaultAllUpClusterState(),
UP,
currentClusterStateVersion,
- 0);
+ 0,
+ maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
assertEquals("Only retired nodes are allowed to be set to DOWN in safe mode - is Up", result.getReason());
}
- @Test
- void testDownDisallowedByBuckets() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDownDisallowedByBuckets(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion,
- 1);
+ 1,
+ maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
assertEquals("The storage node manages 1 buckets", result.getReason());
}
- @Test
- void testDownDisallowedByReportedState() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDownDisallowedByReportedState(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
INITIALIZING,
currentClusterStateVersion,
- 0);
+ 0,
+ maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
assertEquals("Reported state (Initializing) is not UP, so no bucket data is available", result.getReason());
}
- @Test
- void testDownDisallowedByVersionMismatch() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testDownDisallowedByVersionMismatch(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion - 1,
- 0);
+ 0,
+ maxNumberOfGroupsAllowedToBeDown);
assertFalse(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
assertEquals("Cluster controller at version 2 got info for storage node 1 at a different version 1",
result.getReason());
}
- @Test
- void testAllowedToSetDown() {
+ @ParameterizedTest
+ @ValueSource(ints = {-1, 1})
+ void testAllowedToSetDown(int maxNumberOfGroupsAllowedToBeDown) {
Result result = evaluateDownTransition(
retiredClusterStateSuffix(),
UP,
currentClusterStateVersion,
- 0);
+ 0,
+ maxNumberOfGroupsAllowedToBeDown);
assertTrue(result.settingWantedStateIsAllowed());
assertFalse(result.wantedStateAlreadySet());
}
@@ -594,8 +754,9 @@ public class NodeStateChangeCheckerTest {
private Result evaluateDownTransition(ClusterState clusterState,
State reportedState,
int hostInfoClusterStateVersion,
- int lastAlldisksBuckets) {
- ContentCluster cluster = createCluster(4);
+ int lastAlldisksBuckets,
+ int maxNumberOfGroupsAllowedToBeDown) {
+ ContentCluster cluster = createCluster(4, maxNumberOfGroupsAllowedToBeDown);
NodeStateChangeChecker nodeStateChangeChecker = createChangeChecker(cluster);
StorageNodeInfo nodeInfo = cluster.clusterInfo().getStorageNodeInfo(nodeStorage.getIndex());
@@ -763,6 +924,18 @@ public class NodeStateChangeCheckerTest {
return configBuilder.build();
}
+ private void checkSettingToMaintenanceIsAllowed(int nodeIndex, NodeStateChangeChecker nodeStateChangeChecker, ClusterState clusterState) {
+ Node node = new Node(STORAGE, nodeIndex);
+ Result result = nodeStateChangeChecker.evaluateTransition(node, clusterState, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE);
+ assertTrue(result.settingWantedStateIsAllowed(), result.toString());
+ assertFalse(result.wantedStateAlreadySet());
+ assertEquals("Preconditions fulfilled and new state different", result.getReason());
+ }
+
+ private void setStorageNodeWantedStateToMaintenance(ContentCluster cluster, int nodeIndex) {
+ setStorageNodeWantedState(cluster, nodeIndex, MAINTENANCE, "Orchestrator");
+ }
+
private void setStorageNodeWantedState(ContentCluster cluster, int nodeIndex, State state, String description) {
NodeState nodeState = new NodeState(STORAGE, state);
cluster.clusterInfo().getStorageNodeInfo(nodeIndex).setWantedState(nodeState.setDescription(description));
diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def
index 98b4c3b0216..93a20e4ee0d 100644
--- a/configdefinitions/src/vespa/fleetcontroller.def
+++ b/configdefinitions/src/vespa/fleetcontroller.def
@@ -199,6 +199,7 @@ cluster_feed_block_limit{} double
# This is in absolute numbers, so 0.01 implies that a block limit of 0.8 effectively
# becomes 0.79 for an already blocked node.
cluster_feed_block_noise_level double default=0.0
-# For apps that have several groups this controls how many are allowed to be down
-# simultaneously.
-max_number_of_groups_allowed_to_be_down int default=1
+
+# For apps that have several groups this controls how many groups are allowed to
+# be down simultaneously in this cluster.
+max_number_of_groups_allowed_to_be_down int default=-1
diff --git a/parent/pom.xml b/parent/pom.xml
index 8d2f802e34b..ffd8c596277 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -1009,6 +1009,11 @@
<version>${junit.version}</version>
</dependency>
<dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-params</artifactId>
+ <version>${junit.version}</version>
+ </dependency>
+ <dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<version>${junit.version}</version>