summary | refs | log | tree | commit | diff | stats
path: root/clustercontroller-core
diff options
context:
space:
mode:
author: Harald Musum <musum@yahooinc.com> 2023-03-28 10:43:05 +0200
committer: Harald Musum <musum@yahooinc.com> 2023-03-28 10:43:05 +0200
commit: 656524514504c04f669ff61e0708be809d11473c (patch)
tree: 87316171af4dee3ef2772d6782cf871d8f16889f /clustercontroller-core
parent: 642acda2e5a28ba06c094e503c662b514efadabe (diff)
Add config for max number of content groups allowed to be down
Diffstat (limited to 'clustercontroller-core')
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java | 22
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java | 2
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java | 18
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java | 3
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java | 7
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java | 2
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java | 21
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java | 3
8 files changed, 54 insertions, 24 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
index 5d9ec6f5f80..9347fadc0e0 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java
@@ -15,7 +15,6 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
-import java.util.stream.Collectors;
import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION;
@@ -33,11 +32,25 @@ public class ContentCluster {
private Distribution distribution;
+ private final int maxNumberOfGroupsAllowedToBeDown;
+
public ContentCluster(String clusterName, Collection<ConfiguredNode> configuredNodes, Distribution distribution) {
+ this(clusterName, configuredNodes, distribution, 1);
+ }
+
+ public ContentCluster(FleetControllerOptions options) {
+ this(options.clusterName(), options.nodes(), options.storageDistribution(), options.maxNumberOfGroupsAllowedToBeDown());
+ }
+
+ private ContentCluster(String clusterName,
+ Collection<ConfiguredNode> configuredNodes,
+ Distribution distribution,
+ int maxNumberOfGroupsAllowedToBeDown) {
if (configuredNodes == null) throw new IllegalArgumentException("Nodes must be set");
this.clusterName = clusterName;
this.distribution = distribution;
setNodes(configuredNodes, new NodeListener() {});
+ this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown;
}
public Distribution getDistribution() { return distribution; }
@@ -95,6 +108,8 @@ public class ContentCluster {
public NodeInfo getNodeInfo(Node node) { return clusterInfo.getNodeInfo(node); }
+ public int maxNumberOfGroupsAllowedToBeDown() { return maxNumberOfGroupsAllowedToBeDown; }
+
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("ContentCluster(").append(clusterName).append(") {");
@@ -121,13 +136,14 @@ public class ContentCluster {
*/
public NodeStateChangeChecker.Result calculateEffectOfNewState(
Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition,
- NodeState oldState, NodeState newState, boolean inMoratorium) {
+ NodeState oldState, NodeState newState, boolean inMoratorium, int maxNumberOfGroupsAllowedToBeDown) {
NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker(
distribution.getRedundancy(),
new HierarchicalGroupVisitingAdapter(distribution),
clusterInfo,
- inMoratorium
+ inMoratorium,
+ maxNumberOfGroupsAllowedToBeDown
);
return nodeStateChangeChecker.evaluateTransition(node, clusterState, condition, oldState, newState);
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 4dc1fe81fd2..ff0d8b87e92 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -153,7 +153,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta
var timer = new RealTimer();
var metricUpdater = new MetricUpdater(metricReporter, options.fleetControllerIndex(), options.clusterName());
var log = new EventLog(timer, metricUpdater);
- var cluster = new ContentCluster(options.clusterName(), options.nodes(), options.storageDistribution());
+ var cluster = new ContentCluster(options);
var stateGatherer = new NodeStateGatherer(timer, timer, log);
var communicator = new RPCCommunicator(
RPCCommunicator.createRealSupervisor(),
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index 2259b7b91b1..21ccfb2750e 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -127,6 +127,8 @@ public class FleetControllerOptions {
private final double clusterFeedBlockNoiseLevel;
+ private final int maxNumberOfGroupsAllowedToBeDown;
+
private FleetControllerOptions(String clusterName,
int fleetControllerIndex,
int fleetControllerCount,
@@ -167,7 +169,8 @@ public class FleetControllerOptions {
int maxDivergentNodesPrintedInTaskErrorMessages,
boolean clusterFeedBlockEnabled,
Map<String, Double> clusterFeedBlockLimit,
- double clusterFeedBlockNoiseLevel) {
+ double clusterFeedBlockNoiseLevel,
+ int maxNumberOfGroupsAllowedToBeDown) {
this.clusterName = clusterName;
this.fleetControllerIndex = fleetControllerIndex;
this.fleetControllerCount = fleetControllerCount;
@@ -209,6 +212,7 @@ public class FleetControllerOptions {
this.clusterFeedBlockEnabled = clusterFeedBlockEnabled;
this.clusterFeedBlockLimit = clusterFeedBlockLimit;
this.clusterFeedBlockNoiseLevel = clusterFeedBlockNoiseLevel;
+ this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown;
}
public Duration getMaxDeferredTaskVersionWaitTime() {
@@ -383,6 +387,8 @@ public class FleetControllerOptions {
return clusterFeedBlockNoiseLevel;
}
+ public int maxNumberOfGroupsAllowedToBeDown() { return maxNumberOfGroupsAllowedToBeDown; }
+
public static class Builder {
private String clusterName;
@@ -426,6 +432,7 @@ public class FleetControllerOptions {
private boolean clusterFeedBlockEnabled = false;
private Map<String, Double> clusterFeedBlockLimit = Collections.emptyMap();
private double clusterFeedBlockNoiseLevel = 0.01;
+ private int maxNumberOfGroupsAllowedToBeDown = 1;
public Builder(String clusterName, Collection<ConfiguredNode> nodes) {
this.clusterName = clusterName;
@@ -686,6 +693,11 @@ public class FleetControllerOptions {
return this;
}
+ public Builder setMaxNumberOfGroupsAllowedToBeDown(int maxNumberOfGroupsAllowedToBeDown) {
+ this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown;
+ return this;
+ }
+
public FleetControllerOptions build() {
return new FleetControllerOptions(clusterName,
index,
@@ -727,7 +739,8 @@ public class FleetControllerOptions {
maxDivergentNodesPrintedInTaskErrorMessages,
clusterFeedBlockEnabled,
clusterFeedBlockLimit,
- clusterFeedBlockNoiseLevel);
+ clusterFeedBlockNoiseLevel,
+ maxNumberOfGroupsAllowedToBeDown);
}
public static Builder copy(FleetControllerOptions options) {
@@ -773,6 +786,7 @@ public class FleetControllerOptions {
builder.clusterFeedBlockEnabled = options.clusterFeedBlockEnabled;
builder.clusterFeedBlockLimit = Map.copyOf(options.clusterFeedBlockLimit);
builder.clusterFeedBlockNoiseLevel = options.clusterFeedBlockNoiseLevel;
+ builder.maxNumberOfGroupsAllowedToBeDown = options.maxNumberOfGroupsAllowedToBeDown;
return builder;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
index 9d8141020c3..7a77bb2b571 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java
@@ -47,7 +47,8 @@ public class NodeStateChangeChecker {
int requiredRedundancy,
HierarchicalGroupVisiting groupVisiting,
ClusterInfo clusterInfo,
- boolean inMoratorium) {
+ boolean inMoratorium,
+ int maxNumberOfGroupsAllowedToBeDown) {
this.requiredRedundancy = requiredRedundancy;
this.groupVisiting = groupVisiting;
this.clusterInfo = clusterInfo;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
index ddbda2bf776..a2e77b4e3dd 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
@@ -20,7 +20,6 @@ import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.StateRestApiE
import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.response.SetResponse;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.response.UnitState;
-
import java.time.Instant;
import java.util.Map;
import java.util.Objects;
@@ -28,6 +27,8 @@ import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
+import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result;
+
public class SetNodeStateRequest extends Request<SetResponse> {
private static final Logger log = Logger.getLogger(SetNodeStateRequest.class.getName());
@@ -125,8 +126,8 @@ public class SetNodeStateRequest extends Request<SetResponse> {
NodeState wantedState = nodeInfo.getUserWantedState();
NodeState newWantedState = getRequestedNodeState(newStates, node);
- NodeStateChangeChecker.Result result = cluster.calculateEffectOfNewState(
- node, currentClusterState, condition, wantedState, newWantedState, inMasterMoratorium);
+ Result result = cluster.calculateEffectOfNewState(node, currentClusterState, condition, wantedState, newWantedState,
+ inMasterMoratorium, cluster.maxNumberOfGroupsAllowedToBeDown());
log.log(Level.FINE, () -> "node=" + node +
" current-cluster-state=" + currentClusterState + // Includes version in output format
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
index 413c8e7414c..f167fbc7231 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
@@ -44,7 +44,7 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
communicator = new DummyCommunicator(nodes, timer);
var metricUpdater = new MetricUpdater(new NoMetricReporter(), options.fleetControllerIndex(), options.clusterName());
var eventLog = new EventLog(timer, metricUpdater);
- var cluster = new ContentCluster(options.clusterName(), options.nodes(), options.storageDistribution());
+ var cluster = new ContentCluster(options);
var stateGatherer = new NodeStateGatherer(timer, timer, eventLog);
var database = new DatabaseHandler(context, new ZooKeeperDatabaseFactory(context), timer, options.zooKeeperServerAddress(), timer);
var stateGenerator = new StateChangeHandler(context, timer, eventLog);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
index 4718453ee51..60d4866a33e 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java
@@ -67,7 +67,8 @@ public class NodeStateChangeCheckerTest {
}
private NodeStateChangeChecker createChangeChecker(ContentCluster cluster) {
- return new NodeStateChangeChecker(requiredRedundancy, noopVisiting, cluster.clusterInfo(), false);
+ return new NodeStateChangeChecker(requiredRedundancy, noopVisiting, cluster.clusterInfo(),
+ false, cluster.maxNumberOfGroupsAllowedToBeDown());
}
private ContentCluster createCluster(int nodeCount) {
@@ -128,7 +129,7 @@ public class NodeStateChangeCheckerTest {
void testDeniedInMoratorium() {
ContentCluster cluster = createCluster(4);
var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, noopVisiting, cluster.clusterInfo(), true);
+ requiredRedundancy, noopVisiting, cluster.clusterInfo(), true, cluster.maxNumberOfGroupsAllowedToBeDown());
Result result = nodeStateChangeChecker.evaluateTransition(
new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE,
UP_NODE_STATE, MAINTENANCE_NODE_STATE);
@@ -140,8 +141,7 @@ public class NodeStateChangeCheckerTest {
@Test
void testUnknownStorageNode() {
ContentCluster cluster = createCluster(4);
- var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, noopVisiting, cluster.clusterInfo(), false);
+ var nodeStateChangeChecker = createChangeChecker(cluster);
Result result = nodeStateChangeChecker.evaluateTransition(
new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE,
UP_NODE_STATE, MAINTENANCE_NODE_STATE);
@@ -155,8 +155,7 @@ public class NodeStateChangeCheckerTest {
// Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description.
ContentCluster cluster = createCluster(4);
cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator"));
- var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, noopVisiting, cluster.clusterInfo(), false);
+ var nodeStateChangeChecker = createChangeChecker(cluster);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
"version:%d distributor:4 storage:4 .0.s:m",
currentClusterStateVersion));
@@ -176,8 +175,7 @@ public class NodeStateChangeCheckerTest {
ContentCluster cluster = createCluster(4);
cluster.clusterInfo().getDistributorNodeInfo(0)
.setWantedState(new NodeState(DISTRIBUTOR, DOWN).setDescription("Orchestrator"));
- var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, noopVisiting, cluster.clusterInfo(), false);
+ var nodeStateChangeChecker = createChangeChecker(cluster);
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
"version:%d distributor:4 .0.s:d storage:4",
currentClusterStateVersion));
@@ -200,7 +198,7 @@ public class NodeStateChangeCheckerTest {
.setWantedState(new NodeState(STORAGE, DOWN).setDescription("Orchestrator"));
HierarchicalGroupVisiting visiting = makeHierarchicalGroupVisitingWith2Groups(4);
var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, visiting, cluster.clusterInfo(), false);
+ requiredRedundancy, visiting, cluster.clusterInfo(), false, cluster.maxNumberOfGroupsAllowedToBeDown());
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
"version:%d distributor:4 .0.s:d storage:4",
currentClusterStateVersion));
@@ -234,7 +232,7 @@ public class NodeStateChangeCheckerTest {
cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator"));
HierarchicalGroupVisiting visiting = makeHierarchicalGroupVisitingWith2Groups(4);
var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, visiting, cluster.clusterInfo(), false);
+ requiredRedundancy, visiting, cluster.clusterInfo(), false, cluster.maxNumberOfGroupsAllowedToBeDown());
ClusterState clusterStateWith0InMaintenance = clusterState(String.format(
"version:%d distributor:4 storage:4 .0.s:m",
currentClusterStateVersion));
@@ -323,8 +321,7 @@ public class NodeStateChangeCheckerTest {
currentClusterStateVersion));
// We should then be denied setting storage node 1 safely to maintenance.
- var nodeStateChangeChecker = new NodeStateChangeChecker(
- requiredRedundancy, noopVisiting, cluster.clusterInfo(), false);
+ var nodeStateChangeChecker = createChangeChecker(cluster);
Result result = nodeStateChangeChecker.evaluateTransition(
nodeStorage, clusterStateWith3Down, SAFE,
UP_NODE_STATE, MAINTENANCE_NODE_STATE);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java
index 1bd17b11755..b208ff7fb27 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java
@@ -24,6 +24,7 @@ import java.util.Optional;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyBoolean;
+import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
@@ -129,7 +130,7 @@ public class SetNodeStateRequestTest {
when(unitState.getId()).thenReturn(wantedStateString);
when(unitState.getReason()).thenReturn(REASON);
- when(cluster.calculateEffectOfNewState(any(), any(), any(), any(), any(), anyBoolean())).thenReturn(result);
+ when(cluster.calculateEffectOfNewState(any(), any(), any(), any(), any(), anyBoolean(), anyInt())).thenReturn(result);
when(storageNodeInfo.isStorage()).thenReturn(storageNode.getType() == NodeType.STORAGE);
when(storageNodeInfo.getNodeIndex()).thenReturn(storageNode.getIndex());