diff options
10 files changed, 58 insertions, 24 deletions
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java index e1d73436936..786928391a5 100644 --- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java +++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java @@ -98,6 +98,7 @@ public class ClusterControllerClusterConfigurer extends AbstractComponent { builder.setClusterFeedBlockEnabled(config.enable_cluster_feed_block()); builder.setClusterFeedBlockLimit(Map.copyOf(config.cluster_feed_block_limit())); builder.setClusterFeedBlockNoiseLevel(config.cluster_feed_block_noise_level()); + builder.setMaxNumberOfGroupsAllowedToBeDown(config.max_number_of_groups_allowed_to_be_down()); } private static void configure(FleetControllerOptions.Builder builder, SlobroksConfig config) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java index 5d9ec6f5f80..9347fadc0e0 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ContentCluster.java @@ -15,7 +15,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; import java.util.TreeMap; -import java.util.stream.Collectors; import static com.yahoo.vdslib.state.NodeState.ORCHESTRATOR_RESERVED_DESCRIPTION; @@ -33,11 +32,25 @@ public class ContentCluster { private Distribution distribution; + private final int maxNumberOfGroupsAllowedToBeDown; + public ContentCluster(String clusterName, Collection<ConfiguredNode> configuredNodes, Distribution distribution) { + this(clusterName, configuredNodes, distribution, 1); + } + + public ContentCluster(FleetControllerOptions options) { + this(options.clusterName(), options.nodes(), options.storageDistribution(), options.maxNumberOfGroupsAllowedToBeDown()); + } + + private ContentCluster(String clusterName, + Collection<ConfiguredNode> configuredNodes, + Distribution distribution, + int maxNumberOfGroupsAllowedToBeDown) { if (configuredNodes == null) throw new IllegalArgumentException("Nodes must be set"); this.clusterName = clusterName; this.distribution = distribution; setNodes(configuredNodes, new NodeListener() {}); + this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown; } public Distribution getDistribution() { return distribution; } @@ -95,6 +108,8 @@ public class ContentCluster { public NodeInfo getNodeInfo(Node node) { return clusterInfo.getNodeInfo(node); } + public int maxNumberOfGroupsAllowedToBeDown() { return maxNumberOfGroupsAllowedToBeDown; } + public String toString() { StringBuilder sb = new StringBuilder(); sb.append("ContentCluster(").append(clusterName).append(") {"); @@ -121,13 +136,14 @@ public class ContentCluster { */ public NodeStateChangeChecker.Result calculateEffectOfNewState( Node node, ClusterState clusterState, SetUnitStateRequest.Condition condition, - NodeState oldState, NodeState newState, boolean inMoratorium) { + NodeState oldState, NodeState newState, boolean inMoratorium, int maxNumberOfGroupsAllowedToBeDown) { NodeStateChangeChecker nodeStateChangeChecker = new NodeStateChangeChecker( distribution.getRedundancy(), new HierarchicalGroupVisitingAdapter(distribution), clusterInfo, - inMoratorium + inMoratorium, + maxNumberOfGroupsAllowedToBeDown ); return nodeStateChangeChecker.evaluateTransition(node, clusterState, condition, oldState, newState); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 4dc1fe81fd2..ff0d8b87e92 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -153,7 +153,7 @@ public class FleetController implements NodeListener, SlobrokListener, SystemSta var timer = new RealTimer(); var metricUpdater = new MetricUpdater(metricReporter, options.fleetControllerIndex(), options.clusterName()); var log = new EventLog(timer, metricUpdater); - var cluster = new ContentCluster(options.clusterName(), options.nodes(), options.storageDistribution()); + var cluster = new ContentCluster(options); var stateGatherer = new NodeStateGatherer(timer, timer, log); var communicator = new RPCCommunicator( RPCCommunicator.createRealSupervisor(), diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index 2259b7b91b1..21ccfb2750e 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -127,6 +127,8 @@ public class FleetControllerOptions { private final double clusterFeedBlockNoiseLevel; + private final int maxNumberOfGroupsAllowedToBeDown; + private FleetControllerOptions(String clusterName, int fleetControllerIndex, int fleetControllerCount, @@ -167,7 +169,8 @@ public class FleetControllerOptions { int maxDivergentNodesPrintedInTaskErrorMessages, boolean clusterFeedBlockEnabled, Map<String, Double> clusterFeedBlockLimit, - double clusterFeedBlockNoiseLevel) { + double clusterFeedBlockNoiseLevel, + int maxNumberOfGroupsAllowedToBeDown) { this.clusterName = clusterName; this.fleetControllerIndex = fleetControllerIndex; this.fleetControllerCount = fleetControllerCount; @@ -209,6 +212,7 @@ public class FleetControllerOptions { this.clusterFeedBlockEnabled = clusterFeedBlockEnabled; this.clusterFeedBlockLimit = clusterFeedBlockLimit; this.clusterFeedBlockNoiseLevel = clusterFeedBlockNoiseLevel; + this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown; } public Duration getMaxDeferredTaskVersionWaitTime() { @@ -383,6 +387,8 @@ public class FleetControllerOptions { return clusterFeedBlockNoiseLevel; } + public int maxNumberOfGroupsAllowedToBeDown() { return maxNumberOfGroupsAllowedToBeDown; } + public static class Builder { private String clusterName; @@ -426,6 +432,7 @@ public class FleetControllerOptions { private boolean clusterFeedBlockEnabled = false; private Map<String, Double> clusterFeedBlockLimit = Collections.emptyMap(); private double clusterFeedBlockNoiseLevel = 0.01; + private int maxNumberOfGroupsAllowedToBeDown = 1; public Builder(String clusterName, Collection<ConfiguredNode> nodes) { this.clusterName = clusterName; @@ -686,6 +693,11 @@ public class FleetControllerOptions { return this; } + public Builder setMaxNumberOfGroupsAllowedToBeDown(int maxNumberOfGroupsAllowedToBeDown) { + this.maxNumberOfGroupsAllowedToBeDown = maxNumberOfGroupsAllowedToBeDown; + return this; + } + public FleetControllerOptions build() { return new FleetControllerOptions(clusterName, index, @@ -727,7 +739,8 @@ public class FleetControllerOptions { maxDivergentNodesPrintedInTaskErrorMessages, clusterFeedBlockEnabled, clusterFeedBlockLimit, - clusterFeedBlockNoiseLevel); + clusterFeedBlockNoiseLevel, + maxNumberOfGroupsAllowedToBeDown); } public static Builder copy(FleetControllerOptions options) { @@ -773,6 +786,7 @@ public class FleetControllerOptions { builder.clusterFeedBlockEnabled = options.clusterFeedBlockEnabled; builder.clusterFeedBlockLimit = Map.copyOf(options.clusterFeedBlockLimit); builder.clusterFeedBlockNoiseLevel = options.clusterFeedBlockNoiseLevel; + builder.maxNumberOfGroupsAllowedToBeDown = options.maxNumberOfGroupsAllowedToBeDown; return builder; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java index 9d8141020c3..7a77bb2b571 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeChecker.java @@ -47,7 +47,8 @@ public class NodeStateChangeChecker { int requiredRedundancy, HierarchicalGroupVisiting groupVisiting, ClusterInfo clusterInfo, - boolean inMoratorium) { + boolean inMoratorium, + int maxNumberOfGroupsAllowedToBeDown) { this.requiredRedundancy = requiredRedundancy; this.groupVisiting = groupVisiting; this.clusterInfo = clusterInfo; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index ddbda2bf776..a2e77b4e3dd 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -20,7 +20,6 @@ import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.StateRestApiE import com.yahoo.vespa.clustercontroller.utils.staterestapi.requests.SetUnitStateRequest; import com.yahoo.vespa.clustercontroller.utils.staterestapi.response.SetResponse; import com.yahoo.vespa.clustercontroller.utils.staterestapi.response.UnitState; - import java.time.Instant; import java.util.Map; import java.util.Objects; @@ -28,6 +27,8 @@ import java.util.Optional; import java.util.logging.Level; import java.util.logging.Logger; +import static com.yahoo.vespa.clustercontroller.core.NodeStateChangeChecker.Result; + public class SetNodeStateRequest extends Request<SetResponse> { private static final Logger log = Logger.getLogger(SetNodeStateRequest.class.getName()); @@ -125,8 +126,8 @@ public class SetNodeStateRequest extends Request<SetResponse> { NodeState wantedState = nodeInfo.getUserWantedState(); NodeState newWantedState = getRequestedNodeState(newStates, node); - NodeStateChangeChecker.Result result = cluster.calculateEffectOfNewState( - node, currentClusterState, condition, wantedState, newWantedState, inMasterMoratorium); + Result result = cluster.calculateEffectOfNewState(node, currentClusterState, condition, wantedState, newWantedState, + inMasterMoratorium, cluster.maxNumberOfGroupsAllowedToBeDown()); log.log(Level.FINE, () -> "node=" + node + " current-cluster-state=" + currentClusterState + // Includes version in output format diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java index 413c8e7414c..f167fbc7231 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java @@ -44,7 +44,7 @@ public class ClusterFeedBlockTest extends FleetControllerTest { communicator = new DummyCommunicator(nodes, timer); var metricUpdater = new MetricUpdater(new NoMetricReporter(), options.fleetControllerIndex(), options.clusterName()); var eventLog = new EventLog(timer, metricUpdater); - var cluster = new ContentCluster(options.clusterName(), options.nodes(), options.storageDistribution()); + var cluster = new ContentCluster(options); var stateGatherer = new NodeStateGatherer(timer, timer, eventLog); var database = new DatabaseHandler(context, new ZooKeeperDatabaseFactory(context), timer, options.zooKeeperServerAddress(), timer); var stateGenerator = new StateChangeHandler(context, timer, eventLog); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java index 4718453ee51..60d4866a33e 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/NodeStateChangeCheckerTest.java @@ -67,7 +67,8 @@ public class NodeStateChangeCheckerTest { } private NodeStateChangeChecker createChangeChecker(ContentCluster cluster) { - return new NodeStateChangeChecker(requiredRedundancy, noopVisiting, cluster.clusterInfo(), false); + return new NodeStateChangeChecker(requiredRedundancy, noopVisiting, cluster.clusterInfo(), + false, cluster.maxNumberOfGroupsAllowedToBeDown()); } private ContentCluster createCluster(int nodeCount) { @@ -128,7 +129,7 @@ public class NodeStateChangeCheckerTest { void testDeniedInMoratorium() { ContentCluster cluster = createCluster(4); var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, noopVisiting, cluster.clusterInfo(), true); + requiredRedundancy, noopVisiting, cluster.clusterInfo(), true, cluster.maxNumberOfGroupsAllowedToBeDown()); Result result = nodeStateChangeChecker.evaluateTransition( new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); @@ -140,8 +141,7 @@ public class NodeStateChangeCheckerTest { @Test void testUnknownStorageNode() { ContentCluster cluster = createCluster(4); - var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, noopVisiting, cluster.clusterInfo(), false); + var nodeStateChangeChecker = createChangeChecker(cluster); Result result = nodeStateChangeChecker.evaluateTransition( new Node(STORAGE, 10), defaultAllUpClusterState(), SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); @@ -155,8 +155,7 @@ public class NodeStateChangeCheckerTest { // Nodes 0-3, storage node 0 being in maintenance with "Orchestrator" description. ContentCluster cluster = createCluster(4); cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator")); - var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, noopVisiting, cluster.clusterInfo(), false); + var nodeStateChangeChecker = createChangeChecker(cluster); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 storage:4 .0.s:m", currentClusterStateVersion)); @@ -176,8 +175,7 @@ public class NodeStateChangeCheckerTest { ContentCluster cluster = createCluster(4); cluster.clusterInfo().getDistributorNodeInfo(0) .setWantedState(new NodeState(DISTRIBUTOR, DOWN).setDescription("Orchestrator")); - var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, noopVisiting, cluster.clusterInfo(), false); + var nodeStateChangeChecker = createChangeChecker(cluster); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 .0.s:d storage:4", currentClusterStateVersion)); @@ -200,7 +198,7 @@ public class NodeStateChangeCheckerTest { .setWantedState(new NodeState(STORAGE, DOWN).setDescription("Orchestrator")); HierarchicalGroupVisiting visiting = makeHierarchicalGroupVisitingWith2Groups(4); var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, visiting, cluster.clusterInfo(), false); + requiredRedundancy, visiting, cluster.clusterInfo(), false, cluster.maxNumberOfGroupsAllowedToBeDown()); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 .0.s:d storage:4", currentClusterStateVersion)); @@ -234,7 +232,7 @@ public class NodeStateChangeCheckerTest { cluster.clusterInfo().getStorageNodeInfo(0).setWantedState(new NodeState(STORAGE, State.MAINTENANCE).setDescription("Orchestrator")); HierarchicalGroupVisiting visiting = makeHierarchicalGroupVisitingWith2Groups(4); var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, visiting, cluster.clusterInfo(), false); + requiredRedundancy, visiting, cluster.clusterInfo(), false, cluster.maxNumberOfGroupsAllowedToBeDown()); ClusterState clusterStateWith0InMaintenance = clusterState(String.format( "version:%d distributor:4 storage:4 .0.s:m", currentClusterStateVersion)); @@ -323,8 +321,7 @@ public class NodeStateChangeCheckerTest { currentClusterStateVersion)); // We should then be denied setting storage node 1 safely to maintenance. - var nodeStateChangeChecker = new NodeStateChangeChecker( - requiredRedundancy, noopVisiting, cluster.clusterInfo(), false); + var nodeStateChangeChecker = createChangeChecker(cluster); Result result = nodeStateChangeChecker.evaluateTransition( nodeStorage, clusterStateWith3Down, SAFE, UP_NODE_STATE, MAINTENANCE_NODE_STATE); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java index 1bd17b11755..b208ff7fb27 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequestTest.java @@ -24,6 +24,7 @@ import java.util.Optional; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; @@ -129,7 +130,7 @@ public class SetNodeStateRequestTest { when(unitState.getId()).thenReturn(wantedStateString); when(unitState.getReason()).thenReturn(REASON); - when(cluster.calculateEffectOfNewState(any(), any(), any(), any(), any(), anyBoolean())).thenReturn(result); + when(cluster.calculateEffectOfNewState(any(), any(), any(), any(), any(), anyBoolean(), anyInt())).thenReturn(result); when(storageNodeInfo.isStorage()).thenReturn(storageNode.getType() == NodeType.STORAGE); when(storageNodeInfo.getNodeIndex()).thenReturn(storageNode.getIndex()); diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def index b2a3dd29be6..10eb408ed69 100644 --- a/configdefinitions/src/vespa/fleetcontroller.def +++ b/configdefinitions/src/vespa/fleetcontroller.def @@ -207,3 +207,6 @@ cluster_feed_block_limit{} double # This is in absolute numbers, so 0.01 implies that a block limit of 0.8 effectively # becomes 0.79 for an already blocked node. cluster_feed_block_noise_level double default=0.0 +# For apps that have several groups this controls how many are allowed to be down +# simultaneously. +max_number_of_groups_allowed_to_be_down int default=1 |