summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Tor Brede Vekterli <vekterli@verizonmedia.com> 2021-02-10 14:36:23 +0100
committer Tor Brede Vekterli <vekterli@verizonmedia.com> 2021-02-10 14:45:11 +0100
commit dd230a258ba8896460a1b406ec4271622f2098f4 (patch)
tree 47f6419c670a948079e78c99ad5d359654b558c2
parent 5f4dcd3002d005ef91ed8721fcf5ca5afa78317c (diff)
Support configurable feed block hysteresis on the cluster controller
Adds an absolute number delta that is subtracted from the feed block limit when a node has a resource already in feed blocked state. This means that there's a lower watermark threshold that must be crossed before feeding can be unblocked. Avoids flip-flopping between block states.

Default is currently 0.0, i.e. effectively disabled. To be modified later for system tests and trial roll-outs.

A couple of caveats with the current implementation:

* The cluster state is not recomputed automatically when an already blocked resource's usage merely drops below the configured limit into the hysteresis range, so the description will be out of date on the content nodes. However, if any other feed block event happens (or the usage drops below the lower hysteresis threshold, unblocking the resource), the state will be recomputed as expected. This does not affect correctness, since the feed is still to be blocked.
* A node event remove/add pair is emitted for feed block status when the hysteresis threshold is crossed and there's a cluster state recomputation.
-rw-r--r-- clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java | 1
-rw-r--r-- clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java | 4
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java | 8
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java | 6
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java | 2
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java | 64
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateVersionTracker.java | 4
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java | 51
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java | 4
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java | 43
-rw-r--r-- configdefinitions/src/vespa/fleetcontroller.def | 8
11 files changed, 183 insertions, 12 deletions
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
index df778028325..4cb6c5d222a 100644
--- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
+++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
@@ -83,6 +83,7 @@ public class ClusterControllerClusterConfigurer {
options.enableTwoPhaseClusterStateActivation = config.enable_two_phase_cluster_state_transitions();
options.clusterFeedBlockEnabled = config.enable_cluster_feed_block();
options.clusterFeedBlockLimit = Map.copyOf(config.cluster_feed_block_limit());
+ options.clusterFeedBlockNoiseLevel = config.cluster_feed_block_noise_level();
}
private static void configure(FleetControllerOptions options, SlobroksConfig config) {
diff --git a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
index 9d2d7610469..76eff0066b1 100644
--- a/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
+++ b/clustercontroller-apps/src/test/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurerTest.java
@@ -34,7 +34,8 @@ public class ClusterControllerClusterConfigurerTest {
.min_node_ratio_per_group(0.123)
.enable_cluster_feed_block(true)
.cluster_feed_block_limit("foo", 0.5)
- .cluster_feed_block_limit("bar", 0.7);
+ .cluster_feed_block_limit("bar", 0.7)
+ .cluster_feed_block_noise_level(0.05);
SlobroksConfig.Builder slobroksConfig = new SlobroksConfig.Builder();
SlobroksConfig.Slobrok.Builder slobrok = new SlobroksConfig.Slobrok.Builder();
slobrok.connectionspec("foo");
@@ -63,6 +64,7 @@ public class ClusterControllerClusterConfigurerTest {
assertTrue(configurer.getOptions().clusterFeedBlockEnabled);
assertEquals(0.5, configurer.getOptions().clusterFeedBlockLimit.get("foo"), 0.01);
assertEquals(0.7, configurer.getOptions().clusterFeedBlockLimit.get("bar"), 0.01);
+ assertEquals(0.05, configurer.getOptions().clusterFeedBlockNoiseLevel, 0.001);
try{
zookeepersConfig.zookeeperserverlist("");
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java
index f4975ee4ee4..900f85be888 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculator.java
@@ -177,14 +177,14 @@ public class EventDiffCalculator {
Set<NodeResourceExhaustion> fromBlockSet = params.feedBlockFrom != null ? params.feedBlockFrom.getConcreteExhaustions() : Collections.emptySet();
Set<NodeResourceExhaustion> toBlockSet = params.feedBlockTo != null ? params.feedBlockTo.getConcreteExhaustions() : Collections.emptySet();
- for (var ex : setSubtraction(toBlockSet, fromBlockSet)) {
- var info = cluster.getNodeInfo(ex.node);
- events.add(createNodeEvent(info, String.format("Added resource exhaustion: %s", ex.toExhaustionAddedDescription()), params));
- }
for (var ex : setSubtraction(fromBlockSet, toBlockSet)) {
var info = cluster.getNodeInfo(ex.node);
events.add(createNodeEvent(info, String.format("Removed resource exhaustion: %s", ex.toExhaustionRemovedDescription()), params));
}
+ for (var ex : setSubtraction(toBlockSet, fromBlockSet)) {
+ var info = cluster.getNodeInfo(ex.node);
+ events.add(createNodeEvent(info, String.format("Added resource exhaustion: %s", ex.toExhaustionAddedDescription()), params));
+ }
}
private static void emitSingleNodeEvents(PerStateParams params, List<Event> events, ContentCluster cluster, ClusterState fromState, ClusterState toState, Node n) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 60b14e86f50..83efb5d8ded 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -345,7 +345,6 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
if (!options.clusterFeedBlockEnabled) {
return;
}
- // TODO hysteresis to prevent oscillations!
var calc = createResourceExhaustionCalculator();
// Important: nodeInfo contains the _current_ host info _prior_ to newHostInfo being applied.
var previouslyExhausted = calc.enumerateNodeResourceExhaustions(nodeInfo);
@@ -953,7 +952,10 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
}
private ResourceExhaustionCalculator createResourceExhaustionCalculator() {
- return new ResourceExhaustionCalculator(options.clusterFeedBlockEnabled, options.clusterFeedBlockLimit);
+ return new ResourceExhaustionCalculator(
+ options.clusterFeedBlockEnabled, options.clusterFeedBlockLimit,
+ stateVersionTracker.getLatestCandidateStateBundle().getFeedBlockOrNull(),
+ options.clusterFeedBlockNoiseLevel);
}
private static ClusterStateDeriver createIdentityClonedBucketSpaceStateDeriver() {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index 9c5aaecd468..e63531229d6 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -138,6 +138,8 @@ public class FleetControllerOptions implements Cloneable {
// Resource type -> limit in [0, 1]
public Map<String, Double> clusterFeedBlockLimit = Collections.emptyMap();
+ public double clusterFeedBlockNoiseLevel = 0.01;
+
public FleetControllerOptions(String clusterName, Collection<ConfiguredNode> nodes) {
this.clusterName = clusterName;
maxTransitionTime.put(NodeType.DISTRIBUTOR, 0);
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
index 231d9f95bdb..00edd767ad6 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
@@ -7,6 +7,7 @@ import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Map;
+import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@@ -14,15 +15,67 @@ import java.util.stream.Collectors;
* Given a mapping of (opaque) resource names and their exclusive limits,
* this class acts as an utility to easily enumerate all the resources that
* a given node (or set of nodes) have exhausted.
+ *
+ * In order to support hysteresis, optionally takes in the _current_ feed
+ * block state. This lets the calculator make the decision to emit a resource
+ * exhaustion for a node that is technically below the feed block limit, as
+ * long as it's not yet below the hysteresis threshold.
*/
public class ResourceExhaustionCalculator {
private final boolean feedBlockEnabled;
private final Map<String, Double> feedBlockLimits;
+ private final double feedBlockNoiseLevel;
+ private final Set<NodeAndResourceType> previouslyBlockedNodeResources;
+
+ private static class NodeAndResourceType {
+ public final int nodeIndex;
+ public final String resourceType;
+
+ public NodeAndResourceType(int nodeIndex, String resourceType) {
+ this.nodeIndex = nodeIndex;
+ this.resourceType = resourceType;
+ }
+
+ public static NodeAndResourceType of(int nodeIndex, String resourceType) {
+ return new NodeAndResourceType(nodeIndex, resourceType);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ NodeAndResourceType that = (NodeAndResourceType) o;
+ return nodeIndex == that.nodeIndex &&
+ Objects.equals(resourceType, that.resourceType);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(nodeIndex, resourceType);
+ }
+ }
public ResourceExhaustionCalculator(boolean feedBlockEnabled, Map<String, Double> feedBlockLimits) {
this.feedBlockEnabled = feedBlockEnabled;
this.feedBlockLimits = feedBlockLimits;
+ this.feedBlockNoiseLevel = 0.0;
+ this.previouslyBlockedNodeResources = Collections.emptySet();
+ }
+
+ public ResourceExhaustionCalculator(boolean feedBlockEnabled, Map<String, Double> feedBlockLimits,
+ ClusterStateBundle.FeedBlock previousFeedBlock,
+ double feedBlockNoiseLevel) {
+ this.feedBlockEnabled = feedBlockEnabled;
+ this.feedBlockLimits = feedBlockLimits;
+ this.feedBlockNoiseLevel = feedBlockNoiseLevel;
+ if (previousFeedBlock != null) {
+ this.previouslyBlockedNodeResources = previousFeedBlock.getConcreteExhaustions().stream()
+ .map(ex -> NodeAndResourceType.of(ex.node.getIndex(), ex.resourceType))
+ .collect(Collectors.toSet());
+ } else {
+ this.previouslyBlockedNodeResources = Collections.emptySet();
+ }
}
public ClusterStateBundle.FeedBlock inferContentClusterFeedBlockOrNull(Collection<NodeInfo> nodeInfos) {
@@ -50,13 +103,18 @@ public class ResourceExhaustionCalculator {
public Set<NodeResourceExhaustion> resourceExhaustionsFromHostInfo(NodeInfo nodeInfo, HostInfo hostInfo) {
Set<NodeResourceExhaustion> exceedingLimit = null;
for (var usage : hostInfo.getContentNode().getResourceUsage().entrySet()) {
- double limit = feedBlockLimits.getOrDefault(usage.getKey(), 1.0);
- if (usage.getValue().getUsage() > limit) {
+ double configuredLimit = feedBlockLimits.getOrDefault(usage.getKey(), 1.0);
+ // To enable hysteresis on feed un-block we adjust the effective limit iff the particular
+ // <node, resource> tuple was blocked in the previous state.
+ boolean wasBlocked = previouslyBlockedNodeResources.contains(NodeAndResourceType.of(nodeInfo.getNodeIndex(), usage.getKey()));
+ double effectiveLimit = wasBlocked ? Math.max(configuredLimit - feedBlockNoiseLevel, 0.0)
+ : configuredLimit;
+ if (usage.getValue().getUsage() > effectiveLimit) {
if (exceedingLimit == null) {
exceedingLimit = new LinkedHashSet<>();
}
exceedingLimit.add(new NodeResourceExhaustion(nodeInfo.getNode(), usage.getKey(), usage.getValue(),
- limit, nodeInfo.getRpcAddress()));
+ effectiveLimit, nodeInfo.getRpcAddress()));
}
}
return (exceedingLimit != null) ? exceedingLimit : Collections.emptySet();
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateVersionTracker.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateVersionTracker.java
index e2f98cf5492..12338a5bafa 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateVersionTracker.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/StateVersionTracker.java
@@ -124,6 +124,10 @@ public class StateVersionTracker {
return latestCandidateState.getBaselineAnnotatedState();
}
+ public ClusterStateBundle getLatestCandidateStateBundle() {
+ return latestCandidateState;
+ }
+
public List<ClusterStateHistoryEntry> getClusterStateHistory() {
return Collections.unmodifiableList(clusterStateHistory);
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
index 75a197ec77a..da62aac66a2 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
@@ -81,22 +81,27 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
super.tearDown();
}
- private static FleetControllerOptions createOptions(Map<String, Double> feedBlockLimits) {
+ private static FleetControllerOptions createOptions(Map<String, Double> feedBlockLimits,
+ double clusterFeedBlockNoiseLevel) {
FleetControllerOptions options = defaultOptions("mycluster");
options.setStorageDistribution(DistributionBuilder.forFlatCluster(NODE_COUNT));
options.nodes = new HashSet<>(DistributionBuilder.buildConfiguredNodes(NODE_COUNT));
options.clusterFeedBlockEnabled = true;
options.clusterFeedBlockLimit = Map.copyOf(feedBlockLimits);
+ options.clusterFeedBlockNoiseLevel = clusterFeedBlockNoiseLevel;
return options;
}
+ private static FleetControllerOptions createOptions(Map<String, Double> feedBlockLimits) {
+ return createOptions(feedBlockLimits, 0.0);
+ }
+
private void reportResourceUsageFromNode(int nodeIndex, Set<FeedBlockUtil.UsageDetails> resourceUsages) throws Exception {
String hostInfo = createResourceUsageJson(resourceUsages);
communicator.setNodeState(new Node(NodeType.STORAGE, nodeIndex), new NodeState(NodeType.STORAGE, State.UP), hostInfo);
ctrl.tick();
}
- // TODO some form of hysteresis
@Test
public void cluster_feed_can_be_blocked_and_unblocked_by_single_node() throws Exception {
initialize(createOptions(mapOf(usage("cheese", 0.7), usage("wine", 0.4))));
@@ -168,4 +173,46 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription());
}
+ @Test
+ public void cluster_feed_block_state_is_recomputed_when_usage_enters_hysteresis_range() throws Exception {
+ initialize(createOptions(mapOf(usage("cheese", 0.7), usage("wine", 0.4)), 0.1));
+ assertFalse(ctrl.getClusterStateBundle().clusterFeedIsBlocked());
+
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.75), usage("wine", 0.3)));
+ var bundle = ctrl.getClusterStateBundle();
+ assertTrue(bundle.clusterFeedIsBlocked());
+ assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)", bundle.getFeedBlock().get().getDescription());
+
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.68), usage("wine", 0.3)));
+ bundle = ctrl.getClusterStateBundle();
+ assertTrue(bundle.clusterFeedIsBlocked());
+ // FIXME Effective limit is modified by hysteresis but due to how we check state deltas this
+ // is not discovered here. Still correct in terms of what resources are blocked or not, but
+ // the description is not up to date here.
+ assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)",
+ bundle.getFeedBlock().get().getDescription());
+
+ // Trigger an explicit recompute by adding a separate resource exhaustion
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.67), usage("wine", 0.5)));
+ bundle = ctrl.getClusterStateBundle();
+ assertTrue(bundle.clusterFeedIsBlocked());
+ assertEquals("cheese on node 1 [unknown hostname] (0.670 > 0.600), " +
+ "wine on node 1 [unknown hostname] (0.500 > 0.400)", // Not under hysteresis
+ bundle.getFeedBlock().get().getDescription());
+
+ // Wine usage drops beyond hysteresis range, should be unblocked immediately.
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.61), usage("wine", 0.2)));
+ bundle = ctrl.getClusterStateBundle();
+ assertTrue(bundle.clusterFeedIsBlocked());
+ assertEquals("cheese on node 1 [unknown hostname] (0.610 > 0.600)",
+ bundle.getFeedBlock().get().getDescription());
+
+ // Cheese now drops below hysteresis range, should be unblocked as well.
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.59), usage("wine", 0.2)));
+ bundle = ctrl.getClusterStateBundle();
+ assertFalse(bundle.clusterFeedIsBlocked());
+ }
+
+ // FIXME implicit changes in limits due to hysteresis adds spurious exhaustion remove+add node event pair
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
index 2254435e629..65199aa9957 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/FeedBlockUtil.java
@@ -89,6 +89,10 @@ public class FeedBlockUtil {
return new NodeResourceExhaustion(new Node(NodeType.STORAGE, index), type, new ResourceUsage(0.8, null), 0.7, "foo");
}
+ static NodeResourceExhaustion exhaustion(int index, String type, double usage) {
+ return new NodeResourceExhaustion(new Node(NodeType.STORAGE, index), type, new ResourceUsage(usage, null), 0.7, "foo");
+ }
+
static Set<NodeResourceExhaustion> setOf(NodeResourceExhaustion... exhaustions) {
return Arrays.stream(exhaustions).collect(Collectors.toCollection(LinkedHashSet::new));
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
index f5f7b4676d8..55cf173aa25 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
@@ -5,8 +5,10 @@ import org.junit.Test;
import static com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.createFixtureWithReportedUsages;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.exhaustion;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.forNode;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.mapOf;
+import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.setOf;
import static com.yahoo.vespa.clustercontroller.core.FeedBlockUtil.usage;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
@@ -98,4 +100,45 @@ public class ResourceExhaustionCalculatorTest {
assertNull(feedBlock);
}
+ @Test
+ public void retain_node_feed_block_status_when_within_hysteresis_window_limit_crossed_edge_case() {
+ var curFeedBlock = ClusterStateBundle.FeedBlock.blockedWith("foo", setOf(exhaustion(1, "memory", 0.51)));
+ var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.5)), curFeedBlock, 0.1);
+ // Node 1 goes from 0.51 to 0.49, crossing the 0.5 threshold. Should still be blocked.
+ // Node 2 is at 0.49 but was not previously blocked and should not be blocked now either.
+ var cf = createFixtureWithReportedUsages(forNode(1, usage("disk", 0.3), usage("memory", 0.49)),
+ forNode(2, usage("disk", 0.3), usage("memory", 0.49)));
+ var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfo());
+ assertNotNull(feedBlock);
+ // TODO should we not change the limits themselves? Explicit mention of hysteresis state?
+ assertEquals("memory on node 1 [storage.1.local] (0.490 > 0.400)",
+ feedBlock.getDescription());
+ }
+
+ @Test
+ public void retain_node_feed_block_status_when_within_hysteresis_window_under_limit_edge_case() {
+ var curFeedBlock = ClusterStateBundle.FeedBlock.blockedWith("foo", setOf(exhaustion(1, "memory", 0.49)));
+ var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.5)), curFeedBlock, 0.1);
+ // Node 1 goes from 0.49 to 0.48, NOT crossing the 0.5 threshold. Should still be blocked.
+ // Node 2 is at 0.49 but was not previously blocked and should not be blocked now either.
+ var cf = createFixtureWithReportedUsages(forNode(1, usage("disk", 0.3), usage("memory", 0.48)),
+ forNode(2, usage("disk", 0.3), usage("memory", 0.49)));
+ var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfo());
+ assertNotNull(feedBlock);
+ assertEquals("memory on node 1 [storage.1.local] (0.480 > 0.400)",
+ feedBlock.getDescription());
+ }
+
+ @Test
+ public void retained_node_feed_block_cleared_once_hysteresis_threshold_is_passed() {
+ var curFeedBlock = ClusterStateBundle.FeedBlock.blockedWith("foo", setOf(exhaustion(1, "memory", 0.48)));
+ var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.5)), curFeedBlock, 0.1);
+ // Node 1 goes from 0.48 to 0.39. Should be unblocked
+ // Node 2 is at 0.49 but was not previously blocked and should not be blocked now either.
+ var cf = createFixtureWithReportedUsages(forNode(1, usage("disk", 0.3), usage("memory", 0.39)),
+ forNode(2, usage("disk", 0.3), usage("memory", 0.49)));
+ var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfo());
+ assertNull(feedBlock);
+ }
+
}
diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def
index 3c88639d09d..d2d746363f0 100644
--- a/configdefinitions/src/vespa/fleetcontroller.def
+++ b/configdefinitions/src/vespa/fleetcontroller.def
@@ -197,3 +197,11 @@ enable_cluster_feed_block bool default=false
# The keys used must match the similar keys in the host info JSON structure.
# All limits are numbers between 0.0 and 1.0.
cluster_feed_block_limit{} double
+
+# To avoid having the cluster feed block state flip-flop from nodes that are hovering
+# just around the feed block limits, this noise threshold implicitly makes the
+# feed block limit value _lower_ for a resource that is already exhausted. I.e. the
+# node must reach a lower resource usage than the limit for feed to be unblocked.
+# This is in absolute numbers, so 0.01 implies that a block limit of 0.8 effectively
+# becomes 0.79 for an already blocked node.
+cluster_feed_block_noise_level double default=0.0