summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-07-26 11:42:13 +0200
committerTor Brede Vekterli <vekterli@yahooinc.com>2023-07-26 11:42:13 +0200
commit143fbc7218e9487e029c97aa1c813acb8ffd0317 (patch)
tree606e6a6130e73c4a601104e4d7682f3713560581 /clustercontroller-core
parent5bed3bea31f386ed0233b34215526463dcfb27eb (diff)
Make generated automatic feed block error messages more user-friendly
Messages are generated centrally by the cluster controller and pushed to content nodes as part of a cluster state bundle; the distributor nodes merely repeat back what they have been told. This changes the cluster controller feed block error message code to be less ambiguous and to include a URL to our public documentation about feed blocks. Example of _old_ message: ``` disk on node 1 [storage.1.local] (0.510 > 0.500) ``` Same feed block with _new_ message: ``` disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 50.0%). See https://docs.vespa.ai/en/operations/feed-block.html ```
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java9
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java6
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java38
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java6
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java39
5 files changed, 60 insertions, 38 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java
index ee121d7682c..6899eaa6598 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java
@@ -47,18 +47,19 @@ public class NodeResourceExhaustion {
}
public String toExhaustionAddedDescription() {
- return String.format(Locale.US, "%s (%.3g > %.3g)", makeDescriptionPrefix(), resourceUsage.getUsage(), limit);
+ return String.format(Locale.US, "%s is %.1f%% full (the configured limit is %.1f%%)",
+ makeDescriptionPrefix(), resourceUsage.getUsage() * 100.0, limit * 100.0);
}
public String toExhaustionRemovedDescription() {
- return String.format(Locale.US, "%s (<= %.3g)", makeDescriptionPrefix(), limit);
+ return String.format(Locale.US, "%s (<= %.1f%%)", makeDescriptionPrefix(), limit * 100.0);
}
public String toShorthandDescription() {
- return String.format(Locale.US, "%s%s %.3g > %.3g",
+ return String.format(Locale.US, "%s%s %.1f%% > %.1f%%",
resourceType,
(resourceUsage.getName() != null ? ":" + resourceUsage.getName() : ""),
- resourceUsage.getUsage(), limit);
+ resourceUsage.getUsage() * 100.0, limit * 100.0);
}
private String makeDescriptionPrefix() {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
index db84882cfa7..732048716bb 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
@@ -78,6 +78,11 @@ public class ResourceExhaustionCalculator {
}
}
+ public static String decoratedMessage(String msg) {
+ // Add a user-friendly documentation link to the error message
+ return "%s. See https://docs.vespa.ai/en/operations/feed-block.html".formatted(msg);
+ }
+
public ClusterStateBundle.FeedBlock inferContentClusterFeedBlockOrNull(Collection<NodeInfo> nodeInfos) {
if (!feedBlockEnabled) {
return null;
@@ -94,6 +99,7 @@ public class ResourceExhaustionCalculator {
if (exhaustions.size() > maxDescriptions) {
description += String.format(" (... and %d more)", exhaustions.size() - maxDescriptions);
}
+ description = decoratedMessage(description);
// FIXME we currently will trigger a cluster state recomputation even if the number of
// exhaustions is greater than what is returned as part of the description. Though at
// that point, cluster state recomputations will be the least of your worries...!
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
index 55e256cf89c..4bb36546d01 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java
@@ -117,6 +117,10 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
assertFalse(ctrl.getClusterStateBundle().clusterFeedIsBlocked());
}
+ private static String decorate(String msg) {
+ return ResourceExhaustionCalculator.decoratedMessage(msg);
+ }
+
@Test
void cluster_feed_block_state_is_recomputed_when_resource_block_set_differs() throws Exception {
initialize(createOptions(mapOf(usage("cheese", 0.7), usage("wine", 0.4))));
@@ -125,14 +129,15 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.3)));
var bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"),
+ bundle.getFeedBlock().get().getDescription());
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.5)));
bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700), " +
- "wine on node 1 [unknown hostname] (0.500 > 0.400)",
- bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%), " +
+ "wine on node 1 [unknown hostname] is 50.0% full (the configured limit is 40.0%)"),
+ bundle.getFeedBlock().get().getDescription());
}
@Test
@@ -143,13 +148,15 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.3)));
var bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"),
+ bundle.getFeedBlock().get().getDescription());
// 80% -> 90%, should not trigger new state.
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.9), usage("wine", 0.3)));
bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"),
+ bundle.getFeedBlock().get().getDescription());
}
@Test
@@ -160,7 +167,8 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.75), usage("wine", 0.3)));
var bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)", bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 75.0% full (the configured limit is 70.0%)"),
+ bundle.getFeedBlock().get().getDescription());
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.68), usage("wine", 0.3)));
bundle = ctrl.getClusterStateBundle();
@@ -168,23 +176,23 @@ public class ClusterFeedBlockTest extends FleetControllerTest {
// FIXME Effective limit is modified by hysteresis but due to how we check state deltas this
// is not discovered here. Still correct in terms of what resources are blocked or not, but
// the description is not up to date here.
- assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)",
- bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 75.0% full (the configured limit is 70.0%)"),
+ bundle.getFeedBlock().get().getDescription());
// Trigger an explicit recompute by adding a separate resource exhaustion
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.67), usage("wine", 0.5)));
bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.670 > 0.600), " +
- "wine on node 1 [unknown hostname] (0.500 > 0.400)", // Not under hysteresis
- bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 67.0% full (the configured limit is 60.0%), " +
+ "wine on node 1 [unknown hostname] is 50.0% full (the configured limit is 40.0%)"), // Not under hysteresis
+ bundle.getFeedBlock().get().getDescription());
// Wine usage drops beyond hysteresis range, should be unblocked immediately.
- reportResourceUsageFromNode(1, setOf(usage("cheese", 0.61), usage("wine", 0.2)));
+ reportResourceUsageFromNode(1, setOf(usage("cheese", 0.611), usage("wine", 0.2)));
bundle = ctrl.getClusterStateBundle();
assertTrue(bundle.clusterFeedIsBlocked());
- assertEquals("cheese on node 1 [unknown hostname] (0.610 > 0.600)",
- bundle.getFeedBlock().get().getDescription());
+ assertEquals(decorate("cheese on node 1 [unknown hostname] is 61.1% full (the configured limit is 60.0%)"),
+ bundle.getFeedBlock().get().getDescription());
// Cheese now drops below hysteresis range, should be unblocked as well.
reportResourceUsageFromNode(1, setOf(usage("cheese", 0.59), usage("wine", 0.2)));
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java
index 8b07c216774..8ad7a63d601 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java
@@ -508,7 +508,7 @@ public class EventDiffCalculatorTest {
assertThat(events.size(), equalTo(2));
assertThat(events, hasItem(allOf(
eventForNode(storageNode(1)),
- nodeEventWithDescription("Added resource exhaustion: oil on node 1 [unknown hostname] (0.800 > 0.700)"),
+ nodeEventWithDescription("Added resource exhaustion: oil on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"),
nodeEventForBaseline())));
assertThat(events, hasItem(
clusterEventWithDescription("Cluster feed blocked due to resource exhaustion: we're closed")));
@@ -528,7 +528,7 @@ public class EventDiffCalculatorTest {
assertThat(events.size(), equalTo(1));
assertThat(events, hasItem(allOf(
eventForNode(storageNode(1)),
- nodeEventWithDescription("Added resource exhaustion: cpu_brake_fluid on node 1 [unknown hostname] (0.800 > 0.700)"),
+ nodeEventWithDescription("Added resource exhaustion: cpu_brake_fluid on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"),
nodeEventForBaseline())));
}
@@ -546,7 +546,7 @@ public class EventDiffCalculatorTest {
assertThat(events.size(), equalTo(1));
assertThat(events, hasItem(allOf(
eventForNode(storageNode(2)),
- nodeEventWithDescription("Removed resource exhaustion: cpu_brake_fluid on node 2 [unknown hostname] (<= 0.700)"),
+ nodeEventWithDescription("Removed resource exhaustion: cpu_brake_fluid on node 2 [unknown hostname] (<= 70.0%)"),
nodeEventForBaseline())));
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
index 24945d0d261..fb2052476d0 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java
@@ -15,6 +15,10 @@ import static org.junit.jupiter.api.Assertions.*;
public class ResourceExhaustionCalculatorTest {
+ private static String decorate(String msg) {
+ return ResourceExhaustionCalculator.decoratedMessage(msg);
+ }
+
@Test
void no_feed_block_returned_when_no_resources_lower_than_limit() {
var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.8)));
@@ -32,7 +36,8 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
assertTrue(feedBlock.blockFeedInCluster());
- assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.500)", feedBlock.getDescription());
+ assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 50.0%)"),
+ feedBlock.getDescription());
}
@Test
@@ -43,7 +48,8 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
assertTrue(feedBlock.blockFeedInCluster());
- assertEquals("disk:a-fancy-disk on node 1 [storage.1.local] (0.510 > 0.500)", feedBlock.getDescription());
+ assertEquals(decorate("disk:a-fancy-disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 50.0%)"),
+ feedBlock.getDescription());
}
@Test
@@ -56,8 +62,9 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
assertTrue(feedBlock.blockFeedInCluster());
- assertEquals("disk on node 1 [unknown hostname] (0.510 > 0.500), " +
- "memory on node 2 [unknown hostname] (0.850 > 0.800)", feedBlock.getDescription());
+ assertEquals(decorate("disk on node 1 [unknown hostname] is 51.0% full (the configured limit is 50.0%), " +
+ "memory on node 2 [unknown hostname] is 85.0% full (the configured limit is 80.0%)"),
+ feedBlock.getDescription());
}
@Test
@@ -68,10 +75,10 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
assertTrue(feedBlock.blockFeedInCluster());
- assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.400), " +
- "memory on node 1 [storage.1.local] (0.850 > 0.800), " +
- "disk on node 2 [storage.2.local] (0.450 > 0.400)",
- feedBlock.getDescription());
+ assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 40.0%), " +
+ "memory on node 1 [storage.1.local] is 85.0% full (the configured limit is 80.0%), " +
+ "disk on node 2 [storage.2.local] is 45.0% full (the configured limit is 40.0%)"),
+ feedBlock.getDescription());
}
@Test
@@ -83,10 +90,10 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
assertTrue(feedBlock.blockFeedInCluster());
- assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.400), " +
- "memory on node 1 [storage.1.local] (0.850 > 0.800), " +
- "disk on node 2 [storage.2.local] (0.450 > 0.400) (... and 2 more)",
- feedBlock.getDescription());
+ assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 40.0%), " +
+ "memory on node 1 [storage.1.local] is 85.0% full (the configured limit is 80.0%), " +
+ "disk on node 2 [storage.2.local] is 45.0% full (the configured limit is 40.0%) (... and 2 more)"),
+ feedBlock.getDescription());
}
@Test
@@ -109,8 +116,8 @@ public class ResourceExhaustionCalculatorTest {
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
// TODO should we not change the limits themselves? Explicit mention of hysteresis state?
- assertEquals("memory on node 1 [storage.1.local] (0.490 > 0.400)",
- feedBlock.getDescription());
+ assertEquals(decorate("memory on node 1 [storage.1.local] is 49.0% full (the configured limit is 40.0%)"),
+ feedBlock.getDescription());
}
@Test
@@ -123,8 +130,8 @@ public class ResourceExhaustionCalculatorTest {
forNode(2, usage("disk", 0.3), usage("memory", 0.49)));
var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos());
assertNotNull(feedBlock);
- assertEquals("memory on node 1 [storage.1.local] (0.480 > 0.400)",
- feedBlock.getDescription());
+ assertEquals(decorate("memory on node 1 [storage.1.local] is 48.0% full (the configured limit is 40.0%)"),
+ feedBlock.getDescription());
}
@Test