diff options
5 files changed, 60 insertions, 38 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java index ee121d7682c..6899eaa6598 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/NodeResourceExhaustion.java @@ -47,18 +47,19 @@ public class NodeResourceExhaustion { } public String toExhaustionAddedDescription() { - return String.format(Locale.US, "%s (%.3g > %.3g)", makeDescriptionPrefix(), resourceUsage.getUsage(), limit); + return String.format(Locale.US, "%s is %.1f%% full (the configured limit is %.1f%%)", + makeDescriptionPrefix(), resourceUsage.getUsage() * 100.0, limit * 100.0); } public String toExhaustionRemovedDescription() { - return String.format(Locale.US, "%s (<= %.3g)", makeDescriptionPrefix(), limit); + return String.format(Locale.US, "%s (<= %.1f%%)", makeDescriptionPrefix(), limit * 100.0); } public String toShorthandDescription() { - return String.format(Locale.US, "%s%s %.3g > %.3g", + return String.format(Locale.US, "%s%s %.1f%% > %.1f%%", resourceType, (resourceUsage.getName() != null ? ":" + resourceUsage.getName() : ""), - resourceUsage.getUsage(), limit); + resourceUsage.getUsage() * 100.0, limit * 100.0); } private String makeDescriptionPrefix() { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java index db84882cfa7..732048716bb 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java @@ -78,6 +78,11 @@ public class ResourceExhaustionCalculator { } } + public static String decoratedMessage(String msg) { + // Add a user-friendly documentation link to the error message + return "%s. See https://docs.vespa.ai/en/operations/feed-block.html".formatted(msg); + } + public ClusterStateBundle.FeedBlock inferContentClusterFeedBlockOrNull(Collection<NodeInfo> nodeInfos) { if (!feedBlockEnabled) { return null; @@ -94,6 +99,7 @@ public class ResourceExhaustionCalculator { if (exhaustions.size() > maxDescriptions) { description += String.format(" (... and %d more)", exhaustions.size() - maxDescriptions); } + description = decoratedMessage(description); // FIXME we currently will trigger a cluster state recomputation even if the number of // exhaustions is greater than what is returned as part of the description. Though at // that point, cluster state recomputations will be the least of your worries...! diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java index 55e256cf89c..4bb36546d01 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ClusterFeedBlockTest.java @@ -117,6 +117,10 @@ public class ClusterFeedBlockTest extends FleetControllerTest { assertFalse(ctrl.getClusterStateBundle().clusterFeedIsBlocked()); } + private static String decorate(String msg) { + return ResourceExhaustionCalculator.decoratedMessage(msg); + } + @Test void cluster_feed_block_state_is_recomputed_when_resource_block_set_differs() throws Exception { initialize(createOptions(mapOf(usage("cheese", 0.7), usage("wine", 0.4)))); @@ -125,14 +129,15 @@ public class ClusterFeedBlockTest extends FleetControllerTest { reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.3))); var bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"), + bundle.getFeedBlock().get().getDescription()); reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.5))); bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700), " + - "wine on node 1 [unknown hostname] (0.500 > 0.400)", - bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%), " + + "wine on node 1 [unknown hostname] is 50.0% full (the configured limit is 40.0%)"), + bundle.getFeedBlock().get().getDescription()); } @Test @@ -143,13 +148,15 @@ public class ClusterFeedBlockTest extends FleetControllerTest { reportResourceUsageFromNode(1, setOf(usage("cheese", 0.8), usage("wine", 0.3))); var bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"), + bundle.getFeedBlock().get().getDescription()); // 80% -> 90%, should not trigger new state. reportResourceUsageFromNode(1, setOf(usage("cheese", 0.9), usage("wine", 0.3))); bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.800 > 0.700)", bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"), + bundle.getFeedBlock().get().getDescription()); } @Test @@ -160,7 +167,8 @@ public class ClusterFeedBlockTest extends FleetControllerTest { reportResourceUsageFromNode(1, setOf(usage("cheese", 0.75), usage("wine", 0.3))); var bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)", bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 75.0% full (the configured limit is 70.0%)"), + bundle.getFeedBlock().get().getDescription()); reportResourceUsageFromNode(1, setOf(usage("cheese", 0.68), usage("wine", 0.3))); bundle = ctrl.getClusterStateBundle(); @@ -168,23 +176,23 @@ public class ClusterFeedBlockTest extends FleetControllerTest { // FIXME Effective limit is modified by hysteresis but due to how we check state deltas this // is not discovered here. Still correct in terms of what resources are blocked or not, but // the description is not up to date here. - assertEquals("cheese on node 1 [unknown hostname] (0.750 > 0.700)", - bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 75.0% full (the configured limit is 70.0%)"), + bundle.getFeedBlock().get().getDescription()); // Trigger an explicit recompute by adding a separate resource exhaustion reportResourceUsageFromNode(1, setOf(usage("cheese", 0.67), usage("wine", 0.5))); bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.670 > 0.600), " + - "wine on node 1 [unknown hostname] (0.500 > 0.400)", // Not under hysteresis - bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 67.0% full (the configured limit is 60.0%), " + + "wine on node 1 [unknown hostname] is 50.0% full (the configured limit is 40.0%)"), // Not under hysteresis + bundle.getFeedBlock().get().getDescription()); // Wine usage drops beyond hysteresis range, should be unblocked immediately. - reportResourceUsageFromNode(1, setOf(usage("cheese", 0.61), usage("wine", 0.2))); + reportResourceUsageFromNode(1, setOf(usage("cheese", 0.611), usage("wine", 0.2))); bundle = ctrl.getClusterStateBundle(); assertTrue(bundle.clusterFeedIsBlocked()); - assertEquals("cheese on node 1 [unknown hostname] (0.610 > 0.600)", - bundle.getFeedBlock().get().getDescription()); + assertEquals(decorate("cheese on node 1 [unknown hostname] is 61.1% full (the configured limit is 60.0%)"), + bundle.getFeedBlock().get().getDescription()); // Cheese now drops below hysteresis range, should be unblocked as well. reportResourceUsageFromNode(1, setOf(usage("cheese", 0.59), usage("wine", 0.2))); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java index 8b07c216774..8ad7a63d601 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/EventDiffCalculatorTest.java @@ -508,7 +508,7 @@ public class EventDiffCalculatorTest { assertThat(events.size(), equalTo(2)); assertThat(events, hasItem(allOf( eventForNode(storageNode(1)), - nodeEventWithDescription("Added resource exhaustion: oil on node 1 [unknown hostname] (0.800 > 0.700)"), + nodeEventWithDescription("Added resource exhaustion: oil on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"), nodeEventForBaseline()))); assertThat(events, hasItem( clusterEventWithDescription("Cluster feed blocked due to resource exhaustion: we're closed"))); @@ -528,7 +528,7 @@ public class EventDiffCalculatorTest { assertThat(events.size(), equalTo(1)); assertThat(events, hasItem(allOf( eventForNode(storageNode(1)), - nodeEventWithDescription("Added resource exhaustion: cpu_brake_fluid on node 1 [unknown hostname] (0.800 > 0.700)"), + nodeEventWithDescription("Added resource exhaustion: cpu_brake_fluid on node 1 [unknown hostname] is 80.0% full (the configured limit is 70.0%)"), nodeEventForBaseline()))); } @@ -546,7 +546,7 @@ public class EventDiffCalculatorTest { assertThat(events.size(), equalTo(1)); assertThat(events, hasItem(allOf( eventForNode(storageNode(2)), - nodeEventWithDescription("Removed resource exhaustion: cpu_brake_fluid on node 2 [unknown hostname] (<= 0.700)"), + nodeEventWithDescription("Removed resource exhaustion: cpu_brake_fluid on node 2 [unknown hostname] (<= 70.0%)"), nodeEventForBaseline()))); } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java index 24945d0d261..fb2052476d0 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculatorTest.java @@ -15,6 +15,10 @@ import static org.junit.jupiter.api.Assertions.*; public class ResourceExhaustionCalculatorTest { + private static String decorate(String msg) { + return ResourceExhaustionCalculator.decoratedMessage(msg); + } + @Test void no_feed_block_returned_when_no_resources_lower_than_limit() { var calc = new ResourceExhaustionCalculator(true, mapOf(usage("disk", 0.5), usage("memory", 0.8))); @@ -32,7 +36,8 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); assertTrue(feedBlock.blockFeedInCluster()); - assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.500)", feedBlock.getDescription()); + assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 50.0%)"), + feedBlock.getDescription()); } @Test @@ -43,7 +48,8 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); assertTrue(feedBlock.blockFeedInCluster()); - assertEquals("disk:a-fancy-disk on node 1 [storage.1.local] (0.510 > 0.500)", feedBlock.getDescription()); + assertEquals(decorate("disk:a-fancy-disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 50.0%)"), + feedBlock.getDescription()); } @Test @@ -56,8 +62,9 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); assertTrue(feedBlock.blockFeedInCluster()); - assertEquals("disk on node 1 [unknown hostname] (0.510 > 0.500), " + - "memory on node 2 [unknown hostname] (0.850 > 0.800)", feedBlock.getDescription()); + assertEquals(decorate("disk on node 1 [unknown hostname] is 51.0% full (the configured limit is 50.0%), " + + "memory on node 2 [unknown hostname] is 85.0% full (the configured limit is 80.0%)"), + feedBlock.getDescription()); } @Test @@ -68,10 +75,10 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); assertTrue(feedBlock.blockFeedInCluster()); - assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.400), " + - "memory on node 1 [storage.1.local] (0.850 > 0.800), " + - "disk on node 2 [storage.2.local] (0.450 > 0.400)", - feedBlock.getDescription()); + assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 40.0%), " + + "memory on node 1 [storage.1.local] is 85.0% full (the configured limit is 80.0%), " + + "disk on node 2 [storage.2.local] is 45.0% full (the configured limit is 40.0%)"), + feedBlock.getDescription()); } @Test @@ -83,10 +90,10 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); assertTrue(feedBlock.blockFeedInCluster()); - assertEquals("disk on node 1 [storage.1.local] (0.510 > 0.400), " + - "memory on node 1 [storage.1.local] (0.850 > 0.800), " + - "disk on node 2 [storage.2.local] (0.450 > 0.400) (... and 2 more)", - feedBlock.getDescription()); + assertEquals(decorate("disk on node 1 [storage.1.local] is 51.0% full (the configured limit is 40.0%), " + + "memory on node 1 [storage.1.local] is 85.0% full (the configured limit is 80.0%), " + + "disk on node 2 [storage.2.local] is 45.0% full (the configured limit is 40.0%) (... and 2 more)"), + feedBlock.getDescription()); } @Test @@ -109,8 +116,8 @@ public class ResourceExhaustionCalculatorTest { var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); // TODO should we not change the limits themselves? Explicit mention of hysteresis state? - assertEquals("memory on node 1 [storage.1.local] (0.490 > 0.400)", - feedBlock.getDescription()); + assertEquals(decorate("memory on node 1 [storage.1.local] is 49.0% full (the configured limit is 40.0%)"), + feedBlock.getDescription()); } @Test @@ -123,8 +130,8 @@ public class ResourceExhaustionCalculatorTest { forNode(2, usage("disk", 0.3), usage("memory", 0.49))); var feedBlock = calc.inferContentClusterFeedBlockOrNull(cf.cluster().getNodeInfos()); assertNotNull(feedBlock); - assertEquals("memory on node 1 [storage.1.local] (0.480 > 0.400)", - feedBlock.getDescription()); + assertEquals(decorate("memory on node 1 [storage.1.local] is 48.0% full (the configured limit is 40.0%)"), + feedBlock.getDescription()); } @Test |