diff options
author | Martin Polden <mpolden@mpolden.no> | 2018-12-06 11:42:56 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2018-12-06 11:42:56 +0100 |
commit | 08c2324f8da503c24db3ea44e7a78b59e4417582 (patch) | |
tree | d3c38ca5f0bb9e4ca1076b7db63bde3c3b3b09e4 /node-repository | |
parent | 5101119fcf57ff624169fd728ab92a3fef5530b3 (diff) | |
Emit metric for throttled node failures
Diffstat (limited to 'node-repository')
2 files changed, 32 insertions, 11 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 497a511dc71..3d92829d662 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -47,6 +47,12 @@ public class NodeFailer extends Maintainer { private static final Logger log = Logger.getLogger(NodeFailer.class.getName()); private static final Duration nodeRequestInterval = Duration.ofMinutes(10); + /** Metric for number of nodes that we want to fail, but cannot due to throttling */ + public static final String throttledNodeFailuresMetric = "throttledNodeFailures"; + + /** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */ + public static final String throttlingActiveMetric = "nodeFailThrottling"; + /** Provides information about the status of ready hosts */ private final HostLivenessTracker hostLivenessTracker; @@ -88,11 +94,17 @@ public class NodeFailer extends Maintainer { try (Mutex lock = nodeRepository().lockUnallocated()) { updateNodeLivenessEventsForReadyNodes(); - getReadyNodesByFailureReason().forEach((node, reason) -> { - if (!throttle(node)) { - nodeRepository().fail(node.hostname(), Agent.system, reason); + int throttledNodeFailures = 0; + for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) { + Node node = entry.getKey(); + if (throttle(node)) { + throttledNodeFailures++; + continue; } - }); + String reason = entry.getValue(); + nodeRepository().fail(node.hostname(), Agent.system, reason); + } + metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); } // Active nodes @@ -317,7 +329,7 @@ public class NodeFailer extends Maintainer { log.info(String.format("Want to fail node %s, but throttling is in effect: %s", 
node.hostname(), throttlePolicy.toHumanReadableString())); } - metric.set("nodeFailThrottling", throttle ? 1 : 0, null); + metric.set(throttlingActiveMetric, throttle ? 1 : 0, null); return throttle; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 08cf8e7dc20..6147543c102 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -487,7 +487,8 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -496,7 +497,8 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 2 docker hosts now fail, 1 of them (with all its children is allowed to fail) hosts.subList(0, 2).forEach(host -> { @@ -509,7 +511,8 @@ public class NodeFailerTest { tester.failer.run(); 
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 24 more hours pass without any other nodes being failed out for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { @@ -518,21 +521,24 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // Next, the 2 ready nodes that were dead from the start are failed out, and finally // the second host and all its children are failed tester.clock.advance(Duration.ofMinutes(30)); tester.failer.run(); assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // Nothing else to fail tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.failer.run(); assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is not indicated by the metric.", 0, 
tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } // Throttles based on percentage in large zone @@ -550,6 +556,7 @@ public class NodeFailerTest { // 1% are allowed to fail assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -559,6 +566,7 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { @@ -568,6 +576,7 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } } |