diff options
| author | Martin Polden <mpolden@mpolden.no> | 2018-12-06 13:01:41 +0100 |
|---|---|---|
| committer | Martin Polden <mpolden@mpolden.no> | 2018-12-06 13:01:41 +0100 |
| commit | 721634065f324bb25d5a9f07fe4e9706479ea68e (patch) | |
| tree | b1b71c08d8c1c5b4ada7a8f177cd270f8d1c2cba /node-repository | |
| parent | 08c2324f8da503c24db3ea44e7a78b59e4417582 (diff) | |
Include throttled active nodes
Diffstat (limited to 'node-repository')
2 files changed, 17 insertions, 8 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 3d92829d662..83c2f20f8aa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -90,11 +90,12 @@ public class NodeFailer extends Maintainer { @Override protected void maintain() { + int throttledNodeFailures = 0; + // Ready nodes try (Mutex lock = nodeRepository().lockUnallocated()) { updateNodeLivenessEventsForReadyNodes(); - int throttledNodeFailures = 0; for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) { Node node = entry.getKey(); if (throttle(node)) { @@ -104,16 +105,24 @@ public class NodeFailer extends Maintainer { String reason = entry.getValue(); nodeRepository().fail(node.hostname(), Agent.system, reason); } - metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); } // Active nodes updateNodeDownState(); - getActiveNodesByFailureReason().forEach((node, reason) -> { - if (failAllowedFor(node.type()) && !throttle(node)) { - failActive(node, reason); + for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) { + Node node = entry.getKey(); + if (!failAllowedFor(node.type())) { + continue; + } + if (throttle(node)) { + throttledNodeFailures++; + continue; } - }); + String reason = entry.getValue(); + failActive(node, reason); + } + + metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); } private void updateNodeLivenessEventsForReadyNodes() { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 6147543c102..1bf291cd2c3 100644 --- 
a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -512,7 +512,7 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); - assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); + assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 24 more hours pass without any other nodes being failed out for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { @@ -522,7 +522,7 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); - assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); + assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // Next, the 2 ready nodes that were dead from the start are failed out, and finally // the second host and all its children are failed |