summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2018-12-06 13:01:41 +0100
committer Martin Polden <mpolden@mpolden.no> 2018-12-06 13:01:41 +0100
commit 721634065f324bb25d5a9f07fe4e9706479ea68e (patch)
tree b1b71c08d8c1c5b4ada7a8f177cd270f8d1c2cba /node-repository
parent 08c2324f8da503c24db3ea44e7a78b59e4417582 (diff)
Include throttled active nodes
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 21
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 4
2 files changed, 17 insertions, 8 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 3d92829d662..83c2f20f8aa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -90,11 +90,12 @@ public class NodeFailer extends Maintainer {
@Override
protected void maintain() {
+ int throttledNodeFailures = 0;
+
// Ready nodes
try (Mutex lock = nodeRepository().lockUnallocated()) {
updateNodeLivenessEventsForReadyNodes();
- int throttledNodeFailures = 0;
for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
Node node = entry.getKey();
if (throttle(node)) {
@@ -104,16 +105,24 @@ public class NodeFailer extends Maintainer {
String reason = entry.getValue();
nodeRepository().fail(node.hostname(), Agent.system, reason);
}
- metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
// Active nodes
updateNodeDownState();
- getActiveNodesByFailureReason().forEach((node, reason) -> {
- if (failAllowedFor(node.type()) && !throttle(node)) {
- failActive(node, reason);
+ for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
+ Node node = entry.getKey();
+ if (!failAllowedFor(node.type())) {
+ continue;
+ }
+ if (throttle(node)) {
+ throttledNodeFailures++;
+ continue;
}
- });
+ String reason = entry.getValue();
+ failActive(node, reason);
+ }
+
+ metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
private void updateNodeLivenessEventsForReadyNodes() {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 6147543c102..1bf291cd2c3 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -512,7 +512,7 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
- assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 24 more hours pass without any other nodes being failed out
for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
@@ -522,7 +522,7 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
- assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Next, the 2 ready nodes that were dead from the start are failed out, and finally
// the second host and all its children are failed