summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no> 2018-12-06 11:42:56 +0100
committer: Martin Polden <mpolden@mpolden.no> 2018-12-06 11:42:56 +0100
commit: 08c2324f8da503c24db3ea44e7a78b59e4417582 (patch)
tree: d3c38ca5f0bb9e4ca1076b7db63bde3c3b3b09e4 /node-repository
parent: 5101119fcf57ff624169fd728ab92a3fef5530b3 (diff)
Emit metric for throttled node failures
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 22
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 21
2 files changed, 32 insertions, 11 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 497a511dc71..3d92829d662 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -47,6 +47,12 @@ public class NodeFailer extends Maintainer {
private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
private static final Duration nodeRequestInterval = Duration.ofMinutes(10);
+ /** Metric for number of nodes that we want to fail, but cannot due to throttling */
+ public static final String throttledNodeFailuresMetric = "throttledNodeFailures";
+
+ /** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */
+ public static final String throttlingActiveMetric = "nodeFailThrottling";
+
/** Provides information about the status of ready hosts */
private final HostLivenessTracker hostLivenessTracker;
@@ -88,11 +94,17 @@ public class NodeFailer extends Maintainer {
try (Mutex lock = nodeRepository().lockUnallocated()) {
updateNodeLivenessEventsForReadyNodes();
- getReadyNodesByFailureReason().forEach((node, reason) -> {
- if (!throttle(node)) {
- nodeRepository().fail(node.hostname(), Agent.system, reason);
+ int throttledNodeFailures = 0;
+ for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
+ Node node = entry.getKey();
+ if (throttle(node)) {
+ throttledNodeFailures++;
+ continue;
}
- });
+ String reason = entry.getValue();
+ nodeRepository().fail(node.hostname(), Agent.system, reason);
+ }
+ metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
// Active nodes
@@ -317,7 +329,7 @@ public class NodeFailer extends Maintainer {
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
throttlePolicy.toHumanReadableString()));
}
- metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
+ metric.set(throttlingActiveMetric, throttle ? 1 : 0, null);
return throttle;
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 08cf8e7dc20..6147543c102 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -487,7 +487,8 @@ public class NodeFailerTest {
// 2 nodes are failed (the minimum amount that are always allowed to fail)
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -496,7 +497,8 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 2 docker hosts now fail, 1 of them (with all its children is allowed to fail)
hosts.subList(0, 2).forEach(host -> {
@@ -509,7 +511,8 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 24 more hours pass without any other nodes being failed out
for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
@@ -518,21 +521,24 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Next, the 2 ready nodes that were dead from the start are failed out, and finally
// the second host and all its children are failed
tester.clock.advance(Duration.ofMinutes(30));
tester.failer.run();
assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.failer.run();
assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is not indicated by the metric.", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
// Throttles based on percentage in large zone
@@ -550,6 +556,7 @@ public class NodeFailerTest {
// 1% are allowed to fail
assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -559,6 +566,7 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed
for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
@@ -568,6 +576,7 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
}