summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2019-01-04 08:41:48 +0100
committerGitHub <noreply@github.com>2019-01-04 08:41:48 +0100
commit19dda9a6180432a2652165a036098f1176977f41 (patch)
tree0ba9d6d6f063cd16c02030022c13e110d952aa97 /node-repository
parent33e24cb366464dd3ea090d470da5126a594bbc0c (diff)
parentc4ea4d8efa4fd574c76c1aea8286d966b194f85b (diff)
Merge pull request #7999 from vespa-engine/mpolden/adjust-throttling
Adjust NodeFailer throttling
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java50
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java72
3 files changed, 81 insertions, 48 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index ed6d1744af4..5f0bac7f194 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -78,6 +78,13 @@ public class NodeList {
.collect(collectingAndThen(Collectors.toList(), NodeList::new));
}
+ /** Returns the subset of nodes that are parents */
+ public NodeList parents() {
+ return nodes.stream()
+ .filter(n -> !n.parentHostname().isPresent())
+ .collect(collectingAndThen(Collectors.toList(), NodeList::new));
+ }
+
/** Returns the child nodes of the given parent node */
public NodeList childrenOf(Node parent) {
return nodes.stream()
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 83c2f20f8aa..0dabe252406 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -12,6 +12,7 @@ import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
@@ -33,6 +34,7 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
+import static java.util.stream.Collectors.collectingAndThen;
import static java.util.stream.Collectors.counting;
/**
@@ -122,6 +124,7 @@ public class NodeFailer extends Maintainer {
failActive(node, reason);
}
+ metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null);
metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
@@ -326,30 +329,31 @@ public class NodeFailer extends Maintainer {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
List<Node> nodes = nodeRepository().getNodes();
- long recentlyFailedNodes = nodes.stream()
- .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))
- .count();
- int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail,
- throttlePolicy.minimumAllowedToFail);
-
- boolean throttle = allowedFailedNodes < recentlyFailedNodes ||
- (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost());
- if (throttle) {
- log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
- throttlePolicy.toHumanReadableString()));
- }
- metric.set(throttlingActiveMetric, throttle ? 1 : 0, null);
- return throttle;
+ NodeList recentlyFailedNodes = nodes.stream()
+ .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))
+ .collect(collectingAndThen(Collectors.toList(), NodeList::new));
+
+ // Allow failing nodes within policy
+ if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(nodes.size())) return false;
+
+ // Always allow failing physical nodes up to minimum limit
+ if (!node.parentHostname().isPresent() &&
+ recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) return false;
+
+ log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
+ throttlePolicy.toHumanReadableString(nodes.size())));
+
+ return true;
}
public enum ThrottlePolicy {
- hosted(Duration.ofDays(1), 0.01, 2),
+ hosted(Duration.ofDays(1), 0.02, 2),
disabled(Duration.ZERO, 0, 0);
- public final Duration throttleWindow;
- public final double fractionAllowedToFail;
- public final int minimumAllowedToFail;
+ private final Duration throttleWindow;
+ private final double fractionAllowedToFail;
+ private final int minimumAllowedToFail;
ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
this.throttleWindow = throttleWindow;
@@ -357,10 +361,16 @@ public class NodeFailer extends Maintainer {
this.minimumAllowedToFail = minimumAllowedToFail;
}
- public String toHumanReadableString() {
- return String.format("Max %.0f%% or %d nodes can fail over a period of %s", fractionAllowedToFail*100,
+ public int allowedToFailOf(int totalNodes) {
+ return (int) Math.max(totalNodes * fractionAllowedToFail, minimumAllowedToFail);
+ }
+
+ public String toHumanReadableString(int totalNodes) {
+ return String.format("Max %.0f%% (%d) or %d nodes can fail over a period of %s", fractionAllowedToFail*100,
+ allowedToFailOf(totalNodes),
minimumAllowedToFail, throttleWindow);
}
+
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 1bf291cd2c3..b13e5c418f9 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -475,10 +475,9 @@ public class NodeFailerTest {
NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10);
List<Node> readyNodes = tester.createReadyNodes(50, 30);
List<Node> hosts = tester.nodeRepository.getNodes(NodeType.host);
-
List<Node> deadNodes = readyNodes.subList(0, 4);
- // 2 hours pass, 4 nodes die
+ // 2 hours pass, 4 physical nodes die
for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
@@ -487,7 +486,7 @@ public class NodeFailerTest {
// 2 nodes are failed (the minimum amount that are always allowed to fail)
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
@@ -497,11 +496,26 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
- // 2 docker hosts now fail, 1 of them (with all its children is allowed to fail)
- hosts.subList(0, 2).forEach(host -> {
+ // 18 more hours pass, the remaining dead nodes are allowed to fail
+ for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
+ tester.allNodesMakeAConfigRequestExcept(deadNodes);
+ }
+ tester.failer.run();
+ assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());
+
+ // 24 more hours pass, nothing happens
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
+ tester.allNodesMakeAConfigRequestExcept(deadNodes);
+ }
+
+ // 3 hosts fail. 2 of them and all of their children are allowed to fail
+ List<Node> failedHosts = hosts.subList(0, 3);
+ failedHosts.forEach(host -> {
tester.serviceMonitor.setHostDown(host.hostname());
deadNodes.add(host);
});
@@ -510,9 +524,12 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.failer.run();
- assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
- assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals(4 + /* already failed */
+ 2 + /* hosts */
+ (2 * 3) /* containers per host */,
+ tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
+ assertEquals("Throttled node failures", 1, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 24 more hours pass without any other nodes being failed out
for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
@@ -520,24 +537,23 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling"));
- assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
+ assertEquals("Throttled node failures", 1, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
- // Next, the 2 ready nodes that were dead from the start are failed out, and finally
- // the second host and all its children are failed
+ // The final host and its containers are failed out
tester.clock.advance(Duration.ofMinutes(30));
tester.failer.run();
- assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.failer.run();
- assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
@@ -545,17 +561,17 @@ public class NodeFailerTest {
{
NodeFailTester tester = NodeFailTester.withNoApplications();
List<Node> readyNodes = tester.createReadyNodes(500);
- List<Node> deadNodes = readyNodes.subList(0, 10);
+ List<Node> deadNodes = readyNodes.subList(0, 15);
- // 2 hours pass, 10 nodes (2%) die
+ // 2 hours pass, 15 nodes (3%) die
for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- // 1% are allowed to fail
- assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ // 2% are allowed to fail
+ assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
@@ -564,18 +580,18 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
- // 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed
+ // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed
for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
- assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling"));
+ assertEquals(15, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}
}