From 7d8e3ab69dff9da55078a96e86ae95da01352ec8 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Thu, 3 Jan 2019 12:55:22 +0100 Subject: Always allow 2 parent hosts to fail in a 24 hour period --- .../com/yahoo/vespa/hosted/provision/NodeList.java | 7 +++ .../hosted/provision/maintenance/NodeFailer.java | 48 +++++++++++++-------- .../provision/maintenance/NodeFailerTest.java | 50 ++++++++++++++-------- 3 files changed, 69 insertions(+), 36 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index ed6d1744af4..5f0bac7f194 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -78,6 +78,13 @@ public class NodeList { .collect(collectingAndThen(Collectors.toList(), NodeList::new)); } + /** Returns the subset of nodes that are parents */ + public NodeList parents() { + return nodes.stream() + .filter(n -> !n.parentHostname().isPresent()) + .collect(collectingAndThen(Collectors.toList(), NodeList::new)); + } + /** Returns the child nodes of the given parent node */ public NodeList childrenOf(Node parent) { return nodes.stream() diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 83c2f20f8aa..680a84756dc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; @@ -33,6 +34,7 @@ import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; +import static java.util.stream.Collectors.collectingAndThen; import static java.util.stream.Collectors.counting; /** @@ -122,6 +124,7 @@ public class NodeFailer extends Maintainer { failActive(node, reason); } + metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); } @@ -326,20 +329,21 @@ public class NodeFailer extends Maintainer { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow); List nodes = nodeRepository().getNodes(); - long recentlyFailedNodes = nodes.stream() - .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)) - .count(); - int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, - throttlePolicy.minimumAllowedToFail); - - boolean throttle = allowedFailedNodes < recentlyFailedNodes || - (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost()); - if (throttle) { - log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), - throttlePolicy.toHumanReadableString())); - } - metric.set(throttlingActiveMetric, throttle ? 1 : 0, null); - return throttle; + NodeList recentlyFailedNodes = nodes.stream() + .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)) + .collect(collectingAndThen(Collectors.toList(), NodeList::new)); + + // Allow failing nodes within policy + if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(nodes.size())) return false; + + // Always allow failing parents up to minimum limit + if (!node.parentHostname().isPresent() && + recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) return false; + + log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), + throttlePolicy.toHumanReadableString(nodes.size()))); + + return true; } public enum ThrottlePolicy { @@ -347,9 +351,9 @@ public class NodeFailer extends Maintainer { hosted(Duration.ofDays(1), 0.01, 2), disabled(Duration.ZERO, 0, 0); - public final Duration throttleWindow; - public final double fractionAllowedToFail; - public final int minimumAllowedToFail; + private final Duration throttleWindow; + private final double fractionAllowedToFail; + private final int minimumAllowedToFail; ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) { this.throttleWindow = throttleWindow; @@ -357,10 +361,16 @@ public class NodeFailer extends Maintainer { this.minimumAllowedToFail = minimumAllowedToFail; } - public String toHumanReadableString() { - return String.format("Max %.0f%% or %d nodes can fail over a period of %s", fractionAllowedToFail*100, + public int allowedToFailOf(int totalNodes) { + return (int) Math.max(totalNodes * fractionAllowedToFail, minimumAllowedToFail); + } + + public String toHumanReadableString(int totalNodes) { + return String.format("Max %.0f%% (%d) or %d nodes can fail over a period of %s", fractionAllowedToFail*100, + allowedToFailOf(totalNodes), minimumAllowedToFail, throttleWindow); } + } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 1bf291cd2c3..fdce49490fa 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -475,7 +475,6 @@ public class NodeFailerTest { NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10); List readyNodes = tester.createReadyNodes(50, 30); List hosts = tester.nodeRepository.getNodes(NodeType.host); - List deadNodes = readyNodes.subList(0, 4); // 2 hours pass, 4 nodes die @@ -487,7 +486,7 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed @@ -497,11 +496,26 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - // 2 docker hosts now fail, 1 of them (with all its children is allowed to fail) - hosts.subList(0, 2).forEach(host -> { + // 18 more hours pass, the remaining 2 down nodes are allowed to fail + for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + } + tester.failer.run(); + assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // 24 more hours pass, nothing happens + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + } + + // 3 hosts fail. 2 of them and all of their children are allowed to fail + List failedHosts = hosts.subList(0, 3); + failedHosts.forEach(host -> { tester.serviceMonitor.setHostDown(host.hostname()); deadNodes.add(host); }); @@ -510,9 +524,12 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.failer.run(); - assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); - assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); + assertEquals(4 + /* already failed */ + 2 + /* hosts */ + (2 * 3) /* containers per host */, + tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); + assertEquals("Throttled node failures", 1, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 24 more hours pass without any other nodes being failed out for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { @@ -520,24 +537,23 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get("nodeFailThrottling")); - assertEquals("Throttled node failures", 3, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); + assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); + assertEquals("Throttled node failures", 1, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - // Next, the 2 ready nodes that were dead from the start are failed out, and finally - // the second host and all its children are failed + // The final Docker host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.failer.run(); - assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // Nothing else to fail tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.failer.run(); - assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } -- cgit v1.2.3 From 98548141e1a03c2972a47643e45ad57629e1b386 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Thu, 3 Jan 2019 12:58:18 +0100 Subject: Increase allowed to fail fraction --- .../hosted/provision/maintenance/NodeFailer.java | 2 +- .../hosted/provision/maintenance/NodeFailerTest.java | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 680a84756dc..9b18f70eb64 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -348,7 +348,7 @@ public class NodeFailer extends Maintainer { public enum ThrottlePolicy { - hosted(Duration.ofDays(1), 0.01, 2), + hosted(Duration.ofDays(1), 0.02, 2), disabled(Duration.ZERO, 0, 0); private final Duration throttleWindow; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index fdce49490fa..74c11e4fd27 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -561,17 +561,17 @@ public class NodeFailerTest { { NodeFailTester tester = NodeFailTester.withNoApplications(); List readyNodes = tester.createReadyNodes(500); - List deadNodes = readyNodes.subList(0, 10); + List deadNodes = readyNodes.subList(0, 15); - // 2 hours pass, 10 nodes (2%) die + // 2 hours pass, 15 nodes (3%) die for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - // 1% are allowed to fail - assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + // 2% are allowed to fail + assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed @@ -580,18 +580,18 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); + assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - // 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed + // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); - assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling")); + assertEquals(15, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } } -- cgit v1.2.3 From c4ea4d8efa4fd574c76c1aea8286d966b194f85b Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Thu, 3 Jan 2019 14:24:00 +0100 Subject: Clarify physical nodes --- .../com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 2 +- .../yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 9b18f70eb64..0dabe252406 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -336,7 +336,7 @@ public class NodeFailer extends Maintainer { // Allow failing nodes within policy if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(nodes.size())) return false; - // Always allow failing parents up to minimum limit + // Always allow failing physical nodes up to minimum limit if (!node.parentHostname().isPresent() && recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) return false; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 74c11e4fd27..b13e5c418f9 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -477,7 +477,7 @@ public class NodeFailerTest { List hosts = tester.nodeRepository.getNodes(NodeType.host); List deadNodes = readyNodes.subList(0, 4); - // 2 hours pass, 4 nodes die + // 2 hours pass, 4 physical nodes die for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); @@ -499,7 +499,7 @@ public class NodeFailerTest { assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 2, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - // 18 more hours pass, the remaining 2 down nodes are allowed to fail + // 18 more hours pass, the remaining dead nodes are allowed to fail for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); @@ -541,7 +541,7 @@ public class NodeFailerTest { assertEquals("Throttling is indicated by the metric", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 1, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); - // The final Docker host and its containers are failed out + // The final host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.failer.run(); assertEquals(16, tester.nodeRepository.getNodes(Node.State.failed).size()); -- cgit v1.2.3