diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-12-01 13:19:56 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-01 13:19:56 +0100 |
commit | d6fac3d9c0801fda08c832a7cb4360a585a1c97d (patch) | |
tree | 992ffdc5ee60e45887b7c3042e0bcc94853fab2f /node-repository | |
parent | c04788cb28e3b72e19b4ad11c87031ff85f17681 (diff) | |
parent | 6f95ebf764e1419115b202d3c3964b440f4831b2 (diff) |
Merge pull request #25072 from vespa-engine/mpolden/increase-throttle-limit
Allow 4% of nodes to fail before throttling
Diffstat (limited to 'node-repository')
2 files changed, 12 insertions, 10 deletions
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 32eac49a288..51b59ab77eb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -286,7 +286,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
 
     public enum ThrottlePolicy {
 
-        hosted(Duration.ofDays(1), 0.03, 2),
+        hosted(Duration.ofDays(1), 0.04, 2),
         disabled(Duration.ZERO, 0, 0);
 
         private final Duration throttleWindow;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index ede958ef083..47803594148 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -563,33 +563,35 @@ public class NodeFailerTest {
         NodeList allNodes = tester.nodeRepository.nodes().list();
         assertEquals(500, allNodes.size());
 
-        // 2 hours pass, 20 nodes (4%) die
+        // 2 hours pass, many nodes fail
         tester.runMaintainers();
+        int downNodes = 25; // 5%
+        int allowedToFail = 20; // 4%
         allNodes.state(Node.State.active)
                 .nodeType(NodeType.tenant)
                 .stream()
-                .limit(20)
+                .limit(downNodes)
                 .forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
         tester.runMaintainers();
         tester.clock.advance(Duration.ofHours(2));
         tester.runMaintainers();
 
-        // 3% are allowed to fail
-        assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        // Fails nodes up to throttle limit
+        assertEquals(allowedToFail, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
-        assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+        assertEquals("Throttled node failures", downNodes - allowedToFail, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
 
         // 6 more hours pass, no more nodes are failed
         tester.clock.advance(Duration.ofHours(6));
         tester.runMaintainers();
-        assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        assertEquals(allowedToFail, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
-        assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+        assertEquals("Throttled node failures", downNodes - allowedToFail, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
 
-        // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed
+        // 18 more hours pass, 24 hours since the first batch of nodes were failed. The remaining nodes are failed
         tester.clock.advance(Duration.ofHours(18));
         tester.runMaintainers();
-        assertEquals(20, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        assertEquals(downNodes, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
     }
```