diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-04-08 14:00:02 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-04-08 14:00:02 +0200 |
commit | 4c160bb52ac7544c6e692e86240391c2d07d57ca (patch) | |
tree | dc75cb2a83fab8dfed4d9ddde2fe0fa0f4071848 /node-repository | |
parent | f1814747bf7548bdec63ca608969bbd730e6c8b5 (diff) |
Increase node failure throttling from 2 to 3 %
Diffstat (limited to 'node-repository')
2 files changed, 7 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index a1916d7dc20..3900d10a53e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -309,7 +309,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { public enum ThrottlePolicy { - hosted(Duration.ofDays(1), 0.02, 2), + hosted(Duration.ofDays(1), 0.03, 2), disabled(Duration.ZERO, 0, 0); private final Duration throttleWindow; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index cf1cea49fa6..f7d29a116ed 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -638,33 +638,33 @@ public class NodeFailerTest { NodeList allNodes = tester.nodeRepository.nodes().list(); assertEquals(500, allNodes.size()); - // 2 hours pass, 15 nodes (3%) die + // 2 hours pass, 20 nodes (4%) die tester.runMaintainers(); allNodes.state(Node.State.active) .nodeType(NodeType.tenant) .stream() - .limit(15) + .limit(20) .forEach(host -> tester.serviceMonitor.setHostDown(host.hostname())); tester.runMaintainers(); tester.clock.advance(Duration.ofHours(2)); tester.runMaintainers(); - // 2% are allowed to fail - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size()); + // 3% are allowed to fail + assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled 
node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 6 more hours pass, no more nodes are failed tester.clock.advance(Duration.ofHours(6)); tester.runMaintainers(); - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); // 18 more hours pass, 24 hours since the first 15 nodes were failed. The remaining 5 are failed tester.clock.advance(Duration.ofHours(18)); tester.runMaintainers(); - assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size()); + assertEquals(20, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); } |