diff options
author | Martin Polden <mpolden@mpolden.no> | 2022-04-08 16:03:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-08 16:03:00 +0200 |
commit | 472db7a0c4756570ad9741c230c6a8181e550e25 (patch) | |
tree | 00cf0691f4e496e2ee294ceeea508a2825cb27be /node-repository | |
parent | 69454cef2fe53694eb5541e5f622a4e973c081bf (diff) | |
parent | 4c160bb52ac7544c6e692e86240391c2d07d57ca (diff) |
Merge pull request #22055 from vespa-engine/bratseth/tolerate-more-failures
Increase node failure throttling from 2 to 3 %
Diffstat (limited to 'node-repository')
2 files changed, 7 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index a1916d7dc20..3900d10a53e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -309,7 +309,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {

     public enum ThrottlePolicy {

-        hosted(Duration.ofDays(1), 0.02, 2),
+        hosted(Duration.ofDays(1), 0.03, 2),
         disabled(Duration.ZERO, 0, 0);

         private final Duration throttleWindow;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index cf1cea49fa6..f7d29a116ed 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -638,33 +638,33 @@ public class NodeFailerTest {
         NodeList allNodes = tester.nodeRepository.nodes().list();
         assertEquals(500, allNodes.size());

-        // 2 hours pass, 15 nodes (3%) die
+        // 2 hours pass, 20 nodes (4%) die
         tester.runMaintainers();
         allNodes.state(Node.State.active)
                 .nodeType(NodeType.tenant)
                 .stream()
-                .limit(15)
+                .limit(20)
                 .forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
         tester.runMaintainers();
         tester.clock.advance(Duration.ofHours(2));
         tester.runMaintainers();

-        // 2% are allowed to fail
-        assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        // 3% are allowed to fail
+        assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));

         // 6 more hours pass, no more nodes are failed
         tester.clock.advance(Duration.ofHours(6));
         tester.runMaintainers();
-        assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));

         // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed
         tester.clock.advance(Duration.ofHours(18));
         tester.runMaintainers();
-        assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+        assertEquals(20, tester.nodeRepository.nodes().list(Node.State.failed).size());
         assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
         assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
     }