summary refs log tree commit diff stats
path: root/node-repository
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2022-04-08 16:03:00 +0200
committer GitHub <noreply@github.com> 2022-04-08 16:03:00 +0200
commit 472db7a0c4756570ad9741c230c6a8181e550e25 (patch)
tree 00cf0691f4e496e2ee294ceeea508a2825cb27be /node-repository
parent 69454cef2fe53694eb5541e5f622a4e973c081bf (diff)
parent 4c160bb52ac7544c6e692e86240391c2d07d57ca (diff)
Merge pull request #22055 from vespa-engine/bratseth/tolerate-more-failures
Increase node failure throttling from 2 to 3 %
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 2
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 12
2 files changed, 7 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index a1916d7dc20..3900d10a53e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -309,7 +309,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
public enum ThrottlePolicy {
- hosted(Duration.ofDays(1), 0.02, 2),
+ hosted(Duration.ofDays(1), 0.03, 2),
disabled(Duration.ZERO, 0, 0);
private final Duration throttleWindow;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index cf1cea49fa6..f7d29a116ed 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -638,33 +638,33 @@ public class NodeFailerTest {
NodeList allNodes = tester.nodeRepository.nodes().list();
assertEquals(500, allNodes.size());
- // 2 hours pass, 15 nodes (3%) die
+ // 2 hours pass, 20 nodes (4%) die
tester.runMaintainers();
allNodes.state(Node.State.active)
.nodeType(NodeType.tenant)
.stream()
- .limit(15)
+ .limit(20)
.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
tester.runMaintainers();
tester.clock.advance(Duration.ofHours(2));
tester.runMaintainers();
- // 2% are allowed to fail
- assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ // 3% are allowed to fail
+ assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
tester.clock.advance(Duration.ofHours(6));
tester.runMaintainers();
- assertEquals(10, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 18 more hours pass, 24 hours since the first 15 nodes were failed. The remaining 5 are failed
tester.clock.advance(Duration.ofHours(18));
tester.runMaintainers();
- assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(20, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}