summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no> 2022-12-01 12:47:15 +0100
committer: Martin Polden <mpolden@mpolden.no> 2022-12-01 12:49:30 +0100
commit: 6f95ebf764e1419115b202d3c3964b440f4831b2 (patch)
tree: 1a73b8457bb8118866667113b4e3221297a40413 /node-repository
parent: f611155a289870ad02b79e0cb9068a9fa1d55f67 (diff)
Allow 4% of nodes to fail before throttling
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 2
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 20
2 files changed, 12 insertions, 10 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 32eac49a288..51b59ab77eb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -286,7 +286,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
public enum ThrottlePolicy {
- hosted(Duration.ofDays(1), 0.03, 2),
+ hosted(Duration.ofDays(1), 0.04, 2),
disabled(Duration.ZERO, 0, 0);
private final Duration throttleWindow;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index ede958ef083..47803594148 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -563,33 +563,35 @@ public class NodeFailerTest {
NodeList allNodes = tester.nodeRepository.nodes().list();
assertEquals(500, allNodes.size());
- // 2 hours pass, 20 nodes (4%) die
+ // 2 hours pass, many nodes fail
tester.runMaintainers();
+ int downNodes = 25; // 5%
+ int allowedToFail = 20; // 4%
allNodes.state(Node.State.active)
.nodeType(NodeType.tenant)
.stream()
- .limit(20)
+ .limit(downNodes)
.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
tester.runMaintainers();
tester.clock.advance(Duration.ofHours(2));
tester.runMaintainers();
- // 3% are allowed to fail
- assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ // Fails nodes up to throttle limit
+ assertEquals(allowedToFail, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
- assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals("Throttled node failures", downNodes - allowedToFail, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
// 6 more hours pass, no more nodes are failed
tester.clock.advance(Duration.ofHours(6));
tester.runMaintainers();
- assertEquals(15, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(allowedToFail, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
- assertEquals("Throttled node failures", 5, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
+ assertEquals("Throttled node failures", downNodes - allowedToFail, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
- // 18 more hours pass, 24 hours since the first 10 nodes were failed. The remaining 5 are failed
+ // 18 more hours pass, 24 hours since the first batch of nodes were failed. The remaining nodes are failed
tester.clock.advance(Duration.ofHours(18));
tester.runMaintainers();
- assertEquals(20, tester.nodeRepository.nodes().list(Node.State.failed).size());
+ assertEquals(downNodes, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
}