From 1a69165c6e149e7c2a905e9ce1b733876549d7b2 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Fri, 10 Nov 2017 12:26:54 +0100 Subject: Allow to fail 1 host/day --- .../hosted/provision/maintenance/NodeFailer.java | 7 ++-- .../provision/maintenance/NodeFailerTest.java | 41 ++++++++++++++++------ 2 files changed, 36 insertions(+), 12 deletions(-) (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 603f4d2be7c..266d91e7e3e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -268,8 +268,11 @@ public class NodeFailer extends Maintainer { .map(Optional::get) .filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow)) .count(); - boolean throttle = recentlyFailedNodes >= Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, - throttlePolicy.minimumAllowedToFail); + int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, + throttlePolicy.minimumAllowedToFail); + + boolean throttle = allowedFailedNodes < recentlyFailedNodes || + (allowedFailedNodes == recentlyFailedNodes && node.type() != NodeType.host); if (throttle) { log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString())); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index afc298872c8..b9b871dfd1f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ 
b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -11,7 +11,6 @@ import org.junit.Test; import java.time.Duration; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -367,12 +366,10 @@ public class NodeFailerTest { public void node_failing_throttle() { // Throttles based on a absolute number in small zone { - NodeFailTester tester = NodeFailTester.withNoApplications(); - tester.createReadyNodes(50); - tester.createReadyNodes(50, 50, "docker"); - - List readyNodes = tester.nodeRepository.getNodes(); - Collections.shuffle(readyNodes); + // 50 regular tenant nodes, 10 hosts with each 3 tenant nodes, total 90 nodes + NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10); + List readyNodes = tester.createReadyNodes(50, 30); + List hosts = tester.nodeRepository.getNodes(NodeType.host); List deadNodes = readyNodes.subList(0, 4); @@ -394,13 +391,37 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - // 18 more hours pass, it's now 24 hours since the first 2 failed. 
The remaining 2 are failed - for (int minutes = 0, interval = 30; minutes <= 18 * 60; minutes += interval) { + // 2 docker hosts now fail, 1 of them (with all its children) is allowed to fail + hosts.subList(0, 2).forEach(host -> { + tester.serviceMonitor.setHostDown(host.hostname()); + deadNodes.add(host); + }); + tester.failer.run(); + tester.clock.advance(Duration.ofMinutes(61)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + + tester.failer.run(); + assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // 24 more hours pass without any other nodes being failed out + for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // Next, the 2 ready nodes that were dead from the start are failed out, and finally + // the second host and all its children are failed + tester.clock.advance(Duration.ofMinutes(30)); + tester.failer.run(); + assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // Nothing else to fail + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + tester.failer.run(); + assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); } // Throttles based on percentage in large zone -- cgit v1.2.3