diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2017-11-16 09:10:49 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-11-16 09:10:49 +0100 |
commit | d21531b7614f3816dd4650a2f9e846c0a42495c3 (patch) | |
tree | a7ba2e5c9840aa59de7385bedc5bf01c1796c060 /node-repository | |
parent | e952b1d5abd4311e3743affa91726a3b09f5a859 (diff) | |
parent | 1a69165c6e149e7c2a905e9ce1b733876549d7b2 (diff) |
Merge pull request #3998 from vespa-engine/freva/node-failer
Count Docker nodes when throttling in node failure
Diffstat (limited to 'node-repository')
3 files changed, 101 insertions, 104 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 1c81d97ddea..266d91e7e3e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; -import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; import com.yahoo.transaction.Mutex; @@ -24,12 +23,12 @@ import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; -import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.stream.Collectors; /** * Maintains information in the node repo about when this node last responded to ping @@ -76,21 +75,15 @@ public class NodeFailer extends Maintainer { @Override protected void maintain() { // Ready nodes - updateNodeLivenessEventsForReadyNodes(); - for (Node node : readyNodesWhichAreDead()) { - // Docker hosts and nodes do not run Vespa services - if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue; - if ( ! throttle(node)) nodeRepository().fail(node.hostname(), - Agent.system, "Not receiving config requests from node"); - } - - for (Node node : readyNodesWithHardwareFailure()) - if ( ! throttle(node)) nodeRepository().fail(node.hostname(), - Agent.system, "Node has hardware failure"); + try (Mutex lock = nodeRepository().lockUnallocated()) { + updateNodeLivenessEventsForReadyNodes(); - for (Node node: readyNodesWithHardwareDivergence()) - if ( ! throttle(node)) nodeRepository().fail(node.hostname(), - Agent.system, "Node hardware diverges from spec"); + getReadyNodesByFailureReason().forEach((node, reason) -> { + if (!throttle(node)) { + nodeRepository().fail(node.hostname(), Agent.system, reason); + } + }); + } // Active nodes for (Node node : determineActiveNodeDownStatus()) { @@ -103,59 +96,55 @@ public class NodeFailer extends Maintainer { private void updateNodeLivenessEventsForReadyNodes() { // Update node last request events through ZooKeeper to collect request to all config servers. // We do this here ("lazily") to avoid writing to zk for each config request. - try (Mutex lock = nodeRepository().lockUnallocated()) { - for (Node node : nodeRepository().getNodes(Node.State.ready)) { - Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname()); - if ( ! lastLocalRequest.isPresent()) continue; - - Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested); - if ( ! recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) { - History updatedHistory = node.history().with(new History.Event(History.Event.Type.requested, - Agent.system, - lastLocalRequest.get())); - nodeRepository().write(node.with(updatedHistory)); - } + for (Node node : nodeRepository().getNodes(Node.State.ready)) { + Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname()); + if ( ! lastLocalRequest.isPresent()) continue; + + Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested); + if ( ! recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) { + History updatedHistory = node.history().with(new History.Event(History.Event.Type.requested, + Agent.system, + lastLocalRequest.get())); + nodeRepository().write(node.with(updatedHistory)); } } } - private List<Node> readyNodesWhichAreDead() { - // Allow requests some time to be registered in case all config servers have been down - if (constructionTime.isAfter(clock.instant().minus(nodeRequestInterval).minus(nodeRequestInterval) )) - return Collections.emptyList(); - - // Nodes are taken as dead if they have not made a config request since this instant. - // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently. - Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(nodeRequestInterval); - - return nodeRepository().getNodes(Node.State.ready).stream() - .filter(node -> wasMadeReadyBefore(oldestAcceptableRequestTime, node)) - .filter(node -> ! hasRecordedRequestAfter(oldestAcceptableRequestTime, node)) - .collect(Collectors.toList()); + private Map<Node, String> getReadyNodesByFailureReason() { + Instant oldestAcceptableRequestTime = + // Allow requests some time to be registered in case all config servers have been down + constructionTime.isAfter(clock.instant().minus(nodeRequestInterval.multipliedBy(2))) ? + Instant.EPOCH : + + // Nodes are taken as dead if they have not made a config request since this instant. + // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently. + clock.instant().minus(downTimeLimit).minus(nodeRequestInterval); + + Map<Node, String> nodesByFailureReason = new HashMap<>(); + for (Node node : nodeRepository().getNodes(Node.State.ready)) { + if (! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { + nodesByFailureReason.put(node, "Not receiving config requests from node"); + } else if (node.status().hardwareFailureDescription().isPresent()) { + nodesByFailureReason.put(node, "Node has hardware failure"); + } else if (node.status().hardwareDivergence().isPresent()) { + nodesByFailureReason.put(node, "Node has hardware divergence"); + } + } + return nodesByFailureReason; } - private List<Node> readyNodesWithHardwareDivergence() { - return nodeRepository().getNodes(Node.State.ready).stream() - .filter(node -> node.status().hardwareDivergence().isPresent()) - .collect(Collectors.toList()); + private boolean hasNodeRequestedConfigAfter(Node node, Instant instant) { + return !wasMadeReadyBefore(node, instant) || hasRecordedRequestAfter(node, instant); } - private boolean wasMadeReadyBefore(Instant instant, Node node) { + private boolean wasMadeReadyBefore(Node node, Instant instant) { Optional<History.Event> readiedEvent = node.history().event(History.Event.Type.readied); - if ( ! readiedEvent.isPresent()) return false; - return readiedEvent.get().at().isBefore(instant); + return readiedEvent.map(event -> event.at().isBefore(instant)).orElse(false); } - private boolean hasRecordedRequestAfter(Instant instant, Node node) { + private boolean hasRecordedRequestAfter(Node node, Instant instant) { Optional<History.Event> lastRequest = node.history().event(History.Event.Type.requested); - if ( ! lastRequest.isPresent()) return false; - return lastRequest.get().at().isAfter(instant); - } - - private List<Node> readyNodesWithHardwareFailure() { - return nodeRepository().getNodes(Node.State.ready).stream() - .filter(node -> node.status().hardwareFailureDescription().isPresent()) - .collect(Collectors.toList()); + return lastRequest.map(event -> event.at().isAfter(instant)).orElse(false); } private boolean applicationSuspended(Node node) { @@ -272,18 +261,18 @@ public class NodeFailer extends Maintainer { private boolean throttle(Node node) { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow); - List<Node> nodes = nodeRepository().getNodes().stream() - // Do not consider Docker containers when throttling - .filter(n -> n.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) - .collect(Collectors.toList()); + List<Node> nodes = nodeRepository().getNodes(); long recentlyFailedNodes = nodes.stream() .map(n -> n.history().event(History.Event.Type.failed)) .filter(Optional::isPresent) .map(Optional::get) .filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow)) .count(); - boolean throttle = recentlyFailedNodes >= Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, - throttlePolicy.minimumAllowedToFail); + int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, + throttlePolicy.minimumAllowedToFail); + + boolean throttle = allowedFailedNodes < recentlyFailedNodes || + (allowedFailedNodes == recentlyFailedNodes && node.type() != NodeType.host); if (throttle) { log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString())); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index 6bcb9426373..0e0195a5bed 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -185,8 +185,8 @@ public class NodeFailTester { } public void allNodesMakeAConfigRequestExcept(List<Node> deadNodes) { - for (Node node : nodeRepository.getNodes(NodeType.tenant)) { - if ( ! deadNodes.contains(node) && node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) + for (Node node : nodeRepository.getNodes()) { + if ( ! deadNodes.contains(node)) hostLivenessTracker.receivedRequestFrom(node.hostname()); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index c78663415ef..b9b871dfd1f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -5,7 +5,6 @@ import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import org.junit.Test; @@ -66,7 +65,7 @@ public class NodeFailerTest { assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } - // Failures are detected on two ready nodes, which are then failed + // Hardware failures are detected on two ready nodes, which are then failed Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2); Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3); tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog")))); @@ -166,18 +165,16 @@ public class NodeFailerTest { tester.createReadyNodes(1, 16, "docker"); // For a day all nodes work so nothing happens - for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) { - tester.clock.advance(Duration.ofMinutes(5)); + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } List<Node> ready = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready); - List<Node> readyHosts = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready); - // Two ready nodes die and a ready docker node "dies" - // (Vespa does not run when in ready state for docker node, so it does not make config requests) + // Two ready nodes and a ready docker node die, but only 2 of those are failed out tester.clock.advance(Duration.ofMinutes(180)); Node dockerNode = ready.stream().filter(node -> node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER).findFirst().get(); List<Node> otherNodes = ready.stream() @@ -188,18 +185,13 @@ public class NodeFailerTest { assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - // Another ready node die + // Another ready node dies and the node that died earlier, are allowed to fail tester.clock.advance(Duration.ofDays(1)); tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3)); tester.failer.run(); - assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(ready.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0)); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - - // Ready Docker hosts do not make config requests - tester.allNodesMakeAConfigRequestExcept(readyHosts.get(0), readyHosts.get(1), readyHosts.get(2)); - tester.failer.run(); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0)); + assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); } @Test @@ -207,8 +199,8 @@ public class NodeFailerTest { NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7); // For a day all nodes work so nothing happens - for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) { - tester.clock.advance(Duration.ofMinutes(5)); + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); @@ -247,10 +239,10 @@ public class NodeFailerTest { Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0); tester.serviceMonitor.setHostDown(downTenant1.hostname()); - // nothing happens the first 45 minutes - for (int minutes = 0; minutes < 45; minutes += 5 ) { + // nothing happens during the entire day because of the failure throttling + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { tester.failer.run(); - tester.clock.advance(Duration.ofMinutes(5)); + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size()); } @@ -374,14 +366,12 @@ public class NodeFailerTest { public void node_failing_throttle() { // Throttles based on a absolute number in small zone { - NodeFailTester tester = NodeFailTester.withNoApplications(); - List<Node> readyNodes = tester.createReadyNodes(50); - List<Node> readyDockerNodes = tester.createReadyNodes(50, 50, "docker"); + // 50 regular tenant nodes, 10 hosts with each 3 tenant nodes, total 90 nodes + NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10); + List<Node> readyNodes = tester.createReadyNodes(50, 30); + List<Node> hosts = tester.nodeRepository.getNodes(NodeType.host); List<Node> deadNodes = readyNodes.subList(0, 4); - // Fail 10 Docker containers, should not impact throttling policy - readyDockerNodes.subList(0, 10) - .forEach(node -> tester.nodeRepository.fail(node.hostname(), Agent.system, "Failed in test")); // 2 hours pass, 4 nodes die for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { @@ -391,7 +381,7 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.failer.run(); - assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -399,15 +389,39 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); - // 18 more hours pass, it's now 24 hours since the first 2 failed. The remaining 2 are failed - for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { + // 2 docker hosts now fail, 1 of them (with all its children is allowed to fail) + hosts.subList(0, 2).forEach(host -> { + tester.serviceMonitor.setHostDown(host.hostname()); + deadNodes.add(host); + }); + tester.failer.run(); + tester.clock.advance(Duration.ofMinutes(61)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + + tester.failer.run(); + assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // 24 more hours pass without any other nodes being failed out + for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(4, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // Next, the 2 ready nodes that were dead from the start are failed out, and finally + // the second host and all its children are failed + tester.clock.advance(Duration.ofMinutes(30)); + tester.failer.run(); + assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + + // Nothing else to fail + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(deadNodes); + tester.failer.run(); + assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); } // Throttles based on percentage in large zone @@ -443,12 +457,6 @@ public class NodeFailerTest { } } - /** Get all failed nodes that are not Docker containers */ - private static List<Node> getNonDockerFailedNodes(NodeRepository nodeRepository) { - return nodeRepository.getNodes(Node.State.failed).stream() - .filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) - .collect(Collectors.toList()); - } /** * Selects the first parent host that: |