author    | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-15 13:49:02 +0100
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-15 13:49:02 +0100
commit    | 5df006545147994f43f40acea91a7baa1acd93df (patch)
tree      | 7158a6661ef9030baa6ac25d14c70c76deecb34e /node-repository
parent    | 0cfa248003df6099bb88821b119fee45791db016 (diff)
Fail instead of retire on failure report in NodeFailer
Diffstat (limited to 'node-repository')
2 files changed, 38 insertions, 157 deletions
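In short: the removed code path set wantToRetire on at most one active host per run (and on its children) when the host had failure reports, while the new code path treats failure reports like any other failure reason — a suspended node whose parent host (or the node itself, if it is a host) has failure reports is collected for failing alongside nodes that have been down too long or have a hardware failure. The following is a minimal, self-contained sketch of that decision logic; the NodeFailerSketch class, the SketchNode record, its field names, and the failureReasons helper are illustrative stand-ins, not the real node-repository classes shown in the diff below.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

public class NodeFailerSketch {

    // Hypothetical stand-in for the real Node type; fields are simplified booleans/lists.
    record SketchNode(String hostname,
                      Optional<String> parentHostname,
                      Optional<String> hardwareFailureDescription,
                      List<String> failureReports,
                      boolean applicationSuspended,
                      boolean nodeSuspended,
                      boolean downTooLong) { }

    /** Collects fail reasons for active nodes, mirroring the branch added in this commit. */
    static Map<SketchNode, String> failureReasons(List<SketchNode> activeNodes,
                                                  Map<String, SketchNode> nodesByHostname) {
        Map<SketchNode, String> reasons = new HashMap<>();
        for (SketchNode node : activeNodes) {
            if (node.downTooLong() && !node.applicationSuspended()) {
                reasons.put(node, "Node has been down longer than the down time limit");
            } else if (node.nodeSuspended()) {                     // node is allowed to be down
                if (node.hardwareFailureDescription().isPresent()) {
                    reasons.put(node, "Node has hardware failure: " + node.hardwareFailureDescription().get());
                } else {
                    // New in this commit: consult the parent host's failure reports
                    // (or the node's own reports if it is itself a host).
                    SketchNode host = node.parentHostname().map(nodesByHostname::get).orElse(node);
                    if (!host.failureReports().isEmpty()) {
                        reasons.put(node, host.equals(node)
                                ? "Host has failure reports: " + host.failureReports()
                                : "Parent (" + host.hostname() + ") has failure reports: " + host.failureReports());
                    }
                }
            }
        }
        return reasons;
    }

    public static void main(String[] args) {
        SketchNode host = new SketchNode("host1", Optional.empty(), Optional.empty(),
                List.of("badTotalMemorySize: too low"), false, true, false);
        SketchNode child = new SketchNode("child1", Optional.of("host1"), Optional.empty(),
                List.of(), false, true, false);
        Map<String, SketchNode> byHostname = Map.of("host1", host, "child1", child);
        // Both host1 and child1 get a failure reason referring to the host's reports.
        System.out.println(failureReasons(List.of(host, child), byHostname));
    }
}
```

The practical effect, as the test change below shows, is that such nodes end up in state failed (the same path as hardware failures) instead of staying active with wantToRetire set, and the separate one-host-per-run retire rate limiting is dropped; throttling remains with the existing node-failure throttling reflected by the throttledNodeFailures metrics kept in the diff.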
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f6933f7f585..28e036169fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -2,8 +2,6 @@
 package com.yahoo.vespa.hosted.provision.maintenance;
 
 import com.yahoo.cloud.config.ConfigserverConfig;
-import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.ApplicationLockException;
 import com.yahoo.config.provision.Deployer;
 import com.yahoo.config.provision.Deployment;
 import com.yahoo.config.provision.HostLivenessTracker;
@@ -28,7 +26,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
 import java.time.Clock;
 import java.time.Duration;
 import java.time.Instant;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -134,28 +131,6 @@ public class NodeFailer extends Maintainer {
             failActive(node, reason);
         }
 
-        // Retire active hosts and their children.
-        activeNodes.stream()
-                .filter(node -> failAllowedFor(node.type()))
-                .filter(node -> !nodesWithFailureReason.contains(node))
-                // Defer to parent host (it should also be active)
-                .filter(node -> node.parentHostname().isEmpty())
-                // This will sort those with wantToRetire first
-                .sorted(Comparator.comparing(node -> node.status().wantToRetire(), Comparator.reverseOrder()))
-                .filter(node -> {
-                    if (node.status().wantToRetire()) return true;
-                    if (node.allocation().map(a -> a.membership().retired()).orElse(false)) return true;
-                    List<String> reasons = reasonsToRetireActiveParentHost(node);
-                    if (reasons.size() > 0) {
-                        retireRecursively(node, reasons, activeNodes);
-                        return true;
-                    }
-                    return false;
-                })
-                // Only allow 1 active host to be wantToRetire at a time for rate limiting.
-                .limit(1)
-                .count();
-
         metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null);
         metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
     }
@@ -235,8 +210,20 @@ public class NodeFailer extends Maintainer {
         for (Node node : activeNodes) {
             if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
                 nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
-            } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) {
-                nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
+            } else if (nodeSuspended(node)) {
+                if (node.status().hardwareFailureDescription().isPresent()) {
+                    nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
+                } else {
+                    Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
+                    List<String> failureReports = reasonsToRetireActiveParentHost(hostNode);
+                    if (failureReports.size() > 0) {
+                        if (hostNode.equals(node)) {
+                            nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+                        } else {
+                            nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
+                        }
+                    }
+                }
             }
         }
         return nodesByFailureReason;
@@ -261,43 +248,6 @@
                 reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription());
     }
 
-    /**
-     * There are reasons why this node should be parked, and we'd like to do it through retiring,
-     * including any child nodes.
-     */
-    private void retireRecursively(Node node, List<String> reasons, List<Node> activeNodes) {
-        if (activeNodes != null) {
-            List<Node> childNodesToRetire = activeNodes.stream()
-                    .filter(n -> n.parentHostname().equals(Optional.of(node.hostname())))
-                    .collect(Collectors.toList());
-            for (Node childNode : childNodesToRetire) {
-                retireRecursively(childNode, reasons, null);
-            }
-        }
-
-        if (node.status().wantToRetire()) return;
-        retireActive(node.hostname(), node.allocation().get().owner(), reasons);
-    }
-
-    private void retireActive(String hostname, ApplicationId owner, List<String> reasons) {
-        // Getting the application lock can take a very long time for the largest applications.
-        // Don't bother waiting for too long since retries is automatic with maintainers.
-        Duration lockWait = Duration.ofSeconds(10);
-        try (Mutex lock = nodeRepository().lock(owner, lockWait)) {
-            // Recheck all conditions in case anything has changed
-            Optional<Node> node = nodeRepository().getNode(hostname);
-            if (node.isEmpty()) return;
-            if (node.get().state() != Node.State.active) return;
-            if (!node.get().allocation().orElseThrow().owner().equals(owner)) return;
-            if (node.get().status().wantToRetire()) return;
-
-            log.info("Setting wantToRetire on " + node.get() + " due to these reports: " + reasons);
-            nodeRepository().write(node.get().withWantToRetire(true, Agent.NodeFailer, clock.instant()));
-        } catch (ApplicationLockException e) {
-            log.warning("Failed to get lock on " + owner + " within " + lockWait + " to set wantToRetire, will retry later");
-        }
-    }
-
     /** Returns whether node has any kind of hardware issue */
     public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
         if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 3e022de6a5a..eaacf7c656b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -53,9 +53,30 @@ public class NodeFailerTest {
                     tester.nodeRepository.write(updatedNode);
                 });
 
+        testNodeFailingWith(tester, hostWithHwFailure);
+    }
+
+    @Test
+    public void fail_nodes_with_severe_reports_if_allowed_to_be_down() {
+        NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
+        String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
+
+        // Set failure report to the parent and all its children.
+        Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+        tester.nodeRepository.getNodes().stream()
+                .filter(node -> node.hostname().equals(hostWithFailureReports))
+                .forEach(node -> {
+                    Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
+                    tester.nodeRepository.write(updatedNode);
+                });
+
+        testNodeFailingWith(tester, hostWithFailureReports);
+    }
+
+    private void testNodeFailingWith(NodeFailTester tester, String hostWithHwFailure) {
         // The host should have 2 nodes in active and 1 ready
         Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
-                        .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+                .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
         assertEquals(2, hostnamesByState.get(Node.State.active).size());
         assertEquals(1, hostnamesByState.get(Node.State.ready).size());
 
@@ -73,7 +94,7 @@ public class NodeFailerTest {
                 Arrays.asList(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)));
         expectedHostnamesByState1Iter.put(Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
         Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
-                        .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+                .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
         assertEquals(expectedHostnamesByState1Iter, hostnamesByState1Iter);
 
         // Suspend the second of the active nodes
@@ -85,7 +106,7 @@ public class NodeFailerTest {
 
         // All of the children should be failed now
         Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
-                        .map(Node::state).collect(Collectors.toSet());
+                .map(Node::state).collect(Collectors.toSet());
         assertEquals(Collections.singleton(Node.State.failed), childStates2Iter);
         // The host itself is still active as it too must be allowed to suspend
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
@@ -97,96 +118,6 @@ public class NodeFailerTest {
     }
 
     @Test
-    public void set_want_to_retire_if_failure_report() {
-        NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
-        String dockerHostWithFailureReport = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
-
-        // Set failure report to the parent and all its children.
-        Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
-        tester.nodeRepository.getNodes().stream()
-                .filter(node -> node.hostname().equals(dockerHostWithFailureReport))
-                .forEach(node -> {
-                    Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
-                    tester.nodeRepository.write(updatedNode);
-                });
-
-        {
-            // The host is active
-            Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
-            assertEquals(Node.State.active, parentNode.state());
-            assertEquals(1, parentNode.reports().getReports().size());
-            assertFalse(parentNode.status().wantToRetire());
-
-            List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
-            assertEquals(3, childNodes.size());
-
-            // The 2 active child nodes
-            List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
-            assertEquals(2, activeChildNodes.size());
-            assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
-
-            // The ready child node
-            List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.ready).collect(Collectors.toList());
-            assertEquals(1, failedChildNodes.size());
-            assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
-        }
-
-        tester.failer.run();
-
-        {
-            // The host is active with wantToRetire
-            Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
-            assertEquals(Node.State.active, parentNode.state());
-            assertTrue(parentNode.status().wantToRetire());
-
-            List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
-            assertEquals(3, childNodes.size());
-
-            // The active nodes -> wantToRetire
-            List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
-            assertEquals(2, activeChildNodes.size());
-            assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
-
-            // The ready node -> failed with wantToRetire
-            List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList());
-            assertEquals(1, failedChildNodes.size());
-            assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
-        }
-
-        // Set wantToRetire on the second host. Rate limiting will keep it from becoming wantToRetire
-
-        String dockerHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, dockerHostWithFailureReport);
-        tester.nodeRepository.getNodes().stream()
-                .filter(node -> node.hostname().equals(dockerHost2))
-                .forEach(node -> {
-                    Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
-                    tester.nodeRepository.write(updatedNode);
-                });
-
-        {
-            // dockerHost2 is active and with reports
-            Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
-            assertEquals(Node.State.active, parentNode.state());
-            assertEquals(1, parentNode.reports().getReports().size());
-            assertFalse(parentNode.status().wantToRetire());
-
-            List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHost2).asList();
-            assertEquals(3, childNodes.size());
-        }
-
-        tester.clock.advance(Duration.ofHours(25));
-        tester.allNodesMakeAConfigRequestExcept();
-        tester.failer.run();
-
-        {
-            // dockerHost2 is active with wantToRetire
-            Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
-            assertEquals(Node.State.active, parentNode.state());
-            assertFalse(parentNode.status().wantToRetire());
-        }
-    }
-
-    @Test
     public void nodes_for_suspended_applications_are_not_failed() {
         NodeFailTester tester = NodeFailTester.withTwoApplications();
         tester.suspend(NodeFailTester.app1);