diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-12 22:09:23 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-02-12 22:09:23 +0100 |
commit | db7fafdf45dca3e1ae83e32c1332f5d3669a6d51 (patch) | |
tree | fc36aadc181aacd2bb7ef24ede43d33197583297 /node-repository | |
parent | a89a4821cdfd14619ee2e6029d3461bf6e6b2de1 (diff) |
Max 1 active host with wantToRetire, and fix NodeFailer.hasHardwareIssue
Diffstat (limited to 'node-repository')
3 files changed, 74 insertions, 21 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 621ba2f778f..52ef5b4d1c6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -99,7 +99,7 @@ public class FailedExpirer extends Maintainer { private void recycle(List<Node> nodes) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node candidate : nodes) { - if (NodeFailer.hasHardwareIssue(candidate, nodes)) { + if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) { List<String> unparkedChildren = !candidate.type().isDockerHost() ? Collections.emptyList() : nodeRepository.list().childrenOf(candidate).asList().stream() .filter(node -> node.state() != Node.State.parked) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index d0b1beb8e28..a8665595f97 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -133,14 +133,23 @@ public class NodeFailer extends Maintainer { failActive(node, reason); } - // Retire active nodes - for (Node node : activeNodes) { - if (!failAllowedFor(node.type())) continue; - if (nodesWithFailureReason.contains(node)) continue; - if (node.parentHostname().isPresent()) continue; // Defer to parent host (it should be active too) + // Retire active nodes. Only allow 1 active host to be wantToRetire at a time for rate limiting. 
+ final long maxWantToRetireHosts = 1; + List<Node> candidateNodes = activeNodes.stream() + .filter(node -> failAllowedFor(node.type())) + .filter(node -> !nodesWithFailureReason.contains(node)) + // Defer to parent host (it should also be active) + .filter(node -> node.parentHostname().isEmpty()) + .collect(Collectors.toList()); + long currentWantToRetireHosts = candidateNodes.stream().filter(node -> node.status().wantToRetire()).count(); + + for (int i = 0; i < candidateNodes.size() && currentWantToRetireHosts < maxWantToRetireHosts; ++i) { + Node node = candidateNodes.get(i); List<String> reasons = reasonsToRetireActiveParentHost(node); - if (reasons.isEmpty()) continue; - retireRecursively(node, reasons, activeNodes); + if (reasons.size() > 0) { + retireRecursively(node, reasons, activeNodes); + ++currentWantToRetireHosts; + } } metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null); @@ -286,14 +295,12 @@ public class NodeFailer extends Maintainer { } /** Returns whether node has any kind of hardware issue */ - public static boolean hasHardwareIssue(Node node, List<Node> nodes) { + public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) { return true; } - Node hostNode = node.parentHostname() - .flatMap(parent -> nodes.stream().filter(n -> n.hostname().equals(parent)).findFirst()) - .orElse(node); + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node); return reasonsToRetireActiveParentHost(hostNode).size() > 0; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index b0466885931..3e022de6a5a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ 
b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -111,20 +111,34 @@ public class NodeFailerTest { }); { - // The host should have 2 nodes in active and 1 ready + // The host is active + Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertEquals(1, parentNode.reports().getReports().size()); + assertFalse(parentNode.status().wantToRetire()); + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList(); - Map<Node.State, List<String>> hostnamesByState = childNodes.stream() - .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); - assertEquals(2, hostnamesByState.get(Node.State.active).size()); - assertEquals(1, hostnamesByState.get(Node.State.ready).size()); + assertEquals(3, childNodes.size()); + + // The 2 active child nodes + List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList()); + assertEquals(2, activeChildNodes.size()); + assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire())); + + // The ready child node + List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.ready).collect(Collectors.toList()); + assertEquals(1, failedChildNodes.size()); + assertTrue(failedChildNodes.stream().noneMatch(n -> n.status().wantToRetire())); } tester.failer.run(); - tester.clock.advance(Duration.ofHours(25)); - tester.allNodesMakeAConfigRequestExcept(); - tester.failer.run(); { + // The host is active with wantToRetire + Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertTrue(parentNode.status().wantToRetire()); + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList(); 
assertEquals(3, childNodes.size()); @@ -133,11 +147,43 @@ public class NodeFailerTest { assertEquals(2, activeChildNodes.size()); assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire())); - // The ready node -> failed + // The ready node -> failed with wantToRetire List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList()); assertEquals(1, failedChildNodes.size()); assertTrue(failedChildNodes.stream().allMatch(n -> n.status().wantToRetire())); } + + // Set wantToRetire on the second host. Rate limiting will keep it from becoming wantToRetire + + String dockerHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, dockerHostWithFailureReport); + tester.nodeRepository.getNodes().stream() + .filter(node -> node.hostname().equals(dockerHost2)) + .forEach(node -> { + Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); + tester.nodeRepository.write(updatedNode); + }); + + { + // dockerHost2 is active and with reports + Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertEquals(1, parentNode.reports().getReports().size()); + assertFalse(parentNode.status().wantToRetire()); + + List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHost2).asList(); + assertEquals(3, childNodes.size()); + } + + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + + { + // dockerHost2 is still active WITHOUT wantToRetire: rate limiting allows max 1 wantToRetire host + Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow(); + assertEquals(Node.State.active, parentNode.state()); + assertFalse(parentNode.status().wantToRetire()); + } } @Test |