diff options
author | HÃ¥kon Hallingstad <hakon@oath.com> | 2019-02-18 22:05:10 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-02-18 22:05:10 +0100 |
commit | 9ac30d19ff990a05d4427ff9e8d8bf46c9892dbf (patch) | |
tree | a9205f09813e179b21b46f6e16b81c1e80b7569c | |
parent | c3f34b6b1de9836d6d2175757a3081ffcc29a90c (diff) | |
parent | ca276d29713c78d4fb8a9e510b682f53faae3b23 (diff) |
Merge pull request #8544 from vespa-engine/hakonhall/require-all-child-nodes-to-be-suspended-in-nodefailer
Require all child nodes to be suspended in NodeFailer
2 files changed, 74 insertions, 1 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index c1c2d7b9cc5..d7a71580a46 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -210,7 +210,7 @@ public class NodeFailer extends Maintainer { for (Node node : activeNodes) { if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) { nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); - } else if (nodeSuspended(node)) { + } else if (hostSuspended(node, activeNodes)) { if (node.status().hardwareFailureDescription().isPresent()) { nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get()); } else { @@ -296,6 +296,16 @@ public class NodeFailer extends Maintainer { } } + /** Is the node and all active children suspended? */ + private boolean hostSuspended(Node node, List<Node> activeNodes) { + if (!nodeSuspended(node)) return false; + if (node.parentHostname().isPresent()) return true; // optimization + return activeNodes.stream() + .filter(childNode -> childNode.parentHostname().isPresent() && + childNode.parentHostname().get().equals(node.hostname())) + .allMatch(this::nodeSuspended); + } + /** * We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected * unless the node is replaced. diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index eaacf7c656b..4aa9fb2b11d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -118,6 +118,69 @@ public class NodeFailerTest { } @Test + public void hw_fail_only_if_whole_host_is_suspended() { + NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6); + String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); + assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + + // The host has 2 nodes in active and 1 ready + Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithFailureReports).asList().stream() + .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList()))); + assertEquals(2, hostnamesByState.get(Node.State.active).size()); + String activeChild1 = hostnamesByState.get(Node.State.active).get(0); + String activeChild2 = hostnamesByState.get(Node.State.active).get(1); + assertEquals(1, hostnamesByState.get(Node.State.ready).size()); + String readyChild = hostnamesByState.get(Node.State.ready).get(0); + + // Set failure report to the parent and all its children. + Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low"); + tester.nodeRepository.getNodes().stream() + .filter(node -> node.hostname().equals(hostWithFailureReports)) + .forEach(node -> { + Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport)); + tester.nodeRepository.write(updatedNode); + }); + + // The ready node will be failed, but neither the host nor the 2 active nodes since they have not been suspended + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + + // Suspending the host will not fail any more since none of the children are suspened + tester.suspend(hostWithFailureReports); + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + + // Suspending one child node will fail that out. + tester.suspend(activeChild1); + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state()); + assertEquals(Node.State.active, tester.nodeRepository.getNode(activeChild2).get().state()); + + // Suspending the second child node will fail that out and the host. + tester.suspend(activeChild2); + tester.clock.advance(Duration.ofHours(25)); + tester.allNodesMakeAConfigRequestExcept(); + tester.failer.run(); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyChild).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(hostWithFailureReports).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild1).get().state()); + assertEquals(Node.State.failed, tester.nodeRepository.getNode(activeChild2).get().state()); + } + + @Test public void nodes_for_suspended_applications_are_not_failed() { NodeFailTester tester = NodeFailTester.withTwoApplications(); tester.suspend(NodeFailTester.app1); |